diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..87adcc3
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,75 @@
+name: Documentation
+
+on:
+ pull_request:
+ paths:
+ - 'docs/**'
+ - 'packages/**'
+ - 'tool/**'
+ - '.github/workflows/docs.yml'
+ push:
+ branches:
+ - main
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: '3.13'
+
+ - name: Set up Flutter
+ uses: subosito/flutter-action@fd55f4c5af5b953cc57a2be44cb082c8f6635e8e
+ with:
+ channel: stable
+
+ - name: Install Python dependencies
+ run: |
+ pip install --upgrade pip
+ pip install -r docs/requirements.txt
+ pip install -e packages/dash_evals
+
+ - name: Install Dart dependencies
+ run: |
+ flutter pub get
+ cd tool/dartdoc_to_md && dart pub get
+
+ - name: Build documentation
+ working-directory: docs
+ run: make html
+
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v6
+ with:
+ name: docs-html
+ path: docs/_build/html
+ retention-days: 1
+
+ deploy:
+ # Only deploy on push to main
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+ needs: build
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6
+
+ - name: Download build artifact
+ uses: actions/download-artifact@v7
+ with:
+ name: docs-html
+ path: docs/_build/html
+
+ - name: Deploy to Firebase Hosting
+ uses: FirebaseExtended/action-hosting-deploy@v0
+ with:
+ repoToken: ${{ secrets.GITHUB_TOKEN }}
+ firebaseServiceAccount: ${{ secrets.FIREBASE_SERVICE_ACCOUNT }}
+ projectId: evals
+ target: evals-docs
+ channelId: live
diff --git a/README.md b/README.md
index e2a6dd5..20a2d6d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,48 @@
-# Flutter evals
+# evals
-Evaluation framework for testing AI agents ability to write Dart and Flutter code.
\ No newline at end of file
+Evaluation framework for testing AI agents' ability to write Dart and Flutter code. Built on [Inspect AI](https://inspect.aisi.org.uk/).
+
+> [!TIP]
+> Full documentation at [evals-docs.web.app/](https://evals-docs.web.app/)
+
+## Overview
+
+evals provides:
+
+- **Evaluation Runner** — Python package for running LLM evaluations with configurable tasks, variants, and models
+- **Evaluation Configuration** — Dart and Python packages that resolve dataset YAML into EvalSet JSON for the runner
+- **devals CLI** — Dart CLI for creating and managing dataset samples, tasks, and jobs
+- **Evaluation Explorer** — Dart/Flutter app for browsing and analyzing results
+- **Dataset** — Curated samples for Dart/Flutter Q&A, code generation, and debugging tasks
+
+## Packages
+
+| Package | Description | Docs |
+|---------|-------------|------|
+| [dash_evals](packages/dash_evals/) | Python evaluation runner using Inspect AI | [dash_evals docs](docs/contributing/packages/dash_evals.md) |
+| [dataset_config_dart](packages/dataset_config_dart/) | Dart library for resolving dataset YAML into EvalSet JSON (includes shared data models) | [dataset_config_dart docs](docs/contributing/packages/dataset_config_dart.md) |
+| [dataset_config_python](packages/dataset_config_python/) | Python configuration models | — |
+| [devals_cli](packages/devals_cli/) | Dart CLI for managing evaluation tasks and jobs | [CLI docs](docs/reference/cli.md) |
+| [eval_explorer](packages/eval_explorer/) | Dart/Flutter results viewer (Serverpod) | [eval_explorer docs](docs/contributing/packages/eval_explorer.md) |
+
+> [!NOTE]
+> The **uploader** and **report_app** packages are deprecated and will be replaced by **eval_explorer**.
+
+## Documentation
+
+| Doc | Description |
+|-----|-------------|
+| [Quick Start](docs/guides/quick_start.md) | Get started authoring your own evals |
+| [Contributing Guide](docs/contributing/guide.md) | Development setup and guidelines |
+| [CLI Reference](docs/reference/cli.md) | Full devals CLI command reference |
+| [Configuration Reference](docs/reference/configuration_reference.md) | YAML configuration file reference |
+| [Repository Structure](docs/contributing/repository_structure.md) | Project layout |
+| [Glossary](docs/reference/glossary.md) | Terminology guide |
+
+## Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details, or go directly to the [Contributing Guide](docs/contributing/guide.md).
+
+## License
+
+See [LICENSE](LICENSE) for details.
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..7776cf8
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,38 @@
+# Makefile for Sphinx + Dart API documentation
+
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Dart API generator
+DARTDOC_TOOL = ../tool/dartdoc_to_md
+REPO_ROOT = ..
+
+.PHONY: help clean html livehtml dartdoc html-python
+
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+clean:
+ rm -rf $(BUILDDIR)
+ rm -rf reference/dart_api
+
+# Generate Dart API markdown using the custom analyzer-based generator
+dartdoc:
+ @echo "Generating Dart API documentation..."
+ cd $(DARTDOC_TOOL) && dart run bin/generate.dart --root $(shell cd $(REPO_ROOT) && pwd) --output docs/reference/dart_api
+ @echo "Dart API markdown generated in docs/reference/dart_api/"
+
+# Build HTML docs (runs Dart generator first, then Sphinx)
+html: dartdoc
+ @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+ @echo "Build finished. Open $(BUILDDIR)/html/index.html"
+
+# Build HTML docs without Dart doc generation (faster for Python-only changes)
+html-python:
+ @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+ @echo "Build finished. Open $(BUILDDIR)/html/index.html"
+
+livehtml:
+ sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
new file mode 100644
index 0000000..9243a6b
--- /dev/null
+++ b/docs/_static/custom.css
@@ -0,0 +1,420 @@
+/* Custom styling for dash_evals documentation */
+
+
+/* ============================================
+ BRAND COLORS (PyData CSS variables)
+ ============================================ */
+
+html[data-theme="light"] {
+ --pst-color-primary: #7C4DFF;
+ --pst-color-primary-highlight: #9C7CFF;
+}
+
+html[data-theme="dark"] {
+ --pst-color-primary: #B388FF;
+ --pst-color-primary-highlight: #D1B3FF;
+}
+
+
+/* ============================================
+ LINKS: Color and underline overrides
+ ============================================ */
+
+html[data-theme="light"] {
+ --pst-color-link: #7C4DFF;
+ --pst-color-link-hover: #5C35CC;
+}
+
+html[data-theme="dark"] {
+ --pst-color-link: #B388FF;
+ --pst-color-link-hover: #D1B3FF;
+}
+
+.bd-article-container a {
+ color: var(--pst-color-link);
+ text-decoration: none;
+ text-decoration-thickness: max(1px, .0625rem);
+ text-underline-offset: 0.15em;
+ overflow-wrap: break-word;
+}
+
+.bd-article-container a:hover {
+ color: var(--pst-color-link-hover);
+ text-decoration-thickness: max(3px, .1875rem, .12em);
+ text-decoration-skip-ink: none;
+}
+
+.bd-article-container a:visited {
+ color: var(--pst-color-link);
+}
+
+/* Links in headings should not be underlined */
+.bd-article-container h1 a,
+.bd-article-container h2 a,
+.bd-article-container h3 a,
+.bd-article-container h4 a {
+ text-decoration: none;
+}
+
+.bd-article-container h1 a:hover,
+.bd-article-container h2 a:hover,
+.bd-article-container h3 a:hover,
+.bd-article-container h4 a:hover {
+ text-decoration: underline;
+}
+
+.navbar-brand:hover, .navbar-brand:visited:hover {
+ text-decoration: none;
+ color: var(--pst-color-link-hover);
+}
+
+.bd-header ul.navbar-nav>li.nav-item.current>.nav-link:before {
+ border-bottom: none;
+}
+
+.bd-header ul.navbar-nav>li.nav-item>.nav-link:hover:before {
+ border-bottom: none;
+}
+
+.prev-next-area a.left-prev:hover,
+.prev-next-area a.right-next:hover {
+ text-decoration: none !important;
+ border-bottom: none;
+}
+
+/* The theme puts the underline on the p.prev-next-title inside the <a>, not on the <a> itself */
+.prev-next-area a p.prev-next-title, .prev-next-area a:hover p.prev-next-title, .prev-next-area a p.prev-next-title:hover {
+ text-decoration: none !important;
+}
+
+.prev-next-info {
+ width: 200px;
+ padding: .25rem;
+ border: .5px solid #D1B3FF;
+ border-radius: 4px;
+}
+
+.prev-next-info:hover {
+ background: #eae0f9;
+}
+
+/* ============================================
+ LISTS: Spacing and bullet styles
+ ============================================ */
+
+.bd-article-container ul {
+ list-style-type: disc;
+ padding-left: 1.5em;
+}
+
+.bd-article-container ol {
+ padding-left: 1.5em;
+}
+
+.bd-article-container li {
+ margin-bottom: 0.35em;
+ line-height: 1.65;
+}
+
+.bd-article-container ul ul {
+ list-style-type: circle;
+ margin-top: 0.35em;
+}
+
+.bd-article-container ul ul ul {
+ list-style-type: square;
+}
+
+/* Tighter spacing for nested lists */
+.bd-article-container li > ul,
+.bd-article-container li > ol {
+ margin-bottom: 0;
+}
+
+
+/* ============================================
+ BLOCKQUOTES
+ ============================================ */
+
+.bd-article-container blockquote {
+ border-left: 3px solid var(--pst-color-primary);
+ padding: 0.5rem 1rem;
+ margin: 1rem 0 1.2rem 0;
+ color: var(--pst-color-text-muted);
+ background-color: transparent;
+}
+
+.bd-article-container blockquote p {
+ margin-bottom: 0.5rem;
+}
+
+.bd-article-container blockquote p:last-child {
+ margin-bottom: 0;
+}
+
+
+/* ============================================
+ INLINE CODE (not in code blocks)
+ ============================================ */
+
+html[data-theme="light"] {
+ --pst-color-inline-code: #912583;
+}
+
+html[data-theme="dark"] {
+ --pst-color-inline-code: #f3c7ee;
+}
+
+.bd-article-container code:not(pre code) {
+ color: var(--pst-color-inline-code);
+ font-size: 0.875em;
+}
+
+
+/* ============================================
+ HORIZONTAL RULES
+ ============================================ */
+
+.bd-article-container hr {
+ border: none;
+ border-top: 1px solid var(--pst-color-border);
+ margin: 2rem 0;
+ opacity: 0.65;
+}
+
+
+/* ============================================
+ STRONG / EMPHASIS
+ ============================================ */
+
+.bd-article-container strong {
+ font-weight: 600;
+ color: var(--pst-color-text-base);
+}
+
+
+/* ============================================
+ DEFINITION LISTS (dl/dt/dd)
+ ============================================ */
+
+.bd-article-container dl {
+ margin-bottom: 1.2rem;
+}
+
+.bd-article-container dt {
+ font-weight: 600;
+ margin-top: 0.8rem;
+ color: var(--pst-color-text-base);
+}
+
+.bd-article-container dd {
+ margin-left: 1.5em;
+ margin-bottom: 0.5rem;
+}
+
+
+/* ============================================
+ MAIN CONTENT: Base font size (~10% smaller)
+ ============================================ */
+
+.bd-article-container .bd-content {
+ font-size: 0.9rem;
+ line-height: 1.7;
+}
+
+
+/* ============================================
+ MAIN CONTENT: Headings (smaller)
+ ============================================ */
+
+.bd-article-container h1 {
+ font-size: 1.6rem;
+ margin-top: 1.5rem;
+ margin-bottom: 1rem;
+}
+
+.bd-article-container h2 {
+ font-size: 1.25rem;
+ margin-top: 1.8rem;
+ margin-bottom: 0.8rem;
+}
+
+.bd-article-container h3 {
+ font-size: 1.05rem;
+ margin-top: 1.5rem;
+ margin-bottom: 0.6rem;
+}
+
+.bd-article-container h4 {
+ font-size: 0.95rem;
+ margin-top: 1.2rem;
+ margin-bottom: 0.5rem;
+}
+
+
+/* ============================================
+ MAIN CONTENT: More spacing between elements
+ ============================================ */
+
+.bd-article-container p {
+ margin-bottom: 1rem;
+}
+
+.bd-article-container ul,
+.bd-article-container ol {
+ margin-bottom: 1.2rem;
+}
+
+.bd-article-container section {
+ margin-bottom: 1.5rem;
+}
+
+/* Spacing after code blocks */
+.bd-article-container .highlight {
+ margin-bottom: 1.2rem;
+}
+
+/* Spacing after tables */
+.bd-article-container table {
+ margin-bottom: 1.5rem;
+}
+
+/* Spacing after admonitions */
+.bd-article-container .admonition {
+ margin-bottom: 1.5rem;
+}
+
+
+/* ============================================
+ CODE BLOCKS: Slightly darker background
+ ============================================ */
+
+pre {
+ border: 1px solid #e0e0e0;
+ border-radius: 6px;
+ background-color: #f5f5f5;
+}
+
+code.literal {
+ border: 1px solid #e0e0e0;
+ border-radius: 3px;
+ padding: 1px 4px;
+ background-color: #f2f2f2;
+}
+
+html[data-theme="light"] .highlight pre {
+ line-height: 170%;
+}
+
+html[data-theme="dark"] pre {
+ border-color: #444;
+ background-color: #1e1e1e;
+}
+
+html[data-theme="dark"] code.literal {
+ border-color: #444;
+ background-color: #2a2a2a;
+}
+
+
+/* ============================================
+ TABLES: Padding & header background
+ ============================================ */
+
+.bd-article-container table th {
+ padding: 6px 14px;
+ background-color: #f0f0f0;
+ font-weight: 600;
+ font-size: 0.85rem;
+}
+
+.bd-article-container table td {
+ padding: 5px 14px;
+ font-size: 0.85rem;
+}
+
+/* Subtle row striping for readability */
+.bd-article-container table tbody tr:nth-child(even) {
+ background-color: #fafafa;
+}
+
+html[data-theme="dark"] .bd-article-container table th {
+ background-color: #2a2a2a;
+}
+
+html[data-theme="dark"] .bd-article-container table tbody tr:nth-child(even) {
+ background-color: #1e1e1e;
+}
+
+
+/* ============================================
+ SIGNATURE COLORS (class/function definitions)
+ ============================================ */
+
+/* Module path: dash_evals.runner.models. */
+.sig-prename.descclassname {
+ color: #666666 !important;
+}
+
+/* Class/function name: TaskResult, flutter_bug_fix */
+.sig-name.descname {
+ color: #7C4DFF !important;
+ font-weight: 600;
+}
+
+/* Property/attribute names in signatures */
+dt.sig.sig-object .sig-name:not(.descname) {
+ color: #005577 !important;
+}
+
+
+/* ============================================
+ TYPE ANNOTATION COLORS
+ ============================================ */
+
+/* The "class" keyword */
+dt.sig.sig-object > .property {
+ color: #0077AA !important;
+ font-weight: 800;
+}
+
+/* Type names in annotations */
+.sig .sig-param span.pre,
+.sig > span.pre:not(:first-child) {
+ color: #A90D91;
+}
+
+/* Parentheses */
+.sig-paren {
+ color: #666666;
+}
+
+
+/* ============================================
+ DARK MODE: Signatures
+ ============================================ */
+
+html[data-theme="dark"] .sig-prename.descclassname {
+ color: #888888 !important;
+}
+
+html[data-theme="dark"] .sig-name.descname {
+ color: #B388FF !important;
+}
+
+html[data-theme="dark"] dt.sig.sig-object .sig-name:not(.descname) {
+ color: #61AFEF !important;
+}
+
+html[data-theme="dark"] dt.sig.sig-object > span.pre:first-child {
+ color: #56B6C2 !important;
+}
+
+html[data-theme="dark"] .sig .sig-param span.pre,
+html[data-theme="dark"] .sig > span.pre:not(:first-child) {
+ color: #CE93D8;
+}
+
+html[data-theme="dark"] .sig-paren {
+ color: #888888;
+}
diff --git a/docs/_static/images/eval-set.png b/docs/_static/images/eval-set.png
new file mode 100644
index 0000000..0b58b4d
Binary files /dev/null and b/docs/_static/images/eval-set.png differ
diff --git a/docs/_static/images/evals-dataset.png b/docs/_static/images/evals-dataset.png
new file mode 100644
index 0000000..ebaebfe
Binary files /dev/null and b/docs/_static/images/evals-dataset.png differ
diff --git a/docs/_static/images/job.png b/docs/_static/images/job.png
new file mode 100644
index 0000000..7963112
Binary files /dev/null and b/docs/_static/images/job.png differ
diff --git a/docs/_static/images/logo.png b/docs/_static/images/logo.png
new file mode 100644
index 0000000..779227b
Binary files /dev/null and b/docs/_static/images/logo.png differ
diff --git a/docs/_static/images/repo-separation.png b/docs/_static/images/repo-separation.png
new file mode 100644
index 0000000..c830b22
Binary files /dev/null and b/docs/_static/images/repo-separation.png differ
diff --git a/docs/_static/images/task.png b/docs/_static/images/task.png
new file mode 100644
index 0000000..a451400
Binary files /dev/null and b/docs/_static/images/task.png differ
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..db1b270
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,117 @@
+# Configuration file for the Sphinx documentation builder.
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+import os
+import sys
+
+# Add the source directory to the path so Sphinx can find the modules
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "packages", "dash_evals", "src")))
+
+# -- Project information -----------------------------------------------------
+
+project = "dash_evals"
+copyright = "2025, Flutter Authors"
+author = "Flutter Authors"
+
+# -- General configuration ---------------------------------------------------
+
+extensions = [
+ "sphinx.ext.autodoc", # Auto-generate docs from docstrings
+ "sphinx.ext.napoleon", # Support Google/NumPy docstring styles
+ "sphinx.ext.viewcode", # Add links to source code
+ "sphinx.ext.intersphinx", # Link to other projects' docs
+ "sphinx_autodoc_typehints", # Better type hint rendering
+ "myst_parser", # Support Markdown files
+ "sphinx_design", # Cards, grids, tabs
+]
+
+# Autodoc settings
+autodoc_default_options = {
+ "members": True,
+ "member-order": "bysource",
+ "special-members": "__init__",
+ "undoc-members": True,
+ "exclude-members": "__weakref__",
+}
+autodoc_typehints = "description"
+autodoc_class_signature = "separated"
+
+# Napoleon settings (for Google-style docstrings)
+napoleon_google_docstring = True
+napoleon_numpy_docstring = True
+napoleon_include_init_with_doc = True
+napoleon_include_private_with_doc = False
+
+# MyST parser settings (for Markdown support)
+myst_enable_extensions = [
+ "colon_fence",
+ "fieldlist",
+]
+
+# Intersphinx mapping
+intersphinx_mapping = {
+ "python": ("https://docs.python.org/3", None),
+}
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = "pydata_sphinx_theme"
+html_title = "evals"
+root_doc = "index"
+
+html_theme_options = {
+ # # Logo
+ # "logo": {
+ # "image_light": "_static/images/logo.png",
+ # "image_dark": "_static/images/logo.png",
+ # },
+ # Show all top-nav tabs instead of collapsing to "More ▾"
+ "header_links_before_dropdown": 4,
+ # Top-right icons
+ "icon_links": [
+ {
+ "name": "GitHub",
+ "url": "https://github.com/flutter/evals",
+ "icon": "fa-brands fa-github",
+ },
+ ],
+ # --- Header / Navigation Bar ---
+ # Left: logo
+ "navbar_start": ["navbar-logo"],
+ # Center: top-level section tabs (Guides, Reference, Contributing)
+ # These are auto-generated from root-level toctree entries in index.md.
+ "navbar_center": ["navbar-nav"],
+ # Right: theme switcher + icon links
+ "navbar_end": ["theme-switcher", "navbar-icon-links"],
+ # Persistent right (stays visible even on small screens)
+ "navbar_persistent": ["search-button"],
+ # Align nav tabs to the left, closer to the logo
+ "navbar_align": "left",
+ # --- Primary sidebar (left) ---
+ # Show 2 levels of nav expanded by default
+ "show_nav_level": 1,
+ # --- Secondary sidebar (right) ---
+ # Shows the current page's table of contents
+ "secondary_sidebar_items": ["page-toc"],
+ # --- Syntax highlighting ---
+ "pygments_light_style": "xcode",
+ "pygments_dark_style": "monokai",
+}
+
+# --- Primary sidebar (left) ---
+# Shows section sub-navigation (e.g. Guides subpages) via sidebar-nav-bs.
+# This is the correct way to configure the left sidebar in PyData theme.
+# Use page-glob patterns to customise per-section, e.g. {"index": []} to hide.
+html_sidebars = {
+ "**": ["sidebar-nav-bs"],
+}
+
+# Static files
+html_static_path = ["_static"]
+html_css_files = ["custom.css"]
+
+# Source file suffixes
+source_suffix = {
+ ".rst": "restructuredtext",
+ ".md": "markdown",
+}
diff --git a/docs/contributing/guide.md b/docs/contributing/guide.md
new file mode 100644
index 0000000..f001c14
--- /dev/null
+++ b/docs/contributing/guide.md
@@ -0,0 +1,305 @@
+# Contributing
+
+Welcome to the Dart/Flutter LLM evaluation project! This repository contains tools for running and analyzing AI model evaluations on Dart and Flutter tasks.
+
+---
+
+## Table of Contents
+
+- [dash_evals](#dash_evals)
+ - [Setup](#setup)
+ - [Write a New Eval](#write-a-new-eval)
+ - [Add Your Sample to the Dataset](#add-your-sample-to-the-dataset)
+ - [Edit the Config to Run Only Your New Sample](#edit-the-config-to-run-only-your-new-sample)
+ - [Verify the Sample Works](#verify-the-sample-works)
+ - [What to Commit (and Not Commit!)](#what-to-commit-and-not-commit)
+ - [Add Functionality to the Runner](#add-functionality-to-the-runner)
+ - [Understand Tasks, Solvers, and Scorers](#understand-tasks-solvers-and-scorers)
+ - [Add a New Task](#add-a-new-task)
+ - [Test and Verify](#test-and-verify)
+- [eval_explorer](#eval_explorer)
+
+## dash_evals
+
+### Setup
+
+1. **Prerequisites**
+ - Python 3.13+
+ - Podman or Docker (for sandbox execution)
+ - API keys for the models you want to test
+
+2. **Create and activate a virtual environment**
+
+ ```bash
+ cd packages/dash_evals
+ python -m venv .venv
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
+ ```
+
+3. **Install dependencies**
+
+ ```bash
+ pip install -e . # Core dependencies
+ pip install -e ".[dev]" # Development dependencies (pytest, ruff, etc.)
+ ```
+
+4. **Configure API keys**
+
+ You only need to configure the keys you plan on testing.
+
+ ```bash
+ export GEMINI_API_KEY=your_key_here
+ export ANTHROPIC_API_KEY=your_key_here
+ export OPENAI_API_KEY=your_key_here
+ ```
+
+5. **Verify installation**
+
+ ```bash
+ run-evals --help
+ ```
+
+---
+
+### Write a New Eval
+
+The most common contribution is adding new evaluation samples. Each sample tests a specific capability or scenario.
+
+#### Add Your Sample to the Dataset
+
+1. **Decide which task your sample belongs to**
+
+ Review the available tasks in `dataset/tasks/` or run `devals create task` to see available task functions:
+
+ | Task | Purpose |
+ |------|---------|
+ | `question_answer` | Q&A evaluation for Dart/Flutter knowledge |
+ | `bug_fix` | Agentic debugging of code in a sandbox |
+ | `flutter_bug_fix` | Flutter-specific bug fix (wraps `bug_fix`) |
+ | `code_gen` | Generate code from specifications |
+ | `flutter_code_gen` | Flutter-specific code gen (wraps `code_gen`) |
+ | `mcp_tool` | Test MCP tool usage |
+ | `analyze_codebase` | Evaluate codebase analysis |
+ | `skill_test` | Test skill file usage in sandboxes |
+
+2. **Create your sample file**
+
+ Use `devals create sample` for interactive sample creation, or add a sample inline in the task's `task.yaml` file under `dataset/tasks/<task_id>/task.yaml`:
+
+ ```yaml
+ id: dart_your_sample_id
+ input: |
+ Your prompt to the model goes here.
+ target: |
+ Criteria for grading the response. This is used by the scorer
+ to determine if the model's output is acceptable.
+ metadata:
+ added: 2025-02-04
+ tags: [dart, async] # Optional categorization
+ ```
+
+ For agentic tasks (bug fix, code gen), you'll also need a workspace:
+
+ ```yaml
+ id: flutter_fix_some_bug
+ input: |
+ The app crashes when the user taps the submit button.
+ Debug and fix the issue.
+ target: |
+ The fix should handle the null check in the onPressed callback.
+ workspace:
+ template: flutter_app # Use a reusable template
+ # OR
+ path: ./project # Custom project relative to sample directory
+ ```
+
+3. **Add your sample to the task's `task.yaml`**
+
+ Add your sample inline in the appropriate task's `samples` list:
+
+ ```yaml
+ # dataset/tasks/dart_question_answer/task.yaml
+ func: question_answer
+ samples:
+ - id: dart_your_sample_id
+ input: |
+ Your prompt to the model goes here.
+ target: |
+ Criteria for grading the response.
+ ```
+
+#### Edit the Config to Run Only Your New Sample
+
+Before committing, test your sample by creating a job file. Use `devals create job` interactively, or manually create one in `dataset/jobs/`:
+
+```yaml
+# jobs/test_my_sample.yaml
+name: test_my_sample
+
+# Run only the task containing your sample
+tasks:
+ dart_question_answer:
+ allowed_variants: [baseline] # Start with baseline variant
+ include-samples:
+ - dart_your_sample_id # Only run your specific sample
+
+# Use a fast model for testing
+models: [google/gemini-2.5-flash]
+```
+
+Then run with your job:
+
+```bash
+devals run test_my_sample
+```
+
+#### Verify the Sample Works
+
+1. **Dry run first** — validates configuration without making API calls:
+
+ ```bash
+ devals run test_my_sample --dry-run
+ ```
+
+2. **Run the evaluation**:
+
+ ```bash
+ devals run test_my_sample
+ ```
+
+3. **Check the output** in the `logs/` directory. Verify:
+ - The model received your prompt correctly
+ - The scorer evaluated the response appropriately
+ - No errors occurred during execution
+
+#### What to Commit (and Not Commit!)
+
+**Do commit:**
+- Your updated task file(s) in `dataset/tasks/`
+- Any new workspace templates or context files
+
+**Do NOT commit:**
+- Test job files in `dataset/jobs/` (if they were only for local testing)
+- Log files in `logs/`
+- API keys or `.env` files
+
+Before submitting your PR, clean up any test job files you created:
+
+```bash
+git status # Check for untracked/modified job files
+```
+
+---
+
+### Add Functionality to the Runner
+
+If you're adding new task types, scorers, or solvers, this section is for you.
+
+#### Understand Tasks, Solvers, and Scorers
+
+The dash_evals runner uses [Inspect AI](https://inspect.aisi.org.uk/) concepts:
+
+| Component | Purpose | Location |
+|-----------|---------|----------|
+| **Task** | Defines what to evaluate — combines dataset, solver chain, and scorers | `runner/tasks/` |
+| **Solver** | Processes inputs (e.g., injects context, runs agent loops) | `runner/solvers/` |
+| **Scorer** | Evaluates outputs (e.g., model grading, dart analyze, flutter test) | `runner/scorers/` |
+
+A typical task structure:
+
+```python
+from inspect_ai import Task, task
+from inspect_ai.dataset import MemoryDataset
+
+@task
+def your_new_task(dataset: MemoryDataset, task_def: dict) -> Task:
+ return Task(
+ name=task_def.get("name", "your_new_task"),
+ dataset=dataset,
+ solver=[
+ add_system_message("Your system prompt"),
+ context_injector(task_def),
+ # ... more solvers
+ ],
+ scorer=[
+ model_graded_scorer(),
+ dart_analyze_scorer(),
+ ],
+ )
+```
+
+#### Add a New Task
+
+1. **Create your task file** at `src/dash_evals/runner/tasks/your_task.py`
+
+2. **Export it** from `src/dash_evals/runner/tasks/__init__.py`:
+
+ ```python
+ from .your_task import your_new_task
+
+ __all__ = [
+ # ... existing tasks ...
+ "your_new_task",
+ ]
+ ```
+
+ Task functions are discovered dynamically via `importlib`. If the function name matches a module in `runner/tasks/`, it will be found automatically when referenced from a `task.yaml` file. No registry is needed.
+
+3. **Create a task directory** in `dataset/tasks/`:
+
+ ```
+ dataset/tasks/your_task_id/
+ └── task.yaml
+ ```
+
+ ```yaml
+ # dataset/tasks/your_task_id/task.yaml
+ func: your_new_task # Must match the function name
+ samples:
+ - id: sample_001
+ input: |
+ Your prompt here.
+ target: |
+ Expected output or grading criteria.
+ ```
+
+#### Test and Verify
+
+1. **Run the test suite**:
+
+ ```bash
+ cd packages/dash_evals
+ pytest
+ ```
+
+2. **Run linting**:
+
+ ```bash
+ ruff check src/dash_evals
+ ruff format src/dash_evals
+ ```
+
+3. **Test your task end-to-end**:
+
+ ```bash
+ devals run test_my_sample --dry-run # Validate config
+ devals run test_my_sample # Run actual evaluation
+ ```
+
+---
+
+## eval_explorer
+
+A Dart/Flutter application for exploring evaluation results, built with [Serverpod](https://serverpod.dev/).
+
+> [!NOTE]
+> The eval_explorer is under active development. Contribution guidelines coming soon!
+
+The package is located in `packages/eval_explorer/` and consists of:
+
+| Package | Description |
+|---------|-------------|
+| `eval_explorer_client` | Dart client package |
+| `eval_explorer_flutter` | Flutter web/mobile app |
+| `eval_explorer_server` | Serverpod backend |
+| `eval_explorer_shared` | Shared models |
diff --git a/docs/contributing/index.md b/docs/contributing/index.md
new file mode 100644
index 0000000..9b2ea78
--- /dev/null
+++ b/docs/contributing/index.md
@@ -0,0 +1,20 @@
+# Contributor Guides
+
+Documentation about how the evals project works internally, aimed at contributors.
+
+```{toctree}
+:maxdepth: 2
+
+guide
+repository_structure
+```
+
+```{toctree}
+:maxdepth: 2
+:caption: Packages
+
+packages/dash_evals
+packages/dataset_config_dart
+packages/devals_cli
+packages/eval_explorer
+```
diff --git a/docs/contributing/packages/dash_evals.md b/docs/contributing/packages/dash_evals.md
new file mode 100644
index 0000000..3fd714a
--- /dev/null
+++ b/docs/contributing/packages/dash_evals.md
@@ -0,0 +1,91 @@
+# dash_evals
+
+Python package for running LLM evaluations on Dart and Flutter tasks using [Inspect AI](https://inspect.aisi.org.uk/). Located in `packages/dash_evals/`.
+
+For setup instructions, see the [Quick Start](/guides/quick_start.md) or [Contributing Guide](../guide.md).
+
+---
+
+## Available Tasks
+
+| Task | Description |
+|------|-------------|
+| `question_answer` | Q&A evaluation for Dart/Flutter knowledge |
+| `bug_fix` | Agentic debugging of code in a sandbox |
+| `flutter_bug_fix` | Flutter-specific bug fix (wraps `bug_fix`) |
+| `code_gen` | Generate code from specifications |
+| `flutter_code_gen` | Flutter-specific code gen (wraps `code_gen`) |
+| `mcp_tool` | Evaluate MCP tool usage (pub.dev search, project creation, etc.) |
+| `analyze_codebase` | Evaluate codebase analysis and comprehension |
+| `skill_test` | Evaluate use of skill files in a sandbox |
+
+---
+
+## Architecture
+
+```
+src/dash_evals/
+├── main.py # CLI entry point (dual-mode)
+├── runner/
+│ ├── json_runner.py # Mode 1: run from EvalSet JSON manifest
+│ ├── args_runner.py # Mode 2: run from direct CLI arguments
+│ ├── tasks/ # @task functions (question_answer, bug_fix, code_gen, etc.)
+│ ├── scorers/ # Scoring logic (model_graded, dart_analyze, flutter_test, etc.)
+│ ├── solvers/ # Solver chains (context injection, system messages)
+│ └── sandboxes/ # Sandbox environments (podman)
+├── models/ # Data models
+└── utils/ # Logging and helpers
+```
+
+### Data Flow
+
+1. **Configure**: The Dart `dataset_config_dart` package parses dataset YAML and resolves it into an `EvalSet` JSON manifest
+2. **Load**: The Python runner reads the JSON manifest via `json_runner.py`, resolving task functions dynamically with `importlib`
+3. **Execute**: Each task function receives its dataset and task definition, producing an `inspect_ai.Task`
+4. **Score**: Scorers evaluate model outputs against targets
+5. **Log**: Results written to the configured `log_dir`
+
+Alternatively, the runner can be invoked directly with `--task` and `--model` arguments (via `args_runner.py`), bypassing the Dart config pipeline.
+
+---
+
+## Usage
+
+```bash
+# Mode 1: Run from JSON manifest (produced by Dart CLI)
+run-evals --json ./eval_set.json
+
+# Mode 2: Run a single task directly
+run-evals --task flutter_code_gen --model google/gemini-2.5-flash --dataset samples.jsonl
+
+# Additional options (both modes)
+run-evals --task bug_fix --model openai/gpt-4o \
+ --log-dir ./logs \
+ --sandbox podman compose.yaml \
+ --max-connections 10
+```
+
+---
+
+## Testing
+
+```bash
+# Run all tests
+pytest
+
+# Run with coverage
+pytest --cov=dash_evals
+
+# Run specific test
+pytest tests/test_parsers.py -v
+```
+
+---
+
+## Linting
+
+```bash
+# Run ruff
+ruff check src/dash_evals
+ruff format src/dash_evals
+```
diff --git a/docs/contributing/packages/dataset_config_dart.md b/docs/contributing/packages/dataset_config_dart.md
new file mode 100644
index 0000000..fe78883
--- /dev/null
+++ b/docs/contributing/packages/dataset_config_dart.md
@@ -0,0 +1,129 @@
+# dataset_config_dart
+
+Dart library for resolving eval dataset YAML into EvalSet JSON for the Python runner. Also contains the shared data models (e.g., `EvalSet`, `Task`, `Sample`, `Variant`, `Job`) used across the eval pipeline. Python equivalents of these models live in `dash_evals_config`. Located in `packages/dataset_config_dart/`.
+
+---
+
+## Architecture
+
+The package follows a layered pipeline design:
+
+```
+YAML / JSON files
+ │
+ ▼
+┌──────────┐
+│ Parser │ YamlParser · JsonParser
+└────┬─────┘
+ │ => List&lt;ParsedTask&gt;, Job
+ ▼
+┌──────────┐
+│ Resolver │ EvalSetResolver
+└────┬─────┘
+ │ => List&lt;EvalSet&gt;
+ ▼
+┌──────────┐
+│ Writer │ EvalSetWriter
+└────┬─────┘
+ │ => JSON file(s) on disk
+ ▼
+ Python dash_evals
+```
+
+The JSON files written to disk conform to the InspectAI API for `eval_set`, which is an
+entry point from which to start running evals.
+
+
+| Layer | Class | Responsibility |
+|-------|-------|----------------|
+| **Parsers** | `YamlParser`, `JsonParser` | Read task YAML and job files into `ParsedTask` and `Job` objects |
+| **Resolvers** | `EvalSetResolver` | Combine parsed tasks with a job to produce fully resolved `EvalSet` objects (expanding models, variants, sandbox config, etc.) |
+| **Writers** | `EvalSetWriter` | Serialize `EvalSet` objects to JSON files that the Python runner can consume |
+| **Facade** | `ConfigResolver` | Single-call convenience that composes Parser → Resolver |
+
+---
+
+## Quick Start
+
+```dart
+import 'package:dataset_config_dart/dataset_config_dart.dart';
+
+// Single-call convenience
+final resolver = ConfigResolver();
+final configs = resolver.resolve(datasetPath, ['my_job']);
+
+// Or use the layers individually
+final parser = YamlParser();
+final tasks = parser.parseTasks(datasetPath);
+final job = parser.parseJob(jobPath, datasetPath);
+
+final evalSetResolver = EvalSetResolver();
+final evalSets = evalSetResolver.resolve(tasks, job, datasetPath);
+
+final writer = EvalSetWriter();
+writer.write(evalSets, outputDir);
+```
+
+---
+
+## Data Models
+
+This package also contains the shared Dart data models used across the eval pipeline. All models are built with [Freezed](https://pub.dev/packages/freezed) for immutability, pattern matching, and JSON serialization via [json_serializable](https://pub.dev/packages/json_serializable).
+
+> [!NOTE]
+> Python equivalents of these models live in the `dash_evals_config` package.
+
+### Config Models
+
+| Model | Description |
+|-------|-------------|
+| `Job` | A job configuration — runtime settings, model/variant/task selection, and `eval_set()` overrides |
+| `JobTask` | Per-task overrides within a job (sample filtering, custom system messages) |
+| `Variant` | A named configuration variant (e.g. `baseline`, `with_docs`) applied to task runs |
+| `ContextFile` | A file to inject into the sandbox as additional context for the model |
+
+### Inspect AI Models
+
+Mirror the Python [Inspect AI](https://inspect.aisi.org.uk/) types so that Dart can produce JSON the Python runner understands directly.
+
+| Model | Description |
+|-------|-------------|
+| `EvalSet` | Maps to `inspect_ai.eval_set()` parameters — the top-level run definition |
+| `Task` | A single evaluation task with its solver, scorer, dataset, and sandbox config |
+| `TaskInfo` | Lightweight task metadata (name and function reference) |
+| `Sample` | An individual evaluation sample (input, target, metadata) |
+| `Dataset` | A dataset definition (samples file path and field mappings) |
+| `FieldSpec` | Maps dataset columns to sample fields |
+| `EvalLog` | Comprehensive log structure for evaluation results |
+
+---
+
+## Source Layout
+
+```
+lib/
+├── dataset_config_dart.dart # Library barrel file
+└── src/
+ ├── config_resolver.dart # Convenience facade
+ ├── parsed_task.dart # Intermediate parsed-task model
+ ├── parsers/
+ │ ├── parser.dart # Abstract parser interface
+ │ ├── yaml_parser.dart # YAML file parser
+ │ └── json_parser.dart # JSON map parser
+ ├── resolvers/
+ │ └── eval_set_resolver.dart
+ ├── writers/
+ │ └── eval_set_writer.dart
+ ├── runner_config_exception.dart
+ └── utils/
+ └── yaml_utils.dart
+```
+
+---
+
+## Testing
+
+```bash
+cd packages/dataset_config_dart
+dart test
+```
diff --git a/docs/contributing/packages/devals_cli.md b/docs/contributing/packages/devals_cli.md
new file mode 100644
index 0000000..96c8e8f
--- /dev/null
+++ b/docs/contributing/packages/devals_cli.md
@@ -0,0 +1,95 @@
+# devals_cli (devals)
+
+Dart CLI for managing evals — initialize datasets, create samples, run evaluations, and view results. Located in `packages/devals_cli/`.
+
+For setup instructions, see the [Quick Start](../../guides/quick_start.md) or [Contributing Guide](../guide.md).
+
+---
+
+## Commands
+
+| Command | Description |
+|---------|-------------|
+| `devals init` | Initialize a new dataset in the current directory (creates `devals.yaml`, a starter task, and a starter job) |
+| `devals doctor` | Check that prerequisites are installed (Dart, Python, dash_evals, Podman, Flutter, Serverpod, API keys) |
+| `devals create sample` | Interactively add a new sample to an existing task |
+| `devals create task` | Interactively create a new task file in `tasks/<task_name>/task.yaml` |
+| `devals create job` | Interactively create a new job file |
+| `devals create pipeline` | Guided flow to create a task and job together |
+| `devals run <job>` | Resolve config and run evaluations via the Python dash_evals runner |
+| `devals publish <log_dir>` | Upload Inspect AI log files to Google Cloud Storage |
+| `devals view [log_path]` | Launch the Inspect AI viewer to browse evaluation results |
+
+---
+
+## Usage
+
+```bash
+# Scaffold a new dataset
+devals init
+
+# Check your environment
+devals doctor
+
+# Create a new eval (task + job in one step)
+devals create pipeline
+
+# Run evaluations
+devals run local_dev
+
+# Preview without executing
+devals run local_dev --dry-run
+
+# Upload logs to GCS
+devals publish logs/2026-01-07_17-11-47/
+
+# View results
+devals view
+```
+
+---
+
+## How `devals run` Works
+
+1. The CLI resolves the job YAML into `EvalSet` objects using the [dataset_config_dart](./dataset_config_dart.md) package (entirely in Dart)
+2. `EvalSetWriter` writes the resolved config to a JSON file
+3. The CLI invokes `run-evals --json <manifest_path>` to hand off to the Python [dash_evals](./dash_evals.md)
+
+With `--dry-run`, the CLI resolves and validates the config without calling the Python runner.
+
+---
+
+## Source Layout
+
+```
+bin/
+└── devals.dart # Entry point
+lib/
+├── devals.dart # Library barrel file
+└── src/
+ ├── runner.dart # DevalRunner (CommandRunner)
+ ├── cli_exception.dart # CLI-specific exceptions
+ ├── commands/ # Command implementations
+ │ ├── init_command.dart
+ │ ├── doctor_command.dart
+ │ ├── create_command.dart
+ │ ├── create_sample_command.dart
+ │ ├── create_task_command.dart
+ │ ├── create_job_command.dart
+ │ ├── create_pipeline_command.dart
+ │ ├── run_command.dart
+ │ ├── publish_command.dart
+ │ └── view_command.dart
+ ├── config/ # Environment and .env helpers
+ ├── dataset/ # Dataset reading, writing, templates
+ └── gcs/ # Google Cloud Storage client
+```
+
+---
+
+## Testing
+
+```bash
+cd packages/devals_cli
+dart test
+```
diff --git a/docs/contributing/packages/eval_explorer.md b/docs/contributing/packages/eval_explorer.md
new file mode 100644
index 0000000..41ea0bd
--- /dev/null
+++ b/docs/contributing/packages/eval_explorer.md
@@ -0,0 +1,70 @@
+# eval_explorer
+
+Dart/Flutter application for browsing and analyzing evaluation results. Built with [Serverpod](https://serverpod.dev/). Located in `packages/eval_explorer/`.
+
+> [!NOTE]
+> The eval_explorer is under active development and will eventually replace the legacy `report_app` + `uploader` pipeline.
+
+## Sub-packages
+
+| Package | Description |
+|---------|-------------|
+| `eval_explorer_client` | Dart client package (mostly generated by Serverpod) |
+| `eval_explorer_flutter` | Flutter web/mobile app |
+| `eval_explorer_server` | Serverpod backend |
+| `eval_explorer_shared` | Shared models |
+
+---
+
+## Prerequisites
+
+- [Podman](https://podman.io/) (Docker substitute for Googlers)
+- Podman Compose (`brew install podman-compose`)
+
+---
+
+## Running the Server
+
+Start Postgres and Redis:
+
+```bash
+cd packages/eval_explorer/eval_explorer_server
+podman-compose up --detach
+```
+
+Start the Serverpod server:
+
+```bash
+dart bin/main.dart
+```
+
+When finished, stop the server with `Ctrl-C`, then shut down Postgres and Redis:
+
+```bash
+podman-compose down
+```
+
+---
+
+## Running the Flutter App
+
+Make sure the server is running first, then:
+
+```bash
+cd packages/eval_explorer/eval_explorer_flutter
+flutter run
+```
+
+---
+
+## Installing Fixtures
+
+Eval datasets and individual questions are kept in the `datasets` folder at the root of this repository. To load them into the database:
+
+> [!NOTE]
+> Make sure Postgres is running via `podman-compose up --detach` before running this command.
+
+```bash
+cd packages/eval_explorer/eval_explorer_server
+serverpod run fixtures
+```
diff --git a/docs/contributing/repository_structure.md b/docs/contributing/repository_structure.md
new file mode 100644
index 0000000..4ed859f
--- /dev/null
+++ b/docs/contributing/repository_structure.md
@@ -0,0 +1,108 @@
+# Repository Structure
+
+Overview of the evals repository layout.
+
+```
+evals/
+├── dataset/ # Evaluation data and configuration
+├── docs/ # Documentation
+├── packages/
+│ ├── devals_cli/ # Dart CLI for managing the dataset (devals)
+│ ├── dataset_config_dart/ # Dart library: YAML → EvalSet JSON
+│ ├── dash_evals/ # Python evaluation runner
+│ ├── dataset_config_python/ # Python configuration models
+│ └── eval_explorer/ # Dart/Flutter results viewer (Serverpod)
+├── tool/ # Utility scripts
+├── pubspec.yaml # Dart workspace configuration
+└── firebase.json # Firebase configuration
+```
+
+---
+
+## dataset/
+
+Contains all evaluation data, configurations, and resources. See the [Configuration Overview](./config/about.md) for detailed file formats.
+
+| Path | Description |
+|------|-------------|
+| `tasks/` | Task subdirectories with `task.yaml` files and inline samples |
+| `jobs/` | Job files for different run configurations |
+| `context_files/` | Context markdown files for prompt injection |
+| `sandboxes/` | Container configuration (Containerfile, compose.yaml) |
+| `workspaces/` | Reusable project templates (flutter_app, dart_package) |
+
+---
+
+## packages/
+
+### dataset_config_dart/
+
+Dart package that converts dataset YAML into EvalSet JSON for the Python runner. Provides a layered API:
+
+```
+dataset_config_dart/
+├── lib/
+│ ├── dataset_config_dart.dart # Library barrel file
+│ └── src/
+│ ├── config_resolver.dart # Facade: single-call convenience API
+│ ├── parsed_task.dart # Intermediate parsing type
+│ ├── parsers/ # YamlParser, JsonParser
+│ ├── resolvers/ # EvalSetResolver
+│ ├── writers/ # EvalSetWriter
+│ └── utils/ # YAML helpers
+├── bin/ # CLI entry points
+└── test/ # Dart test suite
+```
+
+---
+
+### dash_evals/
+
+Python package for running LLM evaluations using [Inspect AI](https://inspect.aisi.org.uk/).
+
+```
+dash_evals/
+├── src/dash_evals/
+│ ├── main.py # CLI entry point (--json or --task mode)
+│ ├── runner/
+│ │ ├── json_runner.py # Run from EvalSet JSON manifest
+│ │ ├── args_runner.py # Run from direct CLI arguments
+│ │ ├── tasks/ # Task implementations (@task functions)
+│ │ ├── scorers/ # Scoring logic
+│ │ ├── solvers/ # Solver chains
+│ │ └── sandboxes/ # Sandbox environments
+│ ├── models/ # Data models
+│ └── utils/ # Logging and helpers
+├── tests/ # Pytest test suite
+└── pyproject.toml # Package configuration
+```
+
+---
+
+### devals_cli/ (devals)
+
+Dart CLI for creating and managing evaluation tasks and jobs. See the [CLI documentation](./cli.md) for full command reference.
+
+```
+devals_cli/
+├── bin/devals.dart # CLI entry point
+├── lib/src/
+│ ├── commands/ # Command implementations
+│ ├── console/ # Console UI and prompts
+│ ├── dataset/ # Dataset file utilities and discovery
+│ └── yaml/ # YAML generation and parsing
+└── test/ # Dart test suite
+```
+
+
+### eval_explorer/
+
+Dart/Flutter application for exploring evaluation results. Built with [Serverpod](https://serverpod.dev/).
+
+```
+eval_explorer/
+├── eval_explorer_client/ # Dart client package
+├── eval_explorer_flutter/ # Flutter web/mobile app
+├── eval_explorer_server/ # Serverpod backend
+└── eval_explorer_shared/ # Shared models
+```
diff --git a/docs/guides/config.md b/docs/guides/config.md
new file mode 100644
index 0000000..aef6aba
--- /dev/null
+++ b/docs/guides/config.md
@@ -0,0 +1,43 @@
+# Config guide
+
+Evals uses a layered YAML configuration system. You define **what** to evaluate (tasks and samples), **how** to run it (jobs), and **where** code executes (sandboxes). The CLI resolves these files into a single manifest and hands it to the Python runner — so most of the time you're just editing YAML.
+
+This page walks through the main concepts and how they connect.
+
+## **Dataset**
+
+The Dataset is the collection of Tasks and Samples that are run through the Python tool. A
+Sample is, at a minimum, an input and a target. These are essentially test cases.
+
+In evals, the definition of a dataset is expanded to include all fixtures needed for running evals, and all of these definitions live in the `dataset/` directory of the GitHub repository.
+
+> [!NOTE]
+> The following diagrams provide a mental model. (They also provide a literal representation of how it works.) A lot of this is hidden from you, the user or sample author, so don’t let it overwhelm!
+
+
+
+* **Samples** - individual eval case
+* **Models** we run against
+* **Variants** - Different configurations for the agent being evaluated, e.g. with Dart MCP, with or without skills, with and without rules files, and every combination of those things.
+* **Tasks** - A task is a Python function entrypoint for one “type” of evals. For example, “question_answer”, “code_gen”, “mcp_create_project” are a few of the tasks we support. Each task generally takes a list of specific samples that are configured to run for that task.
+* **Workspaces** (The codebase that the agent is tinkering with in an eval)
+* **Sandbox definitions** (host machine, podman, docker)
+* **Default runtime configurations**
+
+### **Tasks are the basic unit of defining eval runs.**
+
+
+
+### **Job files are run configuration**
+
+
+
+### **Then evals run based on that job file:**
+
+
+
+This means you care about job files and task files. Job files might look like this:
+
+- job/main.yaml (runs the whole thing)
+- job/ci.yaml (a job that runs as part of ci)
+- job/local_dev.yaml (a job that is .gitignored, used for quick iteration)
diff --git a/docs/guides/index.md b/docs/guides/index.md
new file mode 100644
index 0000000..73e04a8
--- /dev/null
+++ b/docs/guides/index.md
@@ -0,0 +1,11 @@
+# Guides
+
+Get started with evals — learn how to author and run your own evaluations.
+
+```{toctree}
+:maxdepth: 1
+
+quick_start
+tutorial
+config
+```
diff --git a/docs/guides/quick_start.md b/docs/guides/quick_start.md
new file mode 100644
index 0000000..dd70a26
--- /dev/null
+++ b/docs/guides/quick_start.md
@@ -0,0 +1,140 @@
+# Get started
+
+A guide to using evals as a framework for the local development of your own evals.
+
+## Prerequisites
+
+| Tool | Version | Purpose |
+|------|---------|---------|
+| [Dart SDK](https://dart.dev/get-dart) | 3.10+ | Runs the `devals` CLI |
+| [Python](https://www.python.org/) | 3.13+ | Runs the `dash_evals` runner |
+
+You'll also need an API key for at least one model provider (`GOOGLE_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY`).
+
+## 1. Install the packages
+
+```bash
+git clone https://github.com/flutter/evals.git
+cd evals
+pip install -e packages/dash_evals
+dart pub global activate devals --source path packages/devals_cli
+
+
+## TODO: Integrate in the new repo. This is wrong for this repo
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -e "packages/dash_evals[dev]"
+pip install -e "packages/dataset_config_python[dev]"
+```
+
+This installs two things:
+
+- **`devals`** (Dart) — the CLI you'll use for every command. It resolves YAML configuration into a JSON manifest and delegates execution.
+- **`dash_evals`** (Python) — the runtime that receives the manifest and drives [Inspect AI](https://inspect.aisi.org.uk/)'s `eval_set()` to actually run evaluations.
+
+## 2. Check your environment
+
+```bash
+devals doctor
+```
+
+This runs a series of prerequisite checks — Dart SDK, Python version, whether `dash_evals` is installed, API keys, and optional tools like Podman and Flutter. Fix any errors it reports before continuing; warnings are safe to ignore for now.
+
+## 3. Set up Podman (optional)
+
+If your evals use containerized execution (`sandbox_type: podman` in a job YAML), you need Podman installed and a container image built. You can skip this step for basic evals that run locally.
+
+**Install Podman** (macOS):
+
+```bash
+brew install podman
+podman machine init
+podman machine start
+```
+
+**Build the Flutter sandbox image:**
+
+```bash
+cd examples/evals-dataset/evals/sandboxes/podman
+podman build -t flutter-sandbox:latest .
+```
+
+This builds `localhost/flutter-sandbox:latest`, which includes Ubuntu 24.04 and the Flutter SDK. The build takes a few minutes.
+
+> **Tip:** To target a different Flutter channel, pass `--build-arg FLUTTER_CHANNEL=beta` (or `main`).
+
+## 4. Configure API keys
+
+Make sure you have at least one model provider API key set as an environment variable. You can set them in your shell profile or in a `.env` file in your project root.
+
+```bash
+export GOOGLE_API_KEY=your_key_here
+```
+
+## 5. Initialize your dataset
+
+Run `devals init` from the root of the project you want to evaluate. This is typically a Dart or Flutter project — the scaffolded starter task will point back at your project as its workspace.
+
+```bash
+cd ~/my-flutter-app
+devals init
+```
+
+This creates two things:
+
+- **`devals.yaml`** in your project root — a marker file that tells the CLI where your eval dataset lives (defaults to `./evals`).
+- **`evals/`** directory with the following structure:
+
+```
+my-flutter-app/
+├── devals.yaml # ← marker file
+└── evals/
+ ├── tasks/
+ │ └── get_started/
+ │ └── task.yaml # starter task + sample
+ └── jobs/
+ └── local_dev.yaml # job ready to run
+```
+
+The starter task uses the `analyze_codebase` task function, which asks the model to
+explore your project and suggest an improvement. It's a good smoke-test that
+doesn't require a sandbox or any extra setup.
+
+
+## 6. Run your first eval
+
+```bash
+devals run local_dev
+```
+
+Behind the scenes, this:
+
+1. Resolves your YAML config (job + tasks + samples) into an EvalSet JSON manifest
+2. Passes the manifest to the Python `dash_evals` runner
+3. `dash_evals` calls Inspect AI's `eval_set()`, which sends prompts, collects responses, and scores results
+4. Logs are written to a `logs/` directory (a sibling of `evals/`)
+
+To preview the resolved configuration without actually making API calls:
+
+```bash
+devals run local_dev --dry-run
+```
+
+This prints every task × model × variant combination that would execute, so you can verify your setup before spending API credits.
+
+## 7. View results
+
+```bash
+devals view
+```
+
+This launches the [Inspect AI log viewer](https://inspect.aisi.org.uk/log-viewer.html) — a local web UI where you can browse runs, inspect individual samples, view scores, and read full conversation transcripts. It automatically finds your `logs/` directory based on `devals.yaml`.
+
+---
+
+## Next steps
+
+- **Add more samples** — `devals create sample`
+- **Add tasks** — `devals create task`
+- **Create targeted jobs** — `devals create job`
+- **Interactive walkthrough** — `devals create pipeline` guides you through creating a sample, task, and job in one go
+- **[Follow the tutorial](tutorial.md)** — a hands-on walkthrough of authoring a code-generation task from scratch
diff --git a/docs/guides/tutorial.md b/docs/guides/tutorial.md
new file mode 100644
index 0000000..5776963
--- /dev/null
+++ b/docs/guides/tutorial.md
@@ -0,0 +1,287 @@
+# Author evals
+
+This tutorial picks up where [Get Started](quick_start.md) left off.
+By the end, you'll have:
+
+1. Authored a task file with two **code-generation** samples
+2. Created a job file that targets your new task
+3. Run the job and watched Inspect AI execute it
+4. Opened the Inspect log viewer to review results
+
+> [!NOTE]
+> This guide assumes you've already completed the [Get Started](quick_start.md) guide and
+> have a working `devals` installation with at least one model API key configured.
+
+---
+
+## 1. Create the task
+
+A **task** tells the framework *what* to evaluate. Each task lives in its own subdirectory
+under `evals/tasks/` and contains a `task.yaml` file.
+
+### 1.1 Set up a workspace
+
+Code-generation tasks need a **workspace** — a starter project the model writes code into
+and where tests run. Create a minimal Dart package to use as a template:
+
+```
+evals/
+└── workspaces/
+ └── dart_package/
+ ├── pubspec.yaml
+ └── lib/
+ └── main.dart
+```
+
+```{code-block} yaml
+---
+caption: evals/workspaces/dart_package/pubspec.yaml
+---
+name: dart_package_template
+description: Minimal Dart package template
+version: 1.0.0
+publish_to: none
+
+environment:
+ sdk: '>=3.0.0 <4.0.0'
+
+dev_dependencies:
+ test: ^1.24.0
+```
+
+```{code-block} dart
+---
+caption: evals/workspaces/dart_package/lib/main.dart
+---
+// Starter file — the model will overwrite this.
+```
+
+> [!TIP]
+> You can also point `workspace` at your existing project root, a Flutter app,
+> or any directory that already has a `pubspec.yaml`.
+
+### 1.2 Write a test file
+
+Each sample can have its own test file that the scorer runs automatically. Create a
+test for the first sample:
+
+```
+evals/
+└── tasks/
+ └── dart_code_gen/
+ ├── task.yaml ← (you'll create this next)
+ └── tests/
+ └── fizzbuzz_test.dart
+```
+
+```{code-block} dart
+---
+caption: evals/tasks/dart_code_gen/tests/fizzbuzz_test.dart
+---
+import 'package:test/test.dart';
+import 'package:dart_package_template/main.dart';
+
+void main() {
+ test('fizzBuzz returns correct values', () {
+ expect(fizzBuzz(3), 'Fizz');
+ expect(fizzBuzz(5), 'Buzz');
+ expect(fizzBuzz(15), 'FizzBuzz');
+ expect(fizzBuzz(7), '7');
+ });
+
+ test('fizzBuzz handles 1', () {
+ expect(fizzBuzz(1), '1');
+ });
+}
+```
+
+### 1.3 Write the task.yaml
+
+Now create the task definition with two inline samples:
+
+```{code-block} yaml
+---
+caption: evals/tasks/dart_code_gen/task.yaml
+---
+# ============================================================
+# Task: Dart Code Generation
+# ============================================================
+# Uses the built-in `code_gen` task function which:
+# 1. Sends the prompt to the model
+# 2. Parses the structured code response
+# 3. Writes the code into the sandbox workspace
+# 4. Runs tests and scores the result
+
+func: code_gen
+workspace: ../../workspaces/dart_package
+
+samples:
+ inline:
+ # ── Sample 1: FizzBuzz ──────────────────────────────────
+ - id: fizzbuzz
+ difficulty: easy
+ tags: [dart, functions]
+ input: |
+ Write a top-level function called `fizzBuzz` that takes an
+ integer `n` and returns a String:
+ - "Fizz" if n is divisible by 3
+ - "Buzz" if n is divisible by 5
+ - "FizzBuzz" if divisible by both
+ - The number as a string otherwise
+
+ Write the complete lib/main.dart file.
+ target: |
+ The code must define a top-level `String fizzBuzz(int n)` function
+ that returns the correct value for all cases.
+ It must pass the tests in test/.
+ tests:
+ path: ./tests/fizzbuzz_test.dart
+
+ # ── Sample 2: Stack implementation ──────────────────────
+ - id: stack_class
+ difficulty: medium
+ tags: [dart, data-structures, classes]
+ input: |
+ Implement a generic Stack class in Dart with the
+ following methods:
+ - push(T item) — adds an item to the top
+ - T pop() — removes and returns the top item,
+ throws StateError if empty
+ - T peek() — returns the top item without removing it,
+ throws StateError if empty
+ - bool get isEmpty
+ - int get length
+
+ Write the complete lib/main.dart file.
+ target: |
+ The code must define a generic Stack class with push,
+ pop, peek, isEmpty, and length. pop and peek must throw
+ StateError when the stack is empty.
+```
+
+**Key fields explained:**
+
+| Field | What it does |
+|-------|-------------|
+| `func` | The Python `@task` function that runs the evaluation. `code_gen` is a built-in generic code-generation task. |
+| `workspace` | Path to the starter project (relative to the task directory). |
+| `samples.inline` | A list of test cases, each with an `input` prompt and a `target` grading criteria. |
+| `tests.path` | Path to test files the scorer runs against the generated code. |
+
+> [!NOTE]
+> See [Tasks](config/tasks.md) and [Samples](config/samples.md) for the
+> complete field reference.
+
+---
+
+## 2. Create the job
+
+A **job** controls *how* to run your tasks — which models to use, how many
+connections, and which tasks/variants to include.
+
+Create `evals/jobs/tutorial.yaml`:
+
+```{code-block} yaml
+---
+caption: evals/jobs/tutorial.yaml
+---
+# ============================================================
+# Job: tutorial
+# ============================================================
+# A focused job for the tutorial walkthrough.
+
+# Which model(s) to evaluate
+models:
+ - google/gemini-2.5-flash
+
+# Only run the code-gen task we just created
+tasks:
+ inline:
+ dart_code_gen: {}
+```
+
+That's the minimal job — it will:
+
+- Evaluate `google/gemini-2.5-flash`
+- Run every sample in the `dart_code_gen` task
+- Use the default `baseline` variant (no extra tools or context)
+
+> [!TIP]
+> You can add **variants** to test the model with additional context or tools.
+> For example:
+> ```yaml
+> variants:
+> baseline: {}
+> with_context:
+> context_files: [./context_files/dart_docs.md]
+> ```
+> See [Configuration Overview](config/about.md#variants) for details.
+
+---
+
+## 3. Run the job
+
+Make sure you're in your project directory (the one containing `devals.yaml`), then run:
+
+```bash
+devals run tutorial
+```
+
+What happens behind the scenes:
+
+1. The Dart `dataset_config_dart` package resolves your YAML into an EvalSet JSON manifest
+2. The Python `dash_evals` reads the manifest and calls Inspect AI's `eval_set()`
+3. Inspect AI creates a sandbox, sets up the workspace, sends prompts, runs tests, and scores results
+4. Logs are written to the `logs/` directory
+
+### Dry run first
+
+To preview the resolved configuration without making any API calls:
+
+```bash
+devals run tutorial --dry-run
+```
+
+This prints a summary of every task × model × variant combination that would
+execute, so you can verify everything looks right before spending API credits.
+
+### What to expect
+
+When the eval runs, you'll see Inspect AI's interactive terminal display showing
+progress for each sample. A typical run with two samples against one model takes
+1–3 minutes, depending on the model's response time.
+
+---
+
+## 4. View the results
+
+After the run completes, launch the Inspect AI log viewer:
+
+```bash
+devals view
+```
+
+This opens a local web UI (powered by Inspect AI) where you can:
+
+- **Browse runs** — see each task × model × variant combination
+- **Inspect samples** — view the model's generated code, scores, and any test output
+- **Compare variants** — if you defined multiple variants, compare how they performed side-by-side
+
+The viewer automatically points at your `logs/` directory. To view logs from a
+specific directory:
+
+```bash
+devals view path/to/logs
+```
+
+---
+
+## Next steps
+
+Now that you've run your first custom evaluation, here are some things to try:
+
+- **Add more samples** to your task: `devals create sample`
+- **Try different task types** — `question_answer`, `bug_fix`, or `flutter_code_gen`. See [all available task functions](../packages/dash_evals.md).
+- **Add variants** to test how context files or MCP tools affect performance. See [Variants](config/about.md#variants).
+- **Run multiple models** by adding more entries to the `models` list in your job file
+- **Read the config reference** for [Jobs](config/jobs.md), [Tasks](config/tasks.md), and [Samples](config/samples.md)
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..82d6d06
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,40 @@
+# evals
+
+A framework for authoring and running LLM evaluations on Dart and Flutter tasks.
+
+::::{grid} 1 1 3 3
+:gutter: 3
+
+:::{grid-item-card} 📖 Guides
+:link: guides/index
+:link-type: doc
+
+Learn how to author and run your own evaluations, from quick start to advanced configuration.
+:::
+
+:::{grid-item-card} 📚 Reference
+:link: reference/index
+:link-type: doc
+
+API documentation, CLI usage, configuration reference, and glossary.
+:::
+
+:::{grid-item-card} 🤝 Contributor Guides
+:link: contributing/index
+:link-type: doc
+
+Repository structure, package details, and how to contribute to the project.
+:::
+
+::::
+
+```{toctree}
+:hidden:
+
+guides/index
+reference/index
+contributing/index
+```
+
+*Example of AI doing a subpar job, maybe we should eval image gen:*
+
diff --git a/docs/reference/api/dash_evals/index.md b/docs/reference/api/dash_evals/index.md
new file mode 100644
index 0000000..2da5ec7
--- /dev/null
+++ b/docs/reference/api/dash_evals/index.md
@@ -0,0 +1,10 @@
+# dash_evals
+
+Main package entry points and overview.
+
+```{toctree}
+:maxdepth: 1
+
+overview
+main
+```
diff --git a/docs/reference/api/dash_evals/main.md b/docs/reference/api/dash_evals/main.md
new file mode 100644
index 0000000..5895b0d
--- /dev/null
+++ b/docs/reference/api/dash_evals/main.md
@@ -0,0 +1,7 @@
+# Main Entry Point
+
+CLI entry point for running evaluations.
+
+```{eval-rst}
+.. autofunction:: dash_evals.main.main
+```
diff --git a/docs/reference/api/dash_evals/overview.md b/docs/reference/api/dash_evals/overview.md
new file mode 100644
index 0000000..4100f41
--- /dev/null
+++ b/docs/reference/api/dash_evals/overview.md
@@ -0,0 +1,10 @@
+# Overview
+
+Package overview and exports.
+
+```{eval-rst}
+.. automodule:: dash_evals
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/runner/index.md b/docs/reference/api/runner/index.md
new file mode 100644
index 0000000..2058d5d
--- /dev/null
+++ b/docs/reference/api/runner/index.md
@@ -0,0 +1,17 @@
+# Runner Module
+
+The runner module executes evaluations using Inspect AI.
+
+It supports two modes:
+- **JSON mode** (`--json`): reads an `eval_set.json` manifest emitted by the Dart CLI
+- **Direct args mode** (`--task`, `--model`, etc.): runs a single task directly
+
+```{toctree}
+:maxdepth: 1
+
+runners
+tasks
+solvers
+scorers
+sandboxes
+```
diff --git a/docs/reference/api/runner/runners.md b/docs/reference/api/runner/runners.md
new file mode 100644
index 0000000..0297d8f
--- /dev/null
+++ b/docs/reference/api/runner/runners.md
@@ -0,0 +1,29 @@
+# Runners
+
+Core evaluation execution logic. The runner module provides two entry points:
+
+---
+
+## JSON Runner
+
+Reads an `eval_set.json` manifest (emitted by the Dart CLI) and calls `eval_set()`.
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.json_runner
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Args Runner
+
+Runs a single task directly from CLI arguments (`--task`, `--model`, `--dataset`).
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.args_runner
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/runner/sandboxes.md b/docs/reference/api/runner/sandboxes.md
new file mode 100644
index 0000000..1fb06a7
--- /dev/null
+++ b/docs/reference/api/runner/sandboxes.md
@@ -0,0 +1,25 @@
+# Sandboxes
+
+Sandbox environments for isolated task execution.
+
+---
+
+## Podman Sandbox
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.sandboxes.podman.podman
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Sandbox Provider
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.sandboxes.provider
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/runner/scorers.md b/docs/reference/api/runner/scorers.md
new file mode 100644
index 0000000..427c490
--- /dev/null
+++ b/docs/reference/api/runner/scorers.md
@@ -0,0 +1,102 @@
+# Scorers
+
+Scorer implementations for evaluating task outputs.
+
+---
+
+## Code Quality Scorer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.code_quality
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Dart Analyze Scorer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.dart_analyze
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Flutter Code Scorer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.flutter_code
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Flutter Test Scorer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.flutter_test
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Flutter Output Parser
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.flutter_output_parser
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Flutter Scoring Utilities
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.flutter_scoring
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## MCP Tool Usage Scorer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.mcp_tool_usage
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Export Workspace
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.export_workspace
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Skill Usage Scorer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.scorers.skill_usage
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/runner/solvers.md b/docs/reference/api/runner/solvers.md
new file mode 100644
index 0000000..269a761
--- /dev/null
+++ b/docs/reference/api/runner/solvers.md
@@ -0,0 +1,69 @@
+# Solvers
+
+Solver implementations for evaluation tasks.
+
+---
+
+## Add System Message
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.solvers.add_system_message
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Context Injector
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.solvers.context_injector
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Extract Code
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.solvers.extract_code
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Inject Test Files
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.solvers.inject_test_files
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Setup Workspace
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.solvers.setup_workspace
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Write to Sandbox
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.solvers.write_to_sandbox
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/runner/tasks.md b/docs/reference/api/runner/tasks.md
new file mode 100644
index 0000000..aa6131c
--- /dev/null
+++ b/docs/reference/api/runner/tasks.md
@@ -0,0 +1,82 @@
+# Tasks
+
+Task implementations for different evaluation types.
+
+---
+
+## Code Generation
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.code_gen
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Bug Fix
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.bug_fix
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Question Answer
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.question_answer
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## MCP Tool
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.mcp_tool
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Analyze Codebase
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.analyze_codebase
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Skill Test
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.skill_test
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+---
+
+## Task Helpers
+
+Shared utilities used across task implementations.
+
+```{eval-rst}
+.. automodule:: dash_evals.runner.tasks.task_helpers
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/utils/index.md b/docs/reference/api/utils/index.md
new file mode 100644
index 0000000..7044ed4
--- /dev/null
+++ b/docs/reference/api/utils/index.md
@@ -0,0 +1,10 @@
+# Utils Module
+
+Utility functions for dash_evals.
+
+```{toctree}
+:maxdepth: 1
+
+logging
+markdown
+```
diff --git a/docs/reference/api/utils/logging.md b/docs/reference/api/utils/logging.md
new file mode 100644
index 0000000..0509933
--- /dev/null
+++ b/docs/reference/api/utils/logging.md
@@ -0,0 +1,10 @@
+# Logging Utilities
+
+Logging configuration and utilities.
+
+```{eval-rst}
+.. automodule:: dash_evals.utils.logging
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/api/utils/markdown.md b/docs/reference/api/utils/markdown.md
new file mode 100644
index 0000000..fb72714
--- /dev/null
+++ b/docs/reference/api/utils/markdown.md
@@ -0,0 +1,10 @@
+# Markdown Utilities
+
+Markdown processing and formatting utilities.
+
+```{eval-rst}
+.. automodule:: dash_evals.utils.markdown
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
new file mode 100644
index 0000000..deddead
--- /dev/null
+++ b/docs/reference/cli.md
@@ -0,0 +1,121 @@
+# CLI usage
+
+The `devals` CLI is a Dart command-line tool for managing the evals dataset. It provides interactive commands for creating samples, tasks, and jobs, as well as running evaluations and viewing results.
+
+```bash
+cd packages/devals_cli
+dart pub get
+dart run bin/devals.dart --help
+```
+
+> [!TIP]
+> Run `devals create pipeline` for an interactive walkthrough that creates your first sample, task, and job.
+
+Key commands:
+
+| Command | Description |
+|---------|-------------|
+| `devals init` | Initialize a new dataset configuration in the current directory |
+| `devals doctor` | Check that all prerequisites are installed and configured |
+| `devals create pipeline` | Interactive guide to create a sample, task, and job in one go |
+| `devals create sample` | Create a new sample interactively |
+| `devals create task` | Create a new task directory with a starter `task.yaml` |
+| `devals create job` | Create a new job file |
+| `devals run <job_name>` | Run evaluations (wraps `run-evals`) |
+| `devals run --dry-run` | Preview what would be run without executing |
+| `devals view [log_dir_path]` | Launch the Inspect AI log viewer |
+
+---
+
+## Usage
+
+```
+CLI for managing evals - create samples, run evaluations, and view results.
+
+Usage: devals <command> [arguments]
+
+Global options:
+-h, --help Print this usage information.
+
+Available commands:
+ create Create samples, jobs, and tasks for the dataset.
+ job Create a new job file interactively.
+ pipeline Interactive guide to create a sample, task, and job in one go.
+ sample Create a new sample and set it up to run.
+ task Create a new task directory with a starter task.yaml.
+ doctor Check that all prerequisites are installed and configured.
+ init Initialize a new dataset configuration in the current directory.
+ run Run evaluations using the dash_evals package.
+ view Launch the Inspect AI viewer to view evaluation results.
+
+Run "devals help <command>" for more information about a command.
+```
+
+## Commands
+
+### `devals init`
+
+Initializes a new dataset configuration in the current directory. Creates:
+
+- `evals/tasks/get_started/task.yaml` — a starter task with an example sample
+- `evals/jobs/local_dev.yaml` — a default job for local development
+
+Use this when starting a new project that needs its own evaluation dataset.
+
+### `devals doctor`
+
+Checks that all prerequisites for the CLI, `dash_evals`, and `eval_explorer` are installed and correctly configured. Similar to `flutter doctor`, it verifies:
+
+- **Dart SDK** — required for the CLI itself
+- **Python 3.13+** — required for `dash_evals`
+- **dash_evals** (`run-evals`) — the Python evaluation package
+- **Podman** — container runtime for sandboxed execution
+- **Flutter SDK** — needed for Flutter-based eval tasks
+- **Serverpod** — needed for the `eval_explorer` app
+- **API Keys** — checks for `GOOGLE_API_KEY`, `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`
+
+### `devals create pipeline`
+
+An interactive walkthrough that guides you through creating your first sample, task, and job — ideal for first-time contributors.
+
+### `devals create sample`
+
+Interactively creates a new sample directory with a `sample.yaml` file. Prompts for:
+
+- Sample ID (snake_case identifier)
+- Difficulty level
+- Whether a workspace is needed
+- Workspace type (template, path, git, or create command)
+
+### `devals create task`
+
+Creates a new task directory under `tasks/` with a starter `task.yaml`. Prompts for:
+
+- Task ID
+- Task function (selected from the Python registry)
+- Optional system message
+
+### `devals create job`
+
+Creates a new job YAML file in `jobs/`. Prompts for:
+
+- Job name
+- Which models, variants, and tasks to include
+
+### `devals run <job_name>`
+
+Runs evaluations using the `dash_evals` package. Wraps the Python `run-evals` entry point.
+
+```bash
+devals run local_dev # Run a specific job
+devals run local_dev --dry-run # Preview without executing
+```
+
+### `devals view [log_path]`
+
+Launches the Inspect AI log viewer to browse evaluation results. If no path is given, defaults to the `logs/` directory in the dataset.
+
+```bash
+devals view # Auto-detect log directory
+devals view /path/to/logs # View specific log directory
+```
diff --git a/docs/reference/configuration_reference.md b/docs/reference/configuration_reference.md
new file mode 100644
index 0000000..deb2193
--- /dev/null
+++ b/docs/reference/configuration_reference.md
@@ -0,0 +1,558 @@
+# Configuration Reference
+
+This document describes the *standard* `eval/` directory structure and YAML configuration files used by the evaluation framework.
+
+## Overview
+
+The evaluation framework uses the `eval/` directory as its entry point. It contains:
+
+- Task definitions autodiscovered from `tasks/*/task.yaml`
+- Job files in `jobs/` that control what to run
+- Shared resources (context files, sandboxes, workspaces)
+
+Configuration is parsed and resolved by the Dart `dataset_config_dart` package, which produces an EvalSet JSON manifest consumed by the Python `dash_evals` package.
+
+## Directory Structure
+
+```
+eval/
+├── jobs/ # Job files for different run configurations
+│ ├── local_dev.yaml
+│ └── ci.yaml
+├── tasks/ # Task definitions (autodiscovered)
+│ ├── flutter_bug_fix/
+│ │ ├── task.yaml # Task config with inline samples
+│ │ └── project/ # Workspace files (if applicable)
+│ ├── dart_question_answer/
+│ │ └── task.yaml
+│ └── generate_flutter_app/
+│ ├── task.yaml
+│ └── todo_tests/ # Test files for a sample
+├── context_files/ # Context files injected into prompts
+│ └── flutter.md
+├── sandboxes/ # Container configurations
+│ └── podman/
+│ ├── Containerfile
+│ └── compose.yaml
+└── workspaces/ # Reusable project templates
+ ├── dart_package/
+ ├── flutter_app/
+ └── jaspr_app/
+```
+
+---
+
+## Task files
+
+Each subdirectory in `tasks/` that contains a `task.yaml` is automatically discovered as a task. The **directory name** is the task ID.
+
+```yaml
+# tasks/flutter_bug_fix/task.yaml
+func: flutter_bug_fix
+system_message: |
+ You are an expert Flutter developer. Fix the bug and explain your changes.
+
+# Task-level workspace (inherited by all samples)
+workspace:
+ path: ./project
+
+# Task-level tests (inherited by all samples)
+tests:
+ path: ./tests
+
+# Restrict which job-level variants apply to this task (optional)
+allowed_variants: [baseline, mcp_only]
+
+samples:
+ inline:
+ - id: flutter_bloc_cart_mutation_001
+ difficulty: medium
+ tags: [bloc, state]
+ input: |
+ Fix the bug where adding items to cart doesn't update the total.
+ target: |
+ The fix should modify the BLoC to emit a new state instead of mutating.
+
+ - id: navigation_crash
+ difficulty: hard
+ tags: [navigation]
+ workspace:
+ path: ./nav_project # Override task-level workspace
+ input: |
+ Fix the crash when navigating back from the detail screen.
+ target: |
+ The fix should handle the disposed controller properly.
+```
+
+### Task-Level Fields
+
+#### Core Fields
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `func` | string | Yes | Name of the `@task` function (resolved dynamically via `importlib`) |
+| `description` | string | No | Human-readable description |
+| `samples` | object | Yes | Samples config with `inline` and/or `paths` keys |
+| `allowed_variants` | list | No | Whitelist of variant names this task accepts (omit to accept all) |
+| `system_message` | string | No | Custom system prompt for this task |
+| `workspace` | object | No | Default workspace for all samples |
+| `tests` | object | No | Default test files for all samples |
+
+#### Inspect AI Task Parameters
+
+These map directly to [Inspect AI's `Task` constructor](https://inspect.aisi.org.uk/reference/inspect_ai.html#task). All are optional and override any `task_defaults` set in the job file.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `model` | string | Default model for this task (overrides the eval model) |
+| `config` | object | Model generation config (e.g., `{temperature: 0.2, max_tokens: 4096}`) |
+| `model_roles` | object | Named roles for use in `get_model()` |
+| `sandbox` | string/object | Sandbox environment type or `[type, config_path]` |
+| `approval` | string/object | Tool use approval policies |
+| `epochs` | int/object | Number of times to repeat each sample (optionally with score reducer) |
+| `fail_on_error` | number/bool | `true` = fail on first error, `0.0–1.0` = fail if proportion exceeds threshold |
+| `continue_on_fail` | bool | Continue running if `fail_on_error` condition is met |
+| `message_limit` | int | Max total messages per sample |
+| `token_limit` | int | Max total tokens per sample |
+| `time_limit` | int | Max clock time (seconds) per sample |
+| `working_limit` | int | Max working time (seconds) per sample (excludes wait time) |
+| `cost_limit` | float | Max cost (dollars) per sample |
+| `early_stopping` | string/object | Early stopping callbacks |
+| `display_name` | string | Task display name (e.g., for plotting) |
+| `version` | int | Version of task spec (to distinguish evolutions) |
+| `metadata` | object | Additional metadata to associate with the task |
+
+### Samples Object
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `inline` | list | Inline sample definitions |
+| `paths` | list | Glob patterns for external sample YAML files (relative to task dir) |
+
+### Sample Fields (inline in task.yaml)
+
+#### Core Fields
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `id` | string | Yes | Unique sample identifier |
+| `input` | string | Yes | The prompt given to the model |
+| `target` | string | Yes | Expected output or grading criteria |
+| `difficulty` | string | No | `easy`, `medium`, or `hard` |
+| `tags` | list | No | Categories for filtering |
+| `system_message` | string | No | Override system prompt for this sample |
+| `metadata` | object | No | Arbitrary metadata |
+| `workspace` | object | No | Override task-level workspace |
+| `tests` | object | No | Override task-level tests |
+
+#### Inspect AI Sample Parameters
+
+These map directly to [Inspect AI's `Sample`](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.html#sample).
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `choices` | list | Answer choices for multiple-choice evaluations |
+| `sandbox` | string/object | Override sandbox environment for this sample |
+| `files` | object | Files to copy into the sandbox (`{destination: source}`) |
+| `setup` | string | Setup script to run in the sandbox before evaluation |
+
+### Workspace/Tests References
+
+```yaml
+# Reference a reusable template
+workspace:
+ template: flutter_app
+
+# Reference a path relative to task directory
+workspace:
+ path: ./project
+
+# Clone from git
+workspace:
+ git: https://github.com/example/repo.git
+
+# Shorthand (equivalent to path:)
+workspace: ./project
+```
+
+> [!NOTE]
+> Paths in `workspace` and `tests` are resolved **relative to the task directory** (e.g., `tasks/flutter_bug_fix/`).
+
+---
+
+## Sample files
+
+A sample is a single test case containing an input prompt, expected output (grading target), and optional configuration. Samples are defined inline in `task.yaml` or in external YAML files referenced via `paths`.
+
+```yaml
+# Inline in task.yaml
+samples:
+ inline:
+ - id: dart_async_await_001
+ difficulty: medium
+ tags: [async, dart]
+ input: |
+ Explain the difference between Future.then() and async/await in Dart.
+ target: |
+ The answer should cover both approaches, explain that they are
+ functionally equivalent, and note when each is preferred.
+ metadata:
+ added: 2025-02-04
+ category: language_fundamentals
+```
+
+---
+
+### Core Fields
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `id` | string | Yes | Unique sample identifier |
+| `input` | string | Yes | The prompt given to the model |
+| `target` | string | Yes | Expected output or grading criteria |
+| `difficulty` | string | No | `easy`, `medium`, or `hard` |
+| `tags` | list | No | Categories for filtering |
+| `system_message` | string | No | Override system prompt for this sample |
+| `metadata` | object | No | Arbitrary metadata |
+| `workspace` | object | No | Override task-level workspace |
+| `tests` | object | No | Override task-level tests |
+
+---
+
+### Inspect AI Sample Parameters
+
+These map directly to [Inspect AI's `Sample`](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.html#sample).
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `choices` | list | Answer choices for multiple-choice evaluations |
+| `sandbox` | string/object | Override sandbox environment for this sample |
+| `files` | object | Files to copy into the sandbox (`{destination: source}`) |
+| `setup` | string | Setup script to run in the sandbox before evaluation |
+
+### Multiple Choice Example
+
+```yaml
+- id: dart_null_safety_quiz
+ input: "Which of the following is NOT a valid way to handle null in Dart 3?"
+ target: C
+ choices:
+ - "Use the null-aware operator ?."
+ - "Use a null check with if (x != null)"
+ - "Use the ! operator on every nullable variable"
+ - "Use late initialization"
+```
+
+### Sandbox Files Example
+
+```yaml
+- id: flutter_fix_counter
+ input: "Fix the bug in the counter app."
+ target: "The fix should update the state correctly."
+ sandbox: docker
+ files:
+ /workspace/lib/main.dart: ./fixtures/broken_counter.dart
+ /workspace/test/widget_test.dart: ./fixtures/counter_test.dart
+ setup: "cd /workspace && flutter pub get"
+```
+
+---
+
+### Workspace & Tests References
+
+Workspaces and test paths can be specified at task level (inherited by all samples) or per-sample (overrides task level).
+
+```yaml
+# Reference a reusable template
+workspace:
+ template: flutter_app
+
+# Reference a path relative to task directory
+workspace:
+ path: ./project
+
+# Clone from git
+workspace:
+ git: https://github.com/example/repo.git
+
+# Shorthand (equivalent to path:)
+workspace: ./project
+```
+
+> [!NOTE]
+> Paths in `workspace` and `tests` are resolved **relative to the task directory** (e.g., `tasks/flutter_bug_fix/`).
+
+
+---
+
+## Job files
+
+Job files define **what to run** and can **override built-in runtime defaults**. They're selected via `devals run `. Multiple jobs can be run sequentially.
+
+```yaml
+# jobs/local_dev.yaml
+name: local_dev
+
+# Override runtime defaults
+sandbox_type: podman
+max_connections: 15
+max_retries: 10
+
+# Save the agent's final workspace output to logs/<run_id>/examples/
+# save_examples: true
+
+# Filter what to run (optional - omit to run all)
+models:
+ - google/gemini-2.5-flash
+
+# Variants are defined as a named map.
+# Each key is a variant name; the value is the variant configuration.
+variants:
+ baseline: {}
+ context_only: { context_files: [./context_files/flutter.md] }
+ mcp_only: { mcp_servers: [dart] }
+ full: { context_files: [./context_files/flutter.md], mcp_servers: [dart] }
+
+# Inspect AI eval_set() parameters (all optional)
+retry_attempts: 20
+fail_on_error: 0.05
+log_level: info
+tags: [nightly]
+
+# Default Task-level overrides applied to every task
+task_defaults:
+ time_limit: 600
+ message_limit: 50
+
+# Additional eval_set() parameters not covered above
+# eval_set_overrides:
+# bundle_dir: ./bundle
+# log_images: true
+```
+
+
+### Core Job Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `logs_dir` | string | Override logs directory (default: `../logs`) |
+| `sandbox_type` | string | Sandbox type: `local`, `docker`, or `podman` (default: `local`) |
+| `max_connections` | int | Max concurrent API connections (default: `10`) |
+| `max_retries` | int | Max retry attempts for failed samples (default: `3`) |
+| `save_examples` | bool | If `true`, copies the agent's final workspace to `<logs_dir>/<run_id>/examples/` after each sample. (default: `false`) |
+| `models` | list | Filter to specific models — omit to run all |
+| `variants` | map | Named variant definitions (see Variants section) — omit to run all defined in task files |
+| `tasks` | object | Task discovery and overrides (see below) |
+
+### Inspect AI eval_set() Parameters
+
+All [Inspect AI `eval_set()` parameters](https://inspect.aisi.org.uk/reference/inspect_ai.html#eval_set) are available as top-level keys in the job file. These control retry behavior, concurrency, logging, and more.
+
+#### Retry & Error Handling
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `retry_attempts` | int | `10` | Max retry attempts before giving up |
+| `retry_wait` | float | `60` | Seconds between retries (exponential backoff) |
+| `retry_connections` | float | `0.5` | Reduce max_connections at this rate per retry |
+| `retry_cleanup` | bool | `true` | Cleanup failed log files after retries |
+| `retry_on_error` | int | — | Retry samples on error (per-sample) |
+| `fail_on_error` | float | `0.05` | Fail if error proportion exceeds threshold |
+| `continue_on_fail` | bool | — | Continue running even if fail_on_error is met |
+| `debug_errors` | bool | `false` | Raise task errors for debugging |
+
+#### Concurrency
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `max_samples` | int | `max_connections` | Max concurrent samples per task |
+| `max_tasks` | int | `max(4, models)` | Max tasks to run in parallel |
+| `max_subprocesses` | int | `cpu_count` | Max subprocesses in parallel |
+| `max_sandboxes` | int | — | Max sandboxes per-provider in parallel |
+
+#### Logging
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `log_level` | string | `info` | Console log level (`debug`, `info`, `warning`, `error`) |
+| `log_level_transcript` | string | `info` | Log file level |
+| `log_format` | string | `json` | Log format (`eval` or `json`) |
+| `log_samples` | bool | `true` | Log detailed samples and scores |
+| `log_realtime` | bool | `true` | Log events in realtime |
+| `log_images` | bool | `false` | Log base64-encoded images |
+| `log_buffer` | int | — | Samples to buffer before log write |
+| `log_shared` | int | — | Sync sample events for realtime viewing |
+| `log_dir_allow_dirty` | bool | `false` | Allow log dir with unrelated logs |
+
+#### Model Configuration
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `model_base_url` | string | Base URL for the model API |
+| `model_args` | object | Model creation arguments |
+| `model_roles` | object | Named roles for `get_model()` |
+| `task_args` | object | Task creation arguments |
+| `model_cost_config` | object | Model prices for cost tracking |
+
+#### Sample Control
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `limit` | int/list | Limit samples (count or `[start, end]` range) |
+| `sample_id` | string/list | Evaluate specific sample(s) |
+| `sample_shuffle` | bool/int | Shuffle samples (pass seed for deterministic order) |
+| `epochs` | int/object | Repeat samples and optional score reducer |
+
+#### Limits (Applied to All Samples)
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `message_limit` | int | Max messages per sample |
+| `token_limit` | int | Max tokens per sample |
+| `time_limit` | int | Max clock time (seconds) per sample |
+| `working_limit` | int | Max working time (seconds) per sample |
+| `cost_limit` | float | Max cost (dollars) per sample |
+
+#### Miscellaneous
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `tags` | list | Tags for this evaluation run |
+| `metadata` | object | Metadata for this evaluation run |
+| `trace` | bool | Trace model interactions to terminal |
+| `display` | string | Task display type (default: `full`) |
+| `score` | bool | Score output (default: `true`) |
+| `approval` | string/object | Tool use approval policies |
+| `solver` | string/object | Alternative solver(s) |
+| `sandbox_cleanup` | bool | Cleanup sandbox after task (default: `true`) |
+| `bundle_dir` | string | Directory for bundled logs + viewer |
+| `bundle_overwrite` | bool | Overwrite files in bundle_dir |
+| `eval_set_id` | string | Custom ID for the eval set |
+
+### Pass-Through Sections
+
+#### `task_defaults`
+
+Default [Task parameters](#inspect-ai-task-parameters) applied to **every task** in this job. Per-task overrides from `task.yaml` take precedence.
+
+```yaml
+task_defaults:
+ time_limit: 600
+ message_limit: 50
+ cost_limit: 2.0
+ epochs: 3
+```
+
+#### `eval_set_overrides`
+
+Arbitrary `eval_set()` kwargs for parameters not covered by the named fields above. Top-level fields take precedence over overrides.
+
+```yaml
+eval_set_overrides:
+ bundle_dir: ./bundle
+ log_images: true
+```
+
+### Tasks Object
+
+```yaml
+tasks:
+ # Discover tasks via glob patterns (relative to dataset root)
+ paths: [tasks/*]
+ # Per-task overrides (keys must match directory names in tasks/)
+ inline:
+ flutter_bug_fix:
+ allowed_variants: [baseline] # Override variants for this task
+ include-samples: [sample_001] # Only run these samples
+ exclude-samples: [slow_test] # Exclude these samples
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `paths` | list | Glob patterns for discovering task directories |
+| `inline` | object | Per-task configuration overrides |
+
+---
+
+## Variants
+
+Variants modify how tasks execute, controlling context injection, tool availability, and skill access. Variants are defined as **named maps** in job files.
+
+```yaml
+variants:
+ baseline: {}
+ context_only: { context_files: [./context_files/flutter.md] }
+ mcp_only: { mcp_servers: [dart] }
+ full: { context_files: [./context_files/flutter.md], mcp_servers: [dart] }
+```
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `context_files` | list | `[]` | Paths or glob patterns to context files (relative to task dir) |
+| `skills` | list | `[]` | Paths or glob patterns to skill directories (relative to task dir) |
+| `mcp_servers` | list | `[]` | MCP server identifiers |
+
+Tasks can optionally restrict which variants apply to them via `allowed_variants` in their `task.yaml`:
+
+```yaml
+# task.yaml — only run baseline and mcp_only variants for this task
+allowed_variants: [baseline, mcp_only]
+```
+
+Glob patterns (containing `*`, `?`, or `[`) are expanded automatically. For skills, only directories containing `SKILL.md` are included.
+
+> [!IMPORTANT]
+> The `skills` feature requires a sandbox (docker/podman). Skill directories are copied into the sandbox filesystem by Inspect AI's built-in `skill()` tool. Each skill directory must contain a `SKILL.md` file.
+
+---
+
+## Context Files
+
+Markdown files with YAML frontmatter providing additional context to the model.
+
+```markdown
+---
+title: "AI Rules for Flutter"
+version: "1.0.0"
+description: "Recommended patterns and best practices"
+dart_version: "3.10.0"
+flutter_version: "3.24.0"
+updated: "2025-12-24"
+---
+
+## Flutter Best Practices
+
+Content here is injected into the model's context when the variant
+has context_files pointing to this file.
+```
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `title` | string | Yes | Context file title |
+| `version` | string | Yes | Version identifier |
+| `description` | string | Yes | Brief description |
+| `dart_version` | string | No | Target Dart version |
+| `flutter_version` | string | No | Target Flutter version |
+| `updated` | string | No | Last update date |
+
+---
+
+## CLI Usage
+
+```bash
+# Run a specific job
+devals run local_dev
+devals run ci
+
+# Dry run — validate config without executing
+devals run local_dev --dry-run
+
+# Create a new task
+devals create task
+
+# Add a sample to an existing task
+devals create sample
+
+# Initialize a new dataset
+devals init
+```
diff --git a/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md b/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md
new file mode 100644
index 0000000..460a6a4
--- /dev/null
+++ b/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md
@@ -0,0 +1,1750 @@
+# dataset_config_dart
+
+Core library for resolving eval dataset YAML into EvalSet JSON.
+
+This package contains the business logic for:
+- Parsing task and job YAML files (or pre-parsed JSON maps)
+- Resolving configs (models, sandboxes, variants)
+- Writing EvalSet JSON for the Python runner
+
+It is frontend-agnostic — both the CLI and a future web interface
+can use this library.
+
+## Quick start
+
+Use [ConfigResolver] for a single-call convenience facade:
+
+```dart
+final resolver = ConfigResolver();
+final configs = resolver.resolve(datasetPath, ['my_job']);
+```
+
+## Layered API
+
+For finer-grained control, use the individual layers:
+
+1. **Parsers** — [YamlParser], [JsonParser]
+2. **Resolvers** — [EvalSetResolver]
+3. **Writers** — [EvalSetWriter]
+
+---
+
+## abstract class `ChatCompletionChoice`
+
+**Mixins:** `_$ChatCompletionChoice`
+
+Choice generated for completion.
+
+### Constructors
+
+#### `ChatCompletionChoice`
+
+```dart
+ChatCompletionChoice({required ChatMessageAssistant message, String stopReason, Logprobs? logprobs})
+```
+
+Creates a chat completion choice.
+
+#### `ChatCompletionChoice.fromJson`
+
+```dart
+ChatCompletionChoice.fromJson(Map json)
+```
+
+---
+
+## abstract class `ChatMessage`
+
+**Mixins:** `_$ChatMessage`
+
+Chat message.
+
+### Constructors
+
+#### `ChatMessage.system`
+
+```dart
+ChatMessage.system({String? id, required Object content, String? source, Map? metadata, String role})
+```
+
+System chat message.
+
+#### `ChatMessage.user`
+
+```dart
+ChatMessage.user({String? id, required Object content, String? source, Map? metadata, String role, Object? toolCallId})
+```
+
+User chat message.
+
+#### `ChatMessage.assistant`
+
+```dart
+ChatMessage.assistant({String? id, required Object content, String? source, Map? metadata, String role, List? toolCalls, String? model})
+```
+
+Assistant chat message.
+
+#### `ChatMessage.tool`
+
+```dart
+ChatMessage.tool({String? id, required Object content, String? source, Map? metadata, String role, String? toolCallId, String? function, ToolCallError? error})
+```
+
+Tool chat message.
+
+#### `ChatMessage.fromJson`
+
+```dart
+ChatMessage.fromJson(Map json)
+```
+
+---
+
+## class `ConfigException`
+
+**Implements:** `Exception`
+
+Exception thrown when runner config resolution fails.
+
+This is the library-level exception for the dataset_config_dart package.
+CLI or web frontends can catch this and present the error appropriately.
+
+### Constructors
+
+#### `ConfigException`
+
+```dart
+ConfigException(String message)
+```
+
+### Properties
+
+- **`message`** → `String` *(final)*
+
+---
+
+## class `ConfigResolver`
+
+Convenience facade that composes Parser → Resolver into a single call.
+
+For finer-grained control, use [YamlParser], [JsonParser],
+and [EvalSetResolver] directly.
+
+### Constructors
+
+#### `ConfigResolver`
+
+```dart
+ConfigResolver()
+```
+
+### Methods
+
+#### `resolve`
+
+```dart
+List resolve(String datasetPath, List jobNames)
+```
+
+Resolve dataset + job(s) into [EvalSet] objects.
+
+[datasetPath] is the root directory containing `tasks/` and `jobs/`.
+[jobNames] are the job names (looked up in `jobs/`) or paths.
+
+**Parameters:**
+
+- `datasetPath` (`String`) *(required)*
+- `jobNames` (`List`) *(required)*
+
+---
+
+## abstract class `Content`
+
+**Mixins:** `_$Content`
+
+Content sent to or received from a model.
+
+### Constructors
+
+#### `Content.text`
+
+```dart
+Content.text({required String text, bool refusal, List