Skip to content

Commit 58f5f84

Browse files
authored
Merge pull request #9 from flutter/initial-runner-migration
Add python packages that run evals
2 parents 08951e5 + 3be68eb commit 58f5f84

137 files changed

Lines changed: 7087 additions & 43 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ name: Config Tests
33
on:
44
pull_request:
55
paths:
6-
- 'packages/dataset_config/**'
7-
- '.github/workflows/config_tests.yml'
6+
- 'packages/dataset_config_dart/**'
7+
- '.github/workflows/config_dart_tests.yml'
88
push:
99
branches:
1010
- main
1111
paths:
12-
- 'packages/dataset_config/**'
13-
- '.github/workflows/config_tests.yml'
12+
- 'packages/dataset_config_dart/**'
13+
- '.github/workflows/config_dart_tests.yml'
1414

1515
jobs:
1616
config-tests:
@@ -31,9 +31,9 @@ jobs:
3131
run: flutter pub get
3232

3333
- name: Analyze
34-
working-directory: packages/dataset_config
34+
working-directory: packages/dataset_config_dart
3535
run: dart analyze --fatal-infos
3636

3737
- name: Run tests
38-
working-directory: packages/dataset_config
38+
working-directory: packages/dataset_config_dart
3939
run: dart test
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
name: Config Parity
2+
3+
on:
4+
pull_request:
5+
paths:
6+
- 'packages/dataset_config_dart/**'
7+
- 'packages/dataset_config_python/**'
8+
- 'tool/config_parity/**'
9+
- '.github/workflows/config_parity.yml'
10+
push:
11+
branches:
12+
- main
13+
paths:
14+
- 'packages/dataset_config_dart/**'
15+
- 'packages/dataset_config_python/**'
16+
- 'tool/config_parity/**'
17+
- '.github/workflows/config_parity.yml'
18+
19+
jobs:
20+
config-parity:
21+
runs-on: ubuntu-latest
22+
timeout-minutes: 10
23+
24+
steps:
25+
- name: Checkout repository
26+
uses: actions/checkout@v6
27+
28+
- name: Install Flutter
29+
run: |
30+
git clone https://github.com/flutter/flutter.git --depth 1 -b stable $HOME/flutter
31+
echo "$HOME/flutter/bin" >> $GITHUB_PATH
32+
echo "$HOME/.pub-cache/bin" >> $GITHUB_PATH
33+
34+
- name: Install Dart dependencies
35+
run: flutter pub get
36+
37+
- name: Set up Python
38+
uses: actions/setup-python@v6
39+
with:
40+
python-version: '3.13'
41+
42+
- name: Install Python config package
43+
run: pip install -e packages/dataset_config_python
44+
45+
- name: Verify config parity
46+
      run: dart run tool/config_parity/bin/config_parity.dart
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
name: dash_evals module - Python tests
2+
3+
on:
4+
pull_request:
5+
paths:
6+
- 'packages/dash_evals/**'
7+
- '.github/workflows/dash_evals_module_tests.yml'
8+
push:
9+
branches:
10+
- main
11+
paths:
12+
- 'packages/dash_evals/**'
13+
- '.github/workflows/dash_evals_module_tests.yml'
14+
15+
jobs:
16+
runner-tests:
17+
runs-on: ubuntu-latest
18+
timeout-minutes: 15
19+
20+
steps:
21+
- name: Checkout repository
22+
uses: actions/checkout@v6
23+
24+
- name: Set up Python
25+
uses: actions/setup-python@v6
26+
with:
27+
python-version: '3.13'
28+
29+
- name: Create virtual environment
30+
working-directory: packages/dash_evals
31+
run: python -m venv .venv
32+
33+
- name: Install dependencies
34+
working-directory: packages/dash_evals
35+
run: |
36+
source .venv/bin/activate
37+
pip install --upgrade pip
38+
pip install -e ".[dev]"
39+
40+
- name: Run tests
41+
working-directory: packages/dash_evals
42+
run: |
43+
source .venv/bin/activate
44+
pytest -v

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ coverage
1515
/docs/_build
1616
/docs/dart_docs
1717
logs/
18+
**/pyrefly.toml
1819

1920

2021
##

packages/dash_evals/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# dash_evals
2+
3+
Python package for running LLM evaluations on Dart and Flutter tasks using [Inspect AI](https://inspect.aisi.org.uk/).

packages/dash_evals/pyproject.toml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
[project]
2+
name = "dash-evals"
3+
version = "0.1.0"
4+
description = ""
5+
authors = [{ name = "Eric Windmill", email = "eric@ericwindmill.com" }]
6+
readme = "README.md"
7+
requires-python = ">=3.13,<4.0.0"
8+
dependencies = [
9+
"inspect-ai>=0.3.142,<0.4.0",
10+
"pyyaml>=6.0.3,<7.0.0",
11+
"google-genai>=1.47.0,<2.0.0",
12+
"mcp>=1.20.0,<2.0.0",
13+
"python-dotenv>=1.2.1,<2.0.0",
14+
"anthropic>=0.75.0,<0.81.0",
15+
"openai>=2.8.1,<3.0.0",
16+
"firebase-admin>=6.0.0,<8.0.0",
17+
"pydantic>=2.0.0,<3.0.0",
18+
]
19+
20+
[project.optional-dependencies]
21+
dev = [
22+
"pytest>=8.0.0",
23+
"pytest-mock>=3.12.0",
24+
"pytest-cov>=4.1.0",
25+
"pylint>=3.0.0",
26+
]
27+
28+
[project.scripts]
29+
run-evals = "dash_evals.main:main"
30+
31+
[build-system]
32+
requires = ["setuptools>=61.0"]
33+
build-backend = "setuptools.build_meta"
34+
35+
# Register podman sandbox with inspect_ai
36+
[project.entry-points.inspect_ai]
37+
dash_evals = "dash_evals.runner.sandboxes"
38+
39+
[tool.setuptools.packages.find]
40+
where = ["src"]
41+
42+
[tool.setuptools.package-data]
43+
dash_evals = ["data/*.yaml"]
44+
45+
[tool.pytest.ini_options]
46+
testpaths = ["tests"]
47+
python_files = ["test_*.py"]
48+
python_classes = ["Test*"]
49+
python_functions = ["test_*"]
50+
51+
[tool.coverage.run]
52+
omit = [
53+
"src/dash_evals/main.py",
54+
"src/dash_evals/uploader.py",
55+
"src/dash_evals/uploader_aggregates.py",
56+
"src/dash_evals/tasks/*",
57+
]
58+
59+
[tool.pylint.messages_control]
60+
disable = [
61+
"logging-fstring-interpolation", # Allow f-strings in logging (modern Python standard)
62+
]
63+
64+
[tool.pylint.format]
65+
max-line-length = 100
66+
67+
[tool.ruff]
68+
line-length = 100
69+
70+
[tool.ruff.lint]
71+
select = ["E", "F", "W", "I"]
72+
ignore = ["E501"] # Line too long (handled by formatter)

packages/dash_evals/pyrefly.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Pyrefly configuration
2+
# Tell Pyrefly to use the repo-root venv Python interpreter
3+
4+
python-interpreter = "../../.venv/bin/python"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""dash_evals - Evaluation framework for Dart and Flutter AI assistants.
2+
3+
This package provides tools for running evaluations using Inspect AI
4+
to measure model performance on Dart/Flutter tasks.
5+
6+
Configuration is resolved by the Dart CLI (devals) and emitted as JSONL
7+
datasets + a run manifest. The Python package reads the manifest and
8+
calls eval_set() directly.
9+
10+
Main entry point:
11+
    run-evals --json <path-to-eval_set.json>
12+
"""
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Copyright 2025 The Flutter Authors. All rights reserved.
2+
# Use of this source code is governed by a BSD-style license that can be
3+
# found in the LICENSE file.
4+
5+
"""CLI entry point for running evaluations.
6+
7+
Usage:
8+
run-evals --json ./eval_set.json
9+
run-evals --task my_task --model openai/gpt-4o --dataset samples.jsonl
10+
"""
11+
12+
import argparse
13+
import logging
14+
import sys
15+
from pathlib import Path
16+
17+
from dotenv import load_dotenv
18+
19+
# Import sandbox environments to register them with InspectAI
20+
# The @sandboxenv decorator registers the sandbox type when the module is imported
21+
import dash_evals.runner.sandboxes.podman.podman # noqa: F401 # Registers 'podman'
22+
from dash_evals.runner.args_runner import _run_from_args
23+
from dash_evals.runner.json_runner import run_from_json
24+
25+
# Basic console logger for early startup messages
26+
logging.basicConfig(level=logging.INFO, format="%(message)s")
27+
_startup_logger = logging.getLogger("startup")
28+
29+
30+
def main():
    """Parse command-line arguments and run evaluations.

    Loads environment variables from a repo-root .env file, builds the
    CLI parser, enforces that exactly one invocation mode was chosen
    (--json vs. direct --task/--model/--dataset args), then dispatches
    to the matching runner. Exits 0 on success, 1 on any failure.
    """
    # Load .env from the repo root (walks up from cwd).
    # This populates os.environ with API keys, credentials, etc.
    # System env vars take precedence over .env values (python-dotenv default).
    load_dotenv(override=False)

    parser = argparse.ArgumentParser(
        description="Run Inspect AI evaluations for the Dart/Flutter plugin.",
        epilog="Example: run-evals --json ./eval_set.json",
    )

    # Each entry: (flag, add_argument keyword options). Registration order
    # matters only for --help output and is kept identical to the original.
    arg_specs = (
        # ---------- JSON mode (mutually exclusive with direct args) ----------
        ("--json", dict(
            type=Path,
            help="Path to eval_set.json (emitted by Dart CLI).",
        )),
        # ---------- Direct-args mode ----------
        ("--task", dict(
            type=str,
            help="Task function name (e.g. 'flutter_code_gen' or dotted path).",
        )),
        ("--model", dict(
            type=str,
            action="append",
            help="Model to evaluate (can be repeated). Example: openai/gpt-4o",
        )),
        ("--dataset", dict(
            type=Path,
            help="Path to a dataset file (JSON/JSONL/CSV).",
        )),
        ("--log-dir", dict(
            type=Path,
            help="Directory to write evaluation logs.",
        )),
        ("--sandbox", dict(
            type=str,
            nargs=2,
            metavar=("TYPE", "CONFIG"),
            help="Sandbox type and config path. Example: podman compose.yaml",
        )),
        ("--max-connections", dict(
            type=int,
            help="Maximum concurrent model connections.",
        )),
        ("--max-samples", dict(
            type=int,
            help="Maximum concurrent samples per task.",
        )),
        ("--fail-on-error", dict(
            type=float,
            help="Proportion of sample errors to tolerate (0.0-1.0).",
        )),
    )
    for flag, options in arg_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # Ensure either --json or direct args are provided, but not both.
    used_direct_args = bool(args.task or args.model or args.dataset)
    if args.json and used_direct_args:
        parser.error(
            "Cannot combine --json with --task/--model/--dataset. Use one mode or the other."
        )
    if not args.json and not used_direct_args:
        parser.error("Provide either --json or at least --task and --model.")

    try:
        if args.json:
            has_failures = run_from_json(args.json)
        else:
            has_failures = _run_from_args(args)
    except Exception as e:
        _startup_logger.error(f"Failed to run evaluation: {e}")
        sys.exit(1)

    # Runners report a boolean "any sample failed" result; map it to the
    # process exit code so CI can gate on it.
    sys.exit(1 if has_failures else 0)


if __name__ == "__main__":
    main()
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Runner module for executing evaluations.
2+
3+
This module contains the core evaluation logic including:
4+
- Task definitions and registry
5+
- Solvers for setting up workspaces
6+
- Scorers for evaluating model outputs
7+
"""

0 commit comments

Comments
 (0)