diff --git a/apps/cli/src/commands/init/index.ts b/apps/cli/src/commands/init/index.ts index f9d6c7790..eb2149e3b 100644 --- a/apps/cli/src/commands/init/index.ts +++ b/apps/cli/src/commands/init/index.ts @@ -11,9 +11,11 @@ export interface InitCommandOptions { function printSkillFirstInstructions(): void { console.log('\nAI-skills-first setup (recommended):'); + console.log(' agentv skills get agentv-bench'); + console.log(' Then ask your agent: "Set up AgentV in this repo."'); + console.log('\nFor Claude Code users, the agentv-dev plugin also provides skill discovery:'); console.log(' npx allagents plugin marketplace add EntityProcess/agentv'); console.log(' npx allagents plugin install agentv-dev@agentv'); - console.log(' Then ask your agent: "Set up AgentV in this repo."'); } async function promptYesNo(message: string): Promise { diff --git a/apps/cli/src/commands/skills/index.ts b/apps/cli/src/commands/skills/index.ts new file mode 100644 index 000000000..38b1a14ce --- /dev/null +++ b/apps/cli/src/commands/skills/index.ts @@ -0,0 +1,377 @@ +/** + * `agentv skills` — serve bundled skill content from inside the CLI tarball. + * + * Skills are bundled into `dist/skills//` at build time (see tsup.config.ts). + * This ensures skill content always matches the installed CLI version — no drift possible. + * + * Subcommands: + * list — print skill names (one per line, or JSON with --json) + * get — print SKILL.md content + * get --full — also include references/, templates/, agents/ + * get --ref — print one reference file (searches references/, templates/, agents/, then skill root) + * get --all — get all skills + * get --all — get all skills + * path [] — print resolved path to skills dir or specific skill dir + * + * Resolution: walk from this module's file upward to find `dist/skills/` or `skills/` + * that contains actual skill content (validated by presence of SKILL.md files). + * Production npm install: binary at dist/cli.js → dist/skills/ is a sibling. 
+ * Source run (bun src/cli.ts): walks up to apps/cli/ where dist/skills/ lives. + * + * JSON output (--json) schema: + * { success: true, data: [{ name: string, content: string, files?: Record }] } + * { success: false, error: string } + */ + +import { existsSync, readFileSync, readdirSync } from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { command, flag, option, optional, positional, string, subcommands } from 'cmd-ts'; + +// ── Resolution ──────────────────────────────────────────────────────────────── + +/** A valid skills dir contains at least one subdirectory with a SKILL.md file. */ +function isValidSkillsDir(dir: string): boolean { + if (!existsSync(dir)) return false; + try { + return readdirSync(dir, { withFileTypes: true }).some( + (e) => e.isDirectory() && existsSync(path.join(dir, e.name, 'SKILL.md')), + ); + } catch { + return false; + } +} + +/** + * Walk from the directory containing this module's source file up to find + * a directory that contains actual skill content. In priority order at + * each ancestor level: + * 1. `dist/skills/` — production npm install (binary at dist/cli.js, + * skills are a sibling) and post-build dev runs. + * 2. `skills-data/` — repo-root source layout (mirrors agent-browser's + * top-level `skill-data/`); used when running from TypeScript source + * without a build. + * 3. `skills/` — legacy in-package location, retained for backward + * compatibility with any downstream consumer that still bundles + * this module without the dist copy step. 
+ */ +function findSkillsDir(): string | null { + const selfFile = fileURLToPath(import.meta.url); + let dir = path.dirname(selfFile); + for (let i = 0; i < 6; i++) { + const distCandidate = path.join(dir, 'dist', 'skills'); + if (isValidSkillsDir(distCandidate)) return distCandidate; + const repoRootCandidate = path.join(dir, 'skills-data'); + if (isValidSkillsDir(repoRootCandidate)) return repoRootCandidate; + const legacyCandidate = path.join(dir, 'skills'); + if (isValidSkillsDir(legacyCandidate)) return legacyCandidate; + dir = path.dirname(dir); + } + return null; +} + +function requireSkillsDir(): string { + const dir = findSkillsDir(); + if (!dir) { + console.error( + 'Error: bundled skills directory not found. This is a build issue — please reinstall agentv.', + ); + process.exit(1); + } + return dir; +} + +// ── Skill reading ───────────────────────────────────────────────────────────── + +interface SkillData { + name: string; + content: string; + files?: Record; +} + +function listSkillNames(skillsDir: string): string[] { + if (!existsSync(skillsDir)) return []; + return readdirSync(skillsDir, { withFileTypes: true }) + .filter((e) => e.isDirectory()) + .map((e) => e.name) + .sort(); +} + +function readSkillFile(skillDir: string, relPath: string): string | null { + const full = path.join(skillDir, relPath); + if (!existsSync(full)) return null; + return readFileSync(full, 'utf-8'); +} + +/** + * Recursively collect all files under a subdirectory. + * Returns a map of relative paths → contents. + */ +function collectDir(dir: string, prefix = ''): Record { + const result: Record = {}; + if (!existsSync(dir)) return result; + for (const entry of readdirSync(dir, { withFileTypes: true })) { + const relPath = prefix ? 
`${prefix}/${entry.name}` : entry.name; + if (entry.isDirectory()) { + Object.assign(result, collectDir(path.join(dir, entry.name), relPath)); + } else { + result[relPath] = readFileSync(path.join(dir, entry.name), 'utf-8'); + } + } + return result; +} + +function readSkill(skillsDir: string, name: string, full: boolean): SkillData | null { + const skillDir = path.join(skillsDir, name); + if (!existsSync(skillDir)) return null; + + const content = readSkillFile(skillDir, 'SKILL.md'); + if (content === null) return null; + + if (!full) return { name, content }; + + // Collect extra directories: references/, templates/, agents/ + const files: Record = {}; + for (const sub of ['references', 'templates', 'agents']) { + const subDir = path.join(skillDir, sub); + const collected = collectDir(subDir, sub); + Object.assign(files, collected); + } + return { name, content, files: Object.keys(files).length > 0 ? files : undefined }; +} + +/** + * Find a single reference file by name within a skill. + * + * Search order: references/, templates/, agents/, then the skill root for + * a bare filename. The name may include or omit the `.md` extension — we + * try the literal name first, then with `.md` appended, so callers can + * write `--ref eval-yaml-spec` instead of `--ref eval-yaml-spec.md`. + */ +function findRefFile( + skillDir: string, + refName: string, +): { relPath: string; content: string } | null { + const candidates = refName.endsWith('.md') ? [refName] : [refName, `${refName}.md`]; + for (const sub of ['references', 'templates', 'agents']) { + for (const candidate of candidates) { + const filePath = path.join(skillDir, sub, candidate); + if (existsSync(filePath)) { + return { relPath: `${sub}/${candidate}`, content: readFileSync(filePath, 'utf-8') }; + } + } + } + // Bare name in the skill root (e.g. 
LICENSE.txt) + for (const candidate of candidates) { + const filePath = path.join(skillDir, candidate); + if (existsSync(filePath)) { + return { relPath: candidate, content: readFileSync(filePath, 'utf-8') }; + } + } + return null; +} + +/** + * List ref-discoverable filenames inside a skill (used to print a useful + * error when a `--ref` lookup misses). + */ +function listRefFiles(skillDir: string): string[] { + const out: string[] = []; + for (const sub of ['references', 'templates', 'agents']) { + const subDir = path.join(skillDir, sub); + if (!existsSync(subDir)) continue; + for (const entry of readdirSync(subDir, { withFileTypes: true })) { + if (entry.isFile()) out.push(`${sub}/${entry.name}`); + } + } + return out.sort(); +} + +// ── Output helpers ──────────────────────────────────────────────────────────── + +function printSkill(skill: SkillData, json: boolean): void { + if (json) { + process.stdout.write(`${JSON.stringify({ success: true, data: [skill] })}\n`); + return; + } + process.stdout.write(skill.content); + if (!skill.content.endsWith('\n')) process.stdout.write('\n'); + if (skill.files) { + for (const [relPath, content] of Object.entries(skill.files)) { + process.stdout.write(`\n--- ${relPath} ---\n`); + process.stdout.write(content); + if (!content.endsWith('\n')) process.stdout.write('\n'); + } + } +} + +// ── Subcommands ─────────────────────────────────────────────────────────────── + +const skillsListCommand = command({ + name: 'list', + description: 'List available bundled skills', + args: { + json: flag({ long: 'json', description: 'Output as JSON' }), + }, + handler: ({ json }) => { + const skillsDir = requireSkillsDir(); + const names = listSkillNames(skillsDir); + if (json) { + process.stdout.write(`${JSON.stringify({ success: true, data: names })}\n`); + } else { + for (const name of names) { + console.log(name); + } + } + }, +}); + +const skillsGetCommand = command({ + name: 'get', + description: 'Get skill content by name (or --all 
for all skills)', + args: { + name: positional({ type: optional(string), displayName: 'name', description: 'Skill name' }), + all: flag({ long: 'all', description: 'Get all skills' }), + full: flag({ + long: 'full', + description: 'Also include files under references/, templates/, and agents/', + }), + ref: option({ + type: optional(string), + long: 'ref', + description: + 'Load a single reference file by name (searches references/, templates/, agents/). Takes precedence over --full.', + }), + json: flag({ long: 'json', description: 'Output as JSON' }), + }, + handler: ({ name, all, full, ref, json }) => { + const skillsDir = requireSkillsDir(); + + if (ref !== undefined && all) { + const msg = '--ref is incompatible with --all'; + if (json) { + process.stdout.write(`${JSON.stringify({ success: false, error: msg })}\n`); + } else { + console.error(`Error: ${msg}`); + } + process.exit(1); + } + + if (ref !== undefined) { + if (name === undefined) { + const msg = '--ref requires a skill name'; + if (json) { + process.stdout.write(`${JSON.stringify({ success: false, error: msg })}\n`); + } else { + console.error(`Error: ${msg}`); + } + process.exit(1); + } + const skillDir = path.join(skillsDir, name); + if (!existsSync(skillDir)) { + const msg = `skill '${name}' not found`; + if (json) { + process.stdout.write(`${JSON.stringify({ success: false, error: msg })}\n`); + } else { + console.error(`Error: ${msg}`); + const available = listSkillNames(skillsDir); + if (available.length > 0) { + console.error(`Available skills: ${available.join(', ')}`); + } + } + process.exit(1); + } + const file = findRefFile(skillDir, ref); + if (!file) { + const msg = `reference '${ref}' not found in skill '${name}'`; + if (json) { + process.stdout.write(`${JSON.stringify({ success: false, error: msg })}\n`); + } else { + console.error(`Error: ${msg}`); + const available = listRefFiles(skillDir); + if (available.length > 0) { + console.error(`Available reference files:\n 
${available.join('\n ')}`); + } + } + process.exit(1); + } + if (json) { + process.stdout.write( + `${JSON.stringify({ success: true, data: [{ name, content: file.content, files: { [file.relPath]: file.content } }] })}\n`, + ); + return; + } + process.stdout.write(file.content); + if (!file.content.endsWith('\n')) process.stdout.write('\n'); + return; + } + + if (all || name === undefined) { + const names = listSkillNames(skillsDir); + const skills = names + .map((n) => readSkill(skillsDir, n, full)) + .filter((s): s is SkillData => s !== null); + + if (json) { + process.stdout.write(`${JSON.stringify({ success: true, data: skills })}\n`); + return; + } + for (const skill of skills) { + if (skills.length > 1) { + process.stdout.write(`\n=== ${skill.name} ===\n\n`); + } + printSkill(skill, false); + } + return; + } + + const skill = readSkill(skillsDir, name, full); + if (!skill) { + if (json) { + process.stdout.write( + `${JSON.stringify({ success: false, error: `Skill '${name}' not found` })}\n`, + ); + } else { + console.error(`Error: skill '${name}' not found`); + const available = listSkillNames(skillsDir); + if (available.length > 0) { + console.error(`Available skills: ${available.join(', ')}`); + } + } + process.exit(1); + } + + printSkill(skill, json); + }, +}); + +const skillsPathCommand = command({ + name: 'path', + description: 'Print path to bundled skills directory (or specific skill directory)', + args: { + name: positional({ type: optional(string), displayName: 'name', description: 'Skill name' }), + }, + handler: ({ name }) => { + const skillsDir = requireSkillsDir(); + if (name) { + const skillDir = path.join(skillsDir, name); + if (!existsSync(skillDir)) { + console.error(`Error: skill '${name}' not found`); + process.exit(1); + } + console.log(skillDir); + } else { + console.log(skillsDir); + } + }, +}); + +export const skillsCommand = subcommands({ + name: 'skills', + description: 'List and retrieve bundled AgentV skills', + cmds: { + list: 
skillsListCommand, + get: skillsGetCommand, + path: skillsPathCommand, + }, +}); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 93d4156a9..e32e20772 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -15,6 +15,7 @@ import { pipelineCommand } from './commands/pipeline/index.js'; import { resultsCommand } from './commands/results/index.js'; import { resultsServeCommand } from './commands/results/serve.js'; import { selfCommand } from './commands/self/index.js'; +import { skillsCommand } from './commands/skills/index.js'; import { transpileCommand } from './commands/transpile/index.js'; import { trendCommand } from './commands/trend/index.js'; import { trimCommand } from './commands/trim/index.js'; @@ -36,6 +37,7 @@ export const app = subcommands({ pipeline: pipelineCommand, results: resultsCommand, self: selfCommand, + skills: skillsCommand, serve: resultsServeCommand, studio: resultsServeCommand, inspect: inspectCommand, @@ -67,6 +69,7 @@ const TOP_LEVEL_COMMANDS = new Set([ 'pipeline', 'results', 'self', + 'skills', 'serve', 'studio', 'trend', diff --git a/apps/cli/test/unit/skills.test.ts b/apps/cli/test/unit/skills.test.ts new file mode 100644 index 000000000..4685dbfee --- /dev/null +++ b/apps/cli/test/unit/skills.test.ts @@ -0,0 +1,223 @@ +/** + * Unit tests for the skills command helpers. + * + * Tests cover: discovery from install layout, SKILL.md reading, + * --full collecting references/, and the not-found error path. + */ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +// We test the pure helper functions by importing them after patching. +// Since findSkillsDir() relies on import.meta.url (which points to the +// compiled test file), we exercise the helpers by building a temp skills dir +// and calling the internal functions directly. 
+ +// ── Test doubles ──────────────────────────────────────────────────────────── + +/** + * Mirrors the internal listSkillNames / readSkill logic without depending on + * import.meta.url resolution so we can test it with a fixture directory. + */ +import { existsSync, readFileSync, readdirSync } from 'node:fs'; + +function listSkillNames(skillsDir: string): string[] { + if (!existsSync(skillsDir)) return []; + return readdirSync(skillsDir, { withFileTypes: true }) + .filter((e) => e.isDirectory()) + .map((e) => e.name) + .sort(); +} + +function collectDir(dir: string, prefix = ''): Record { + const result: Record = {}; + if (!existsSync(dir)) return result; + for (const entry of readdirSync(dir, { withFileTypes: true })) { + const relPath = prefix ? `${prefix}/${entry.name}` : entry.name; + if (entry.isDirectory()) { + Object.assign(result, collectDir(path.join(dir, entry.name), relPath)); + } else { + result[relPath] = readFileSync(path.join(dir, entry.name), 'utf-8'); + } + } + return result; +} + +function readSkill( + skillsDir: string, + name: string, + full: boolean, +): { name: string; content: string; files?: Record } | null { + const skillDir = path.join(skillsDir, name); + if (!existsSync(skillDir)) return null; + const skillFile = path.join(skillDir, 'SKILL.md'); + if (!existsSync(skillFile)) return null; + const content = readFileSync(skillFile, 'utf-8'); + if (!full) return { name, content }; + const files: Record = {}; + for (const sub of ['references', 'templates', 'agents']) { + Object.assign(files, collectDir(path.join(skillDir, sub), sub)); + } + return { name, content, files: Object.keys(files).length > 0 ? files : undefined }; +} + +function findRefFile( + skillDir: string, + refName: string, +): { relPath: string; content: string } | null { + const candidates = refName.endsWith('.md') ? 
[refName] : [refName, `${refName}.md`]; + for (const sub of ['references', 'templates', 'agents']) { + for (const candidate of candidates) { + const filePath = path.join(skillDir, sub, candidate); + if (existsSync(filePath)) { + return { relPath: `${sub}/${candidate}`, content: readFileSync(filePath, 'utf-8') }; + } + } + } + for (const candidate of candidates) { + const filePath = path.join(skillDir, candidate); + if (existsSync(filePath)) { + return { relPath: candidate, content: readFileSync(filePath, 'utf-8') }; + } + } + return null; +} + +// ── Fixtures ───────────────────────────────────────────────────────────────── + +let tmpDir: string; + +function write(relPath: string, content: string): void { + const full = path.join(tmpDir, relPath); + mkdirSync(path.dirname(full), { recursive: true }); + writeFileSync(full, content, 'utf-8'); +} + +beforeEach(() => { + tmpDir = path.join(os.tmpdir(), `agentv-skills-test-${Date.now()}`); + mkdirSync(tmpDir, { recursive: true }); + + // Create two skill directories + write( + 'agentv-bench/SKILL.md', + '---\nname: agentv-bench\ndescription: Run evals\n---\n# AgentV Bench\nContent here.\n', + ); + write('agentv-bench/references/cli.md', '# CLI Reference\nSome commands.\n'); + write('agentv-bench/references/eval-yaml-spec.md', '# Eval YAML spec\nSchema details.\n'); + write('agentv-bench/agents/executor.md', '# Executor agent\nRole definition.\n'); + write('agentv-bench/templates/sample.yaml', 'name: sample\n'); + write('agentv-bench/LICENSE.txt', 'MIT License\n'); + write( + 'agentv-eval-writer/SKILL.md', + '---\nname: agentv-eval-writer\ndescription: Write evals\nhidden: true\n---\n# Eval Writer\n', + ); +}); + +afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); +}); + +// ── Tests ───────────────────────────────────────────────────────────────────── + +describe('listSkillNames', () => { + it('returns sorted skill names from the skills directory', () => { + const names = listSkillNames(tmpDir); + 
expect(names).toEqual(['agentv-bench', 'agentv-eval-writer']); + }); + + it('returns empty array for a non-existent directory', () => { + expect(listSkillNames('/does/not/exist')).toEqual([]); + }); +}); + +describe('readSkill', () => { + it('reads SKILL.md content', () => { + const skill = readSkill(tmpDir, 'agentv-bench', false); + expect(skill).not.toBeNull(); + expect(skill?.name).toBe('agentv-bench'); + expect(skill?.content).toContain('# AgentV Bench'); + expect(skill?.files).toBeUndefined(); + }); + + it('includes frontmatter including hidden: true', () => { + const skill = readSkill(tmpDir, 'agentv-eval-writer', false); + expect(skill?.content).toContain('hidden: true'); + }); + + it('returns null for non-existent skill', () => { + expect(readSkill(tmpDir, 'does-not-exist', false)).toBeNull(); + }); + + it('--full collects references/ files', () => { + const skill = readSkill(tmpDir, 'agentv-bench', true); + expect(skill?.files).toBeDefined(); + expect(skill?.files?.['references/cli.md']).toContain('# CLI Reference'); + }); + + it('--full collects agents/ files', () => { + const skill = readSkill(tmpDir, 'agentv-bench', true); + expect(skill?.files?.['agents/executor.md']).toContain('# Executor agent'); + }); + + it('--full collects templates/ files alongside references/ and agents/', () => { + const skill = readSkill(tmpDir, 'agentv-bench', true); + expect(skill?.files?.['templates/sample.yaml']).toContain('name: sample'); + }); + + it('--full returns no files key when no references/, templates/, or agents/', () => { + const skill = readSkill(tmpDir, 'agentv-eval-writer', true); + expect(skill?.files).toBeUndefined(); + }); +}); + +describe('findRefFile', () => { + it('locates a reference by bare name (auto-appends .md)', () => { + const skillDir = path.join(tmpDir, 'agentv-bench'); + const file = findRefFile(skillDir, 'eval-yaml-spec'); + expect(file?.relPath).toBe('references/eval-yaml-spec.md'); + expect(file?.content).toContain('# Eval YAML spec'); + 
}); + + it('locates a reference when caller already includes .md', () => { + const skillDir = path.join(tmpDir, 'agentv-bench'); + const file = findRefFile(skillDir, 'cli.md'); + expect(file?.relPath).toBe('references/cli.md'); + }); + + it('finds an agents/ file', () => { + const skillDir = path.join(tmpDir, 'agentv-bench'); + const file = findRefFile(skillDir, 'executor'); + expect(file?.relPath).toBe('agents/executor.md'); + }); + + it('finds a templates/ file by exact name', () => { + const skillDir = path.join(tmpDir, 'agentv-bench'); + const file = findRefFile(skillDir, 'sample.yaml'); + expect(file?.relPath).toBe('templates/sample.yaml'); + }); + + it('falls back to a bare file in the skill root (e.g. LICENSE.txt)', () => { + const skillDir = path.join(tmpDir, 'agentv-bench'); + const file = findRefFile(skillDir, 'LICENSE.txt'); + expect(file?.relPath).toBe('LICENSE.txt'); + expect(file?.content).toContain('MIT License'); + }); + + it('returns null for a missing reference', () => { + const skillDir = path.join(tmpDir, 'agentv-bench'); + expect(findRefFile(skillDir, 'no-such-ref')).toBeNull(); + }); +}); + +describe('collectDir', () => { + it('recursively collects files with relative paths', () => { + const refsDir = path.join(tmpDir, 'agentv-bench', 'references'); + const files = collectDir(refsDir, 'references'); + expect(files['references/cli.md']).toContain('# CLI Reference'); + }); + + it('returns empty record for missing directory', () => { + expect(collectDir('/does/not/exist', 'references')).toEqual({}); + }); +}); diff --git a/apps/cli/tsup.config.ts b/apps/cli/tsup.config.ts index 96445beb1..3ecef5a9a 100644 --- a/apps/cli/tsup.config.ts +++ b/apps/cli/tsup.config.ts @@ -46,6 +46,21 @@ export default defineConfig({ console.log('✓ Template files copied to dist/templates'); + // Copy bundled skills from /skills-data/ → dist/skills/. 
+ // `skills-data/` at the repo root is the source of truth for full skill + // content (mirrors agent-browser's top-level `skill-data/` layout); the + // marketplace plugin files (plugins/agentv-dev/skills/) are stubs that + // redirect agents to `agentv skills get `. + const srcSkillsDir = path.resolve('..', '..', 'skills-data'); + const distSkillsDir = path.join('dist', 'skills'); + rmSync(distSkillsDir, { recursive: true, force: true }); + if (existsSync(srcSkillsDir)) { + cpSync(srcSkillsDir, distSkillsDir, { recursive: true }); + console.log('✓ Skills copied to dist/skills'); + } else { + console.log('⚠ Skills source not found at', srcSkillsDir, '— skipping'); + } + // Copy studio dist if available (built by apps/studio) const studioDistDir = path.resolve('..', 'studio', 'dist'); const cliStudioDir = path.join('dist', 'studio'); diff --git a/apps/web/src/content/docs/docs/getting-started/installation.mdx b/apps/web/src/content/docs/docs/getting-started/installation.mdx index 085dbfa7e..dca9211f9 100644 --- a/apps/web/src/content/docs/docs/getting-started/installation.mdx +++ b/apps/web/src/content/docs/docs/getting-started/installation.mdx @@ -1,51 +1,41 @@ --- title: Installation -description: Install AgentV with Claude plugin manager or CLI +description: Install AgentV CLI and get started with bundled skills sidebar: order: 2 --- ## Prerequisites -- **Claude Code** with plugin manager support - **Node.js** 20 or later -## Canonical Setup (Claude Plugin Manager) +## Canonical Setup -Install the AgentV marketplace and plugin: +Install the AgentV CLI: ```bash -npx allagents plugin marketplace add EntityProcess/agentv -npx allagents plugin install agentv-dev@agentv +npm install -g agentv ``` -`npx allagents` is command-surface compatible with `claude` and `copilot`. - -Then ask Claude to bootstrap AgentV in your current repository: +Then load the onboarding skill and follow the instructions: -```text -Set up AgentV in this repo. 
+```bash +agentv skills get agentv-onboarding ``` -The onboarding skill will: -- verify `agentv` CLI availability -- install `agentv` if needed -- run `agentv init` -- verify setup artifacts +Paste the output to your AI agent and ask it to set up AgentV in your repository. -## CLI-Only Setup (Fallback) +## Skills -If you are not using Claude plugins: +AgentV ships skill content inside the CLI package, version-matched to the binary. +No separate plugin install required. ```bash -npm install -g agentv -agentv init -``` - -Verify the CLI: - -```bash -agentv --version +agentv skills list # list available skills +agentv skills get agentv-bench # load a specific skill +agentv skills get agentv-bench --full # include references and templates +agentv skills get agentv-bench --json # machine-readable output +agentv skills get --all # load all skills ``` ## Verify Workspace Files @@ -61,21 +51,28 @@ test -f .agentv/config.yaml test -f .agentv/targets.yaml ``` -## Troubleshooting - -### Plugin installed but setup skill does not run +## Claude Code Plugin (Optional) -Reinstall and retry: +For Claude Code users who prefer plugin-based skill discovery, the `agentv-dev` plugin +provides marketplace integration. Each plugin SKILL.md is a discovery stub that loads +the full skill content from the CLI: ```bash npx allagents plugin marketplace add EntityProcess/agentv npx allagents plugin install agentv-dev@agentv ``` -Then ask Claude: +`npx allagents` is command-surface compatible with `claude` and `copilot`. + +## Troubleshooting + +### Skills directory not found + +Reinstall the CLI to ensure bundled skills are present: -```text -Set up AgentV in this repo. 
+```bash +npm install -g agentv +agentv skills list ``` ### Recover setup manually diff --git a/evals/self/skills/README.md b/evals/self/skills/README.md new file mode 100644 index 000000000..e476eeb9e --- /dev/null +++ b/evals/self/skills/README.md @@ -0,0 +1,128 @@ +# `agentv skills` evals + +Three-category eval suite covering the `agentv skills` CLI subcommand +(`list`, `get`, `path`) shipped in #1224. Each category isolates one +aspect of the skill UX so a regression in any single dimension is easy +to read off the run report. + +## Categories + +| File | What it tests | Tests | +|------|---------------|-------| +| `skill-invocation.eval.yaml` | Does the agent invoke the right CLI command + flag for a given task? | 8 | +| `skill-selection.eval.yaml` | Does the agent pick the right skill for a natural-language task? | 8 | +| `output-correctness.eval.yaml` | Does the agent produce structurally and factually correct output (YAML, CLI commands, descriptions)? | 7 | + +The three categories mirror the structure used by `agent-browser`'s +skill evals: invocation (does the agent reach for the tool), selection +(does it pick the right entry), and output (does the result hold up). + +## Fixtures + +`fixtures/` holds frozen snapshots of CLI output. They are checked in +so eval runs are deterministic and don't require network or build +state. Two flavours per skill: + +- `agentv-.txt` — bare `SKILL.md` content (`agentv skills get `). + Used in most tests; small (1.5–25 KB). +- `agentv--full.txt` — same plus every file under `references/` + and `templates/` (`agentv skills get --full`). Only used in + tests that specifically validate `--full` behaviour, since these can + be hundreds of KB. + +Plus two single-purpose fixtures: + +- `skills-list-all.txt` — output of `agentv skills list --json`. +- `skills-get-nonexistent.txt` — error output of `agentv skills get does-not-exist`. 
+ +### Regenerating fixtures + +After any change to bundled skill content or the `agentv skills` CLI, +regenerate fixtures from the worktree root: + +```bash +cd evals/self/skills + +# Bare SKILL.md per skill +for skill in agentv-bench agentv-eval-review agentv-eval-writer \ + agentv-governance agentv-onboarding agentv-trace-analyst; do + node ../../../apps/cli/dist/cli.js skills get "$skill" \ + > "fixtures/${skill}.txt" 2>&1 +done + +# --full variants (with references/ + templates/) +for skill in agentv-bench agentv-eval-review agentv-eval-writer \ + agentv-governance agentv-onboarding agentv-trace-analyst; do + node ../../../apps/cli/dist/cli.js skills get "$skill" --full \ + > "fixtures/${skill}-full.txt" 2>&1 +done + +# Listing + error fixtures +node ../../../apps/cli/dist/cli.js skills list --json \ + > fixtures/skills-list-all.txt 2>&1 +node ../../../apps/cli/dist/cli.js skills get does-not-exist \ + > fixtures/skills-get-nonexistent.txt 2>&1 +``` + +`bun apps/cli/src/cli.ts skills …` works equivalently when running +against TypeScript sources. + +## Running + +From the worktree root: + +```bash +# All three categories against one target +node apps/cli/dist/cli.js eval run evals/self/skills/*.eval.yaml --target + +# A single category +node apps/cli/dist/cli.js eval run \ + evals/self/skills/skill-selection.eval.yaml --target azure + +# A single test +node apps/cli/dist/cli.js eval run \ + evals/self/skills/skill-invocation.eval.yaml \ + --test-id invoke-get-full-flag --target azure +``` + +`` is any name resolvable from `targets.yaml` in the worktree +(e.g. `azure`, `claude`, `mock`). + +## Adding test cases + +Tests are plain entries under `tests:`. 
Each test must have: + +- `id` (kebab-case, unique within the file) +- `criteria` — one-line human description +- `input` — either a bare string or a `[{role, content: [...]}]` block + when injecting fixtures via `type: file` +- `assertions` — at least one entry; prefer deterministic types + (`contains`, `regex`, `icontains-any`) over `rubrics` so the eval + stays cheap and stable. Use `rubrics` only for genuinely qualitative + checks. + +Pattern for fixture-driven tests: + +```yaml +- id: my-new-test + criteria: One-line description + input: + - role: user + content: + - type: file + value: fixtures/agentv-onboarding.txt + - type: text + value: | + + assertions: + - type: contains + value: + - type: rubrics + criteria: + - + - +``` + +When every test in a file shares the same fixture, declare it once at +the top level under `input:` (suite-level input, prepended to every +test) instead of repeating it per-test — see `skill-selection.eval.yaml`. diff --git a/evals/self/skills/fixtures/agentv-bench-full.txt b/evals/self/skills/fixtures/agentv-bench-full.txt new file mode 100644 index 000000000..270384675 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-bench-full.txt @@ -0,0 +1,1938 @@ +--- +name: agentv-bench +description: >- + Run AgentV evaluations and optimize agents through eval-driven iteration. + Triggers: run evals, benchmark agents, optimize prompts/skills against evals, compare + agent outputs across providers, analyze eval results, offline evaluation of recorded sessions, + run autoresearch, optimize unattended, run overnight optimization loop. + Not for: writing/editing eval YAML without running (use agentv-eval-writer), + analyzing existing traces/JSONL without re-running (use agentv-trace-analyst). +--- + +# AgentV Bench + + +A skill for evaluating agents and iteratively improving them through data-driven optimization. 
+ +At a high level, the process goes like this: + +- Understand what the agent does and what "good" looks like +- Write evaluation test cases (EVAL.yaml or evals.json) +- Run the agent on those test cases, grade the outputs +- Analyze the results — what's working, what's failing, and why +- Improve the agent's prompts/skills/config based on the analysis +- Repeat until you're satisfied + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress. Maybe they want to start from scratch — help them write evals, run them, and iterate. Maybe they already have results — jump straight to analysis and improvement. + +Be flexible. If the user says "I don't need a full benchmark, just help me debug this failure", do that instead. + +After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see `references/description-optimization.md`). + +## Communicating with the user + +This skill is used by people across a wide range of familiarity with evaluation tooling. Pay attention to context cues: + +- "evaluation" and "benchmark" are borderline but OK in most cases +- For "YAML", "grader", "assertion", "deterministic judge" — see serious cues from the user that they know what those mean before using them without explanation +- Briefly explain terms if in doubt + +When presenting results, default to summary tables. Offer detail on request. In CI/headless mode, skip interactive prompts and exit with status codes. + +--- + +## Step 1: Understand the Agent + +Before running or optimizing, understand what you're working with. + +1. **Read the agent's artifacts** — prompts, skills, configs, recent changes. Understand the full picture: what tools are available, what the expected input/output looks like, what constraints exist. + +2. **Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? 
Talk to the user if this isn't clear from the artifacts alone. + +3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what grader types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail. + +4. **Challenge assumptions** — if evals already exist, review their quality before running: + - Are the test cases testing the right things? + - Are assertions specific enough to catch real failures? + - Are there ambiguous or contradictory test cases? + - Flag eval issues before proceeding — running bad evals wastes time. + +5. **Check integrity** — ensure task prompts (what the agent receives) are not also used as grader prompts (how outputs are scored). If a prompt file appears in both locations, note the overlap and optimize only for the task purpose. + +--- + +## Step 2: Write Evaluations + +AgentV supports two evaluation formats: + +**EVAL.yaml** (native, full features) — supports workspaces, code graders, multi-turn conversations, tool trajectory scoring, workspace file tracking, multi-provider targets. Use this for agent evaluation. + +```yaml +# example.eval.yaml +tests: + - id: basic-code-review + input: "Review this TypeScript file for bugs and suggest improvements" + criteria: "Identifies the null pointer bug on line 12 and suggests a fix" + assertions: + - type: contains + value: "null" + - Review identifies the null pointer bug and suggests a concrete fix + +workspace: + template: ./workspace-template + hooks: + before_each: + reset: fast +``` + +Multi-skill evaluation is handled naturally via input messages — describe the task in the test input, and the agent uses whatever skills it needs. 
+ +**evals.json** (skill-creator compatible) — auto-promoted to EVAL-equivalent format: +- `prompt` → input messages +- `expected_output` → reference answer +- `assertions` → graders +- `files[]` paths resolved relative to the evals.json location + +```json +{ + "skill_name": "my-agent", + "evals": [ + { + "id": 1, + "prompt": "User's task prompt", + "expected_output": "Description of expected result", + "assertions": ["Output includes error handling", "Uses async/await"] + } + ] +} +``` + +### Writing good test cases + +Start with 2-3 realistic test cases — the kind of thing a real user would actually say. Share them with the user before running: "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" + +Good assertions are objectively verifiable and have descriptive names. Subjective quality ("the output is good") is better evaluated qualitatively — don't force assertions onto things that need human judgment. + +**Grader types** (cheapest to most expensive): `exact`, `contains`, `regex`, `is-json`, `field-accuracy`, `composite`, `code-grader`, `tool-trajectory`, `llm-grader`. See `references/eval-yaml-spec.md` for full config and grading recipes for each type. + +Prefer deterministic graders over LLM graders whenever possible. If an assertion can be checked with `contains` or `regex`, don't use `llm-grader`. + +--- + +## Step 3: Run and Grade + +This section is one continuous sequence — don't stop partway through. + +Each run produces a new `.agentv/results/runs//` directory automatically. Use timestamps to identify iterations when comparing runs. + +### Choosing a run mode + +**User instruction takes priority.** If the user says "run in subagent mode", "use subagent mode", or "use CLI mode", use that mode directly. + +If the user has not specified a mode, default to `subagent`. 
+ +| `AGENT_EVAL_MODE` | Mode | How | +|----------------------|------|-----| +| `subagent` (default) | **Subagent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. | +| `cli` | **AgentV CLI** | `agentv eval ` — end-to-end, multi-provider | + +Set `AGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. If absent, default to `subagent`. **User instruction always overrides this.** + +**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. Read `references/subagent-pipeline.md` for the detailed procedure. + +**`cli`** — AgentV CLI handles execution, grading, and artifact generation end-to-end. Works with all providers. Use when you need multi-provider benchmarking or CLI-specific features. + +### Running evaluations + +**AgentV CLI mode** (end-to-end, EVAL.yaml): +```bash +agentv eval --output .agentv/artifacts/ +``` + +**Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below. + +**Spawn all runs in the same turn.** For each test case that needs both a "with change" and a "baseline" run, launch them simultaneously. Don't run one set first and come back for the other — launch everything at once so results arrive around the same time. 
+ +**Multi-target benchmarking:** +```bash +agentv eval --target claude --target gpt --target copilot +``` + +**Baseline strategy:** +- **New agent**: baseline is "no prompt" or minimal prompt — same eval, no agent-specific configuration +- **Improving existing**: snapshot the current version before editing (`cp -r /prompt-snapshot/`), use as baseline throughout +- **Multi-target**: each target is its own baseline — no need for a separate "without" run + +### While runs are in progress, draft graders + +Don't just wait for runs to finish — use this time productively. If assertions don't exist yet, draft them now. If they exist, review them and explain what they check to the user. + +Good assertions are *discriminating* — they pass when the agent genuinely succeeds and fail when it doesn't. An assertion that passes for both good and bad outputs is worse than no assertion. + +### As runs complete, capture timing data + +When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory. See `references/schemas.md` for the timing.json schema. + +This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives. + +### Grading + +**In CLI mode**, `agentv eval` handles all grading end-to-end — no manual phases needed. + +**In subagent mode**, grading has three phases. **All three are required — do not stop after phase 1.** + +**Phase 1: Code graders** (deterministic, zero-cost) + +```bash +agentv pipeline grade +``` + +This evaluates all deterministic assertions against `response.md` files. 
Two types are handled: +- **`code-grader` scripts** — external scripts executed against the response (arbitrary logic, any language) +- **Built-in assertion types** — evaluated in-process: `contains`, `contains-any`, `contains-all`, `icontains`, `regex`, `equals`, `starts-with`, `ends-with`, `is-json`, and variants + +Both types are configured by `pipeline input` into `code_graders/.json` and graded by `pipeline grade`. Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline. + +**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1. + +**Phase 2: LLM grading** (semantic — do NOT skip this phase) + +Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading. +Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. + +**Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair. + +**Before dispatching graders, read `agents/grader.md` and embed its full content as the system instructions in every grader subagent prompt.** The grader is a `general-purpose` task agent — there is no auto-resolved "grader" type. Without `agents/grader.md` embedded verbatim, the subagent has no grading process, no output format, and no file-path knowledge, and will produce empty or incorrect output. + +Each grader subagent (operating under `agents/grader.md` instructions): +1. Reads `/llm_graders/.json` for the grading prompt +2. 
Reads `/response.md` for the candidate output +3. Grades the response against the prompt criteria +4. **Writes its result to disk**: `///llm_grader_results/.json` +5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator + +**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/.json` makes grading resumable and assertion evidence durable. + +The result file format is: +```json +{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } +``` + +After **all** grader subagents complete, run Phase 3 directly. + +**Phase 3: Merge and validate** + +```bash +agentv pipeline bench +agentv results validate +``` + +`pipeline bench` reads LLM grader results from `llm_grader_results/.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`. + +> **Diagnosing `pass_rate=0`:** If `pipeline bench` reports `pass_rate=0` across the board, do **not** assume the tests genuinely failed. First verify the grading pipeline ran correctly: check that `/llm_grader_results/.json` exists and is non-empty for each test. If these files are absent or empty, the grader subagents failed to produce output (most common cause: `agents/grader.md` was not embedded in the subagent prompts — see Phase 2). Treat `pass_rate=0` as a real signal only after confirming grader results exist. + +### Artifacts + +All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure. 
Key artifacts per run: +- **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary +- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` +- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` + +Write artifacts to `.agentv/artifacts/` or the iteration directory. + +### Workspace features (EVAL.yaml only) + +- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) +- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) +- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support +- **File change tracking** — grade by diffing workspace files before/after agent execution + +--- + +## Step 4: Analyze Results + +Once all runs are graded, analyze the results before attempting improvements. + +### Pattern analysis + +Read the JSONL results and look for: + +- **Always-pass tests** — assertion too loose or non-discriminating. If it passes for both good and bad outputs, it's not testing anything. +- **Always-fail tests** — task impossible, eval broken, or assertion misconfigured. Don't optimize against broken evals. +- **Flaky tests** — non-deterministic results across runs. Investigate before treating failures as real. +- **Systematic failures** — same failure pattern across multiple tests. This usually points to a missing instruction or wrong approach. +- **Deterministic upgrade candidates** — `llm-grader` assertions that could be replaced with `contains`, `regex`, or `is-json` (cheaper, faster, more reliable). + +### Dispatch subagents + +- **Dispatch `analyzer`** (read `agents/analyzer.md`) for a structured quality audit: deterministic upgrade suggestions, weak assertion detection, cost/quality flags, and benchmark pattern analysis. + +- **Dispatch `comparator`** (read `agents/comparator.md`) for blind N-way comparison between iterations or targets. 
The comparator blinds provider identities, generates task-specific rubrics, scores each output, then unblinds and attributes improvements. + +### Trace analysis + +Use CLI tools for deeper investigation: +```bash +agentv inspect # Detailed execution trace inspection +agentv compare # Structured diff between runs +``` + +Look for: tool call patterns, error recovery behavior, conversation flow, wasted steps. + +### Present results to the user + +Show a summary table: + +``` +| Test ID | Score | Pass/Fail | Delta | Notes | +|------------------|-------|-----------|-------|--------------------------| +| basic-code-review| 0.85 | ✓ PASS | +0.15 | Found the bug this time | +| edge-case-empty | 0.00 | ✗ FAIL | — | Crashed on empty input | +``` + +Highlight: +- Current pass rate and delta from baseline +- Comparison results (which target/iteration won and why) +- Analyst observations the aggregate stats would hide + +Ask: "How does this look? Anything you'd change about the evals or the approach?" + +--- + +## Step 5: Improve + +This is the heart of the loop. You've run the test cases, analyzed the results, and now you need to make the agent better. + +### How to think about improvements + +1. **Generalize from the analysis.** You're iterating on a small eval set, but the agent will be used on many different inputs. Don't overfit to specific test cases. Rather than fiddly patches or oppressively rigid MUSTs, try different approaches and see what works. It's cheap to experiment. + +2. **Keep the prompt lean.** Read the execution transcripts, not just the final outputs. If the agent wastes time on unproductive steps, remove the instructions causing that. If it always ignores a section, that section isn't pulling its weight. + +3. **Explain the why.** Today's LLMs are smart. They have good theory of mind and can go beyond rote instructions when given good reasoning. 
If you find yourself writing ALWAYS or NEVER in all caps, that's a yellow flag — reframe as an explanation of why the thing matters. That's more humane, powerful, and effective. + +4. **Look for repeated work.** Read the transcripts from test runs and notice if the agent independently takes the same multi-step approach to something across cases. If all test runs result in writing the same helper script, bundle it. If every run makes the same mistake, the instruction is missing or unclear. + +### Applying changes + +- **Surgical edits**: ADD (new rule for a missing constraint), UPDATE (refine for clarity), DELETE (remove redundant or harmful rules), NEGATIVE CONSTRAINT (explicitly state what NOT to do) +- **One change per iteration** to isolate effects. If you change three things and the score improves, you don't know which change helped. +- **Variant tracking**: When a change helps some tests but hurts others, maintain 2-3 prompt variants. Compare variants to find the best overall approach before converging. +- **When converging**: Generalize specific patches into broad principles. Remove redundancy and contradictions. Ensure the prompt is clear, focused, and under 200 lines. + +### Evaluation integrity + +**Critical**: Only optimize **task prompts** (what the agent receives), never **judge prompts** (how graders score outputs). Modifying judge prompts games the evaluation without improving the agent. + +If a prompt file is referenced in both task input and grader configs, optimize for the task purpose only. Document which prompts were modified in the optimization log. + +### The iteration loop + +After improving: + +1. Apply your changes to the agent's prompts/skills/config +2. Re-run all test cases (agentv creates a new `.agentv/results/runs//` directory automatically) +3. Compare against the previous iteration (Step 4). 
If running in automated mode, use the **automated keep/discard** logic below instead of manual judgment — it will decide whether to keep or revert the change for you. +4. Present results to the user (or log the decision if running automated keep/discard) +5. Stop when ANY of: + - The user says they're happy + - Feedback is all empty (everything looks good) + - You're not making meaningful progress (no improvement for 2 consecutive iterations) + - Target pass rate is reached + - Maximum iterations exhausted + +**Human checkpoints**: At iterations 3, 6, and 9, always present progress to the user regardless of automation settings. Push back if optimization is accumulating contradictory rules or overfitting to specific test cases. + +### Automated keep/discard + +For autonomous iteration, use `agentv compare --json` to automatically decide whether to keep or discard each change based on wins/losses/ties. Read `references/autoresearch.md` for the full decision rules, logging format, and integration with the iteration loop. + +--- + +## Entering Mid-Lifecycle + +Users can start at any step by providing existing data: + +| Entry point | Required input | Example prompt | +|------------|---------------|----------------| +| Step 1 (Understand) | `eval-path` | "Optimize my agent against evals/support.yaml" | +| Step 2 (Write Evals) | Agent artifacts | "Write evals for this agent" | +| Step 3 (Run + Grade) | `eval-path` | "Run this eval and show me results" | +| Step 4 (Analyze) | `results-path` | "Analyze why my agent is failing on these results" | +| Step 5 (Improve) | Analysis + strategy | "Apply these optimization suggestions" | + +When entering mid-lifecycle, run only the requested step and subsequent steps. Don't re-run earlier steps unless the user requests a full loop. + +--- + +## Advanced: Blind Comparison + +For situations where you want a rigorous comparison between two versions (e.g., "is the new version actually better?"), dispatch the `comparator` subagent. 
It blinds identities, generates task-specific rubrics, scores outputs, then unblinds and explains why the winner won. + +This is optional and requires subagents. The human review loop is usually sufficient. + +--- + +## Description Optimization + +After the agent is working well, offer to optimize the skill's `description` field for better triggering accuracy. Read `references/description-optimization.md` for the full procedure (generate trigger EVAL.yaml, review with user, iterate, apply). + +--- + +## Autoresearch Mode + +Autoresearch is an unattended eval-improve loop that runs multiple optimize cycles without human intervention. The user triggers it with natural language (e.g., "run autoresearch on this skill", "optimize this skill unattended"). It uses the mutator subagent (`agents/mutator.md`) to rewrite artifacts based on failure analysis, and automated keep/discard to decide whether to keep or revert each change. + +Read `references/autoresearch.md` for the full procedure (prerequisites, artifact layout, keep/discard rules, the step-by-step loop, convergence criteria, and context hygiene). + +--- + +## Environment Adaptation + +For provider-specific notes (Copilot, Codex, Claude SDK, custom CLI), CI/headless mode behavior, and fallback strategies when subagents aren't available, read `references/environment-adaptation.md`. + +--- + +## Subagent Reference + +The `agents/` directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent. 
+ +| Agent | File | Purpose | When to dispatch | +|-------|------|---------|-----------------| +| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | +| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading — one per test × LLM grader pair) | +| comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | +| analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | +| mutator | `agents/mutator.md` | Rewrite artifact from failure analysis | Step 5 (autoresearch — dispatched per cycle) | + +The `references/` directory has additional documentation: +- `references/autoresearch.md` — Autoresearch unattended optimization loop and automated keep/discard rules +- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes +- `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure +- `references/description-optimization.md` — Skill description optimization workflow +- `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior +- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.) +- `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator + +--- + +Repeating the core loop for emphasis: + +- Understand what the agent does +- Write evaluation test cases +- Run the agent and grade outputs +- Analyze results — surface patterns, dispatch analyst and comparator subagents +- Improve the agent based on analysis +- Repeat until you and the user are satisfied + +Take your time with improvements. Read the transcripts. Understand why failures happened. Make changes that generalize beyond the test set. This is important work. 
+ +--- references/autoresearch.md --- +# Autoresearch Mode + +Autoresearch is an unattended eval-improve loop that runs multiple optimize cycles without human intervention. The user triggers it with natural language (e.g., "run autoresearch on this skill", "optimize this skill unattended"). No YAML schema changes or CLI flags are needed. + +## Automated Keep/Discard + +After each iteration, you can automatically decide whether to keep or discard the change using structured comparison output. This replaces manual judgment at steps 3–4 of the iteration loop (Step 5 in SKILL.md), except at human checkpoint iterations (3, 6, 9) where you must still present results to the user. + +### 1. Run the comparison + +After re-running test cases, compare the new results against the previous iteration's baseline: + +```bash +agentv compare .jsonl .jsonl --json +``` + +Where `.jsonl` is the `index.jsonl` from the previous best iteration and `.jsonl` is the `index.jsonl` from the run you just completed. + +### 2. Parse the output + +The `--json` flag produces structured output: + +```json +{ + "summary": { + "wins": 3, + "losses": 1, + "ties": 6, + "mean_delta": 0.05 + } +} +``` + +- **wins**: number of test cases where the candidate scored higher than the baseline +- **losses**: number of test cases where the candidate scored lower +- **ties**: number of test cases with no score change +- **mean_delta**: average score difference across all test cases (positive = candidate is better) + +### 3. Apply decision rules + +Use these rules in order: + +| Condition | Decision | Action | +|-----------|----------|--------| +| `wins > losses` | **KEEP** | Promote the candidate to the new baseline. Copy or note its `index.jsonl` path as the baseline for the next iteration. | +| `wins <= losses` | **DISCARD** | Revert the prompt/skill/config change. The previous baseline remains. Try a different mutation on the next iteration. 
| `mean_delta == 0` AND candidate prompt is shorter (fewer lines) | **KEEP** | Exception to the `wins <= losses` rule above: when performance is equal, simpler prompts are preferred. Promote the candidate as the new baseline. | + +When `mean_delta == 0` and the candidate prompt is *not* shorter, treat it as a **DISCARD** — there's no reason to keep a change that adds complexity without improving results. + +### 4. Log the decision + +Before proceeding to the next iteration, log the decision and rationale so the user can review later: + +``` +Iteration 2: KEEP + wins=3, losses=1, ties=6, meanDelta=+0.05 + Rationale: candidate wins outweigh losses (3 > 1) + Baseline promoted: .agentv/results/runs/20250101-120000/index.jsonl +``` + +``` +Iteration 3: DISCARD + wins=1, losses=2, ties=7, meanDelta=-0.03 + Rationale: candidate losses outweigh wins (2 > 1) + Reverted to baseline: .agentv/results/runs/20250101-110000/index.jsonl + Next: try a different mutation +``` + +Include this log in your progress summary. At human checkpoints (iterations 3, 6, 9), present the full log of automated decisions since the last checkpoint alongside the current results. + +### 5. Integration with the iteration loop + +The automated keep/discard replaces the manual compare-and-present cycle (steps 3–4) during non-checkpoint iterations. The full flow becomes: + +1. Apply change to prompts/skills/config +2. Re-run all test cases +3. Run `agentv compare baseline.jsonl candidate.jsonl --json` +4. Apply keep/discard rules → promote or revert +5. Log the decision +6. If this is iteration 3, 6, or 9 → present progress to the user (human checkpoint) +7. Check stop conditions → continue or stop + +Both modes coexist: if the user is actively reviewing results, present to them as before. If the user has asked you to iterate autonomously, use automated keep/discard and only pause at human checkpoints. + +--- + +## Prerequisites + +- An eval file (`EVAL.yaml` or `evals.json`) must exist for the artifact being optimized.
+- The artifact must be a file or directory (SKILL.md, prompt template, agent config, or a directory of related files like a skill with references/). +- The user should have run at least one interactive eval cycle to build confidence in eval quality before going unattended. + +## The loop + +``` +1. RUN EVAL — agentv eval with current artifact +2. ANALYZE — dispatch analyzer subagent on results +3. DECIDE — if score > best_score: KEEP, else DROP (automated keep/discard above) +4. MUTATE — dispatch mutator subagent with failure analysis (agents/mutator.md) +5. GOTO 1 — until convergence or max_cycles +``` + +## Experiment naming + +Derive the experiment name from the artifact: `autoresearch-` (e.g., `autoresearch-pdf-skill`). The user can also provide a custom name. + +## Artifact mutation flow + +The mutator rewrites artifacts in the working tree in place. **Git is used for versioning** — HEAD always contains the best-known version: + +1. Record the starting commit SHA before the first cycle: `initial_sha=$(git rev-parse HEAD)`. +2. On each **KEEP**: `git add && git commit -m "autoresearch cycle N: "`. +3. On each **DROP**: `git checkout -- ` (restores working tree to HEAD, the last KEEP commit). +4. The eval always runs against the real file path — no temp files or indirection. +5. The mutator can reference the original via `git show :`. + +## How the skill invokes eval + +Shell out to `agentv eval --experiment autoresearch-` via the Bash tool, same as the existing interactive bench workflow. + +## Artifact layout + +Each cycle is a standard eval run. 
Autoresearch session metadata lives in `_autoresearch/` within the experiment directory: + +``` +.agentv/results/runs// + _autoresearch/ + iterations.jsonl # one line per cycle — data for chart + mutator + trajectory.html # live-updating score trajectory chart + 2026-04-15T10-30-00/ # cycle 1 — standard run artifacts + index.jsonl + grading.json + timing.json + benchmark.json + report.html + 2026-04-15T10-35-00/ # cycle 2 — standard run artifacts + ... +``` + +No `original.md` or `best.md` files — git history serves as the backup. The `_` prefix convention distinguishes workflow folders from timestamped run dirs. + +## iterations.jsonl + +One JSON object per line, one line per cycle: + +```jsonl +{"cycle":1,"score":0.65,"decision":"keep","cost_usd":0.12,"assertions":{"IDENTIFIES_BUG":0.8,"SUGGESTS_FIX":0.4},"mutation":"added explicit null-check instruction","run_dir":"2026-04-15T10-30-00","timestamp":"2026-04-15T10:32:15Z"} +``` + +Fields: `cycle` (1-indexed), `score` (overall pass rate 0–1), `decision` ("keep" or "drop"), `cost_usd` (eval run cost), `assertions` (per-assertion pass rates), `mutation` (one-line description of what changed), `run_dir` (timestamped directory name), `timestamp` (ISO 8601). + +## trajectory.html + +A standalone HTML chart file with embedded Chart.js. Copy the template from `scripts/trajectory.html` into the `_autoresearch/` directory. It fetches `iterations.jsonl` from the same directory on each auto-refresh — no data injection needed. Shows: + +- Score over iterations (line chart) with KEEP (green) / DISCARD (red) markers +- Per-assertion pass rates over iterations +- Cumulative cost across iterations +- Best vs original score summary + +Auto-refreshes every 2 seconds during the loop. Becomes static after completion (remove the auto-refresh meta tag on final update). + +## Convergence + +Stop after **3** consecutive cycles with no improvement (no KEEP). Also stop at **max_cycles** (default 10). 
Either limit can be overridden by the user. + +## Human checkpoints + +Autoresearch mode **skips** human checkpoints at iterations 3/6/9. The user opted in to unattended operation by requesting autoresearch. + +## Context hygiene + +The orchestrator must run indefinitely without exhausting its context window. To do this: + +- **Never read eval results, artifacts, or transcripts into your own context.** Use bash commands (jq, agentv CLI) that output small structured summaries. +- **Delegate all heavy reading to subagents.** The mutator reads artifacts, grading results, and transcripts from disk — you pass it paths, not content. +- **Use bash for all file I/O** in the loop body: appending to `iterations.jsonl`, git operations, score extraction. The only tool calls per cycle should be bash commands and one subagent dispatch (mutator). +- **trajectory.html auto-loads `iterations.jsonl`** via fetch — no need to read or update the HTML file after initial copy. + +## Procedure + +Follow this step-by-step procedure to execute autoresearch: + +### 1. Setup + +1. Determine the **artifact path** (file or directory to optimize) and **eval path** (EVAL.yaml or evals.json). +2. Detect **artifact mode**: `file` if the artifact path is a file, `directory` if it's a directory. +3. Derive the **experiment name**: `autoresearch-` from the artifact filename/dirname, or use a user-provided name. +4. Set the experiment directory: `.agentv/results/runs//`. +5. Create the `_autoresearch/` subdirectory inside the experiment directory. +6. Record `initial_sha=$(git rev-parse HEAD)` — the commit before any mutations. +7. Copy `scripts/trajectory.html` to `_autoresearch/trajectory.html`. +8. Initialize variables: + - `best_score = 0` + - `convergence_count = 0` + - `cycle = 1` + - `max_cycles = 10` (or user-specified) + - `max_convergence = 3` (or user-specified) + +### 2. Main loop + +Repeat while `cycle <= max_cycles` and `convergence_count < max_convergence`: + +**a. 
Run eval** + +```bash +agentv eval --experiment autoresearch- +``` + +**b. Extract scores (bash only — do NOT read result files into your context)** + +Find the latest timestamped directory in the experiment folder. Use bash/jq to extract small structured values: + +```bash +# Find latest run dir +RUN_DIR=$(ls -td /20*/ | head -1) + +# Overall score (mean of all scores in index.jsonl) +SCORE=$(jq -sr '[.[].scores[].score] | add / length' "$RUN_DIR/index.jsonl") + +# Per-assertion pass rates as JSON object +PASS_RATES=$(jq -sr '[.[].scores[]] | group_by(.type) | map({key: .[0].type, value: (map(.score) | add / length)}) | from_entries' "$RUN_DIR/index.jsonl") + +# Cost (if timing.json exists) +COST=$(jq -r '.cost_usd // 0' "$RUN_DIR/timing.json" 2>/dev/null || echo 0) +``` + +Capture only these small outputs (`SCORE`, `PASS_RATES`, `COST`) — never read the full JSONL into context. + +**c. Update iterations.jsonl (bash only)** + +After the KEEP/DROP decision (step e), append one JSON line via bash: + +```bash +echo '{"cycle":'$CYCLE',"score":'$SCORE',"decision":"'$DECISION'","cost_usd":'$COST',"assertions":'$PASS_RATES',"mutation":"'"$MUTATION_DESC"'","run_dir":"'"$(basename $RUN_DIR)"'","timestamp":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'"}' >> /_autoresearch/iterations.jsonl +``` + +**d. trajectory.html — no action needed** + +The trajectory chart fetches `iterations.jsonl` directly via HTTP on each auto-refresh. No file manipulation required after the initial copy in setup. + +**e. Decide: KEEP or DROP** + +Apply the automated keep/discard rules from the section above: + +1. Run `agentv compare .jsonl .jsonl --json` where `` is the best iteration's `index.jsonl` (or the first run's `index.jsonl` for cycle 1) and `` is this cycle's `index.jsonl`. +2. If `wins > losses` → **KEEP**. +3. If `wins <= losses` → **DISCARD**. +4. If `mean_delta == 0` and the artifact is simpler → **KEEP** (simpler is better at equal performance). 
Simplicity: for files, compare line count; for directories, compare total size via `du -sb`.

For cycle 1, there is no baseline to compare against — always **KEEP** the first cycle.

**f. If KEEP**

- Update `best_score` to this cycle's score.
- Commit the artifact: `git add <artifact-path> && git commit -m "autoresearch cycle N: <mutation-description>"`.
- Record the current `index.jsonl` path as the new baseline for future comparisons.
- Reset `convergence_count = 0`.

**g. If DROP**

- Revert the working tree to HEAD: `git checkout -- <artifact-path>` (for files) or `git checkout -- <artifact-dir>/` (for directories).
- Increment `convergence_count`.

**h. Check stop conditions**

If `convergence_count >= max_convergence` or `cycle >= max_cycles` → break out of the loop.

**i. Mutate**

Dispatch the **mutator** subagent (`agents/mutator.md`) with:
- `artifact-path`: the file or directory to mutate
- `artifact-mode`: `file` or `directory`
- `initial-sha`: the starting commit SHA (for referencing the original via `git show`)
- `pass-rates`: the `$PASS_RATES` JSON object from step (b) (small — just assertion names and rates)
- `run-dir`: path to this cycle's run directory (the mutator reads `grading.json` and transcripts itself)
- `iterations-path`: path to `_autoresearch/iterations.jsonl` (the mutator reads mutation history itself)
- For directory mode: `focus-files` (optional — files most likely contributing to failures, derived from assertion names)

**Do NOT pass failure descriptions, transcripts, or grading content** to the mutator — pass paths and let it read what it needs from disk. This keeps the orchestrator's context clean.

The mutator rewrites artifacts in place. Verify the artifact was modified (e.g., `git diff --stat`) before continuing.

**j. Continue**

Increment `cycle` and return to step (a).

### 3. Completion

1. Finalize `trajectory.html`: remove the line containing `http-equiv="refresh"` (which includes the `<meta>` tag) so the chart becomes static.
2. 
Log a final summary:
   - Total cycles run
   - Final best score vs original score (cycle 1)
   - Number of KEEPs and DROPs
   - Total cost across all cycles
   - The optimized artifact is in the working tree (and the latest commit)
   - Run `git diff <initial-sha>` to see total changes from the original
   - Run `git log --oneline <initial-sha>..HEAD` to see the mutation history
   - Path to `_autoresearch/trajectory.html` (the score chart)
3. Present results to the user with a recommendation: adopt the optimized version, revert to original (`git checkout <initial-sha> -- <artifact-path>`), or continue iterating interactively.

## Interactive/autonomous hybrid

Users can start in interactive mode (the existing Step 3–5 loop with human checkpoints), build confidence in their eval quality, and then switch to autoresearch mode to run unattended. The two modes share the same eval infrastructure and artifact layout — autoresearch simply automates the keep/discard decisions and removes human checkpoints.

## Model empathy recommendation

For best results, use same-model pairings: the meta-agent running autoresearch should match the model used by the task agent being evaluated (e.g., Claude optimizing a Claude agent, GPT optimizing a GPT agent). Per AutoAgent research findings, same-model pairings produce better mutations because the optimizer has implicit knowledge of how the target model interprets instructions.

--- references/description-optimization.md ---
# Description Optimization

Optimize the `description` field in a skill's SKILL.md frontmatter for better triggering
accuracy. Use this after the agent/skill is working well — this is a polish step, not a
core workflow step.

**Provider compatibility**: Description optimization applies to any agent platform with
skill-discovery mechanisms — Claude Code, Codex (`.agents/` or `.codex/` folders), Copilot,
and others. The `skill-trigger` grader checks whether the agent invoked the right skill,
regardless of how discovery works on that platform. 
+ +## Step 1: Generate Trigger EVAL.yaml + +Create 20 test cases: +- **10 should-trigger**: realistic prompts where this skill should activate — different + phrasings, casual speech, uncommon use cases, edge cases where this skill competes with + another but should win +- **10 should-not-trigger**: near-miss prompts that share keywords but actually need + something different — adjacent domains, ambiguous phrasing where naive matching would + trigger but shouldn't + +Prompts must be realistic — include file paths, personal context, typos, casual speech. +Not abstract requests like "format data" but concrete ones like "ok so my boss sent me +Q4-sales-FINAL-v2.xlsx and she wants me to add a profit margin column..." + +The should-not-trigger cases are the most valuable. "Write a fibonacci function" as a +negative test for an eval skill is useless — it doesn't test anything. The negative cases +should be genuinely tricky near-misses. + +Write as EVAL.yaml with top-level input (the user prompt doesn't specify the skill name — +it's a natural utterance): + +```yaml +# trigger-eval.eval.yaml +tests: + - id: should-trigger-casual-optimize + input: "ok so I have this agent that keeps failing on the code review tasks, can you help me figure out why and fix it" + assertions: + - type: skill-trigger + skill: agentv-bench + - id: should-not-trigger-build-error + input: "my TypeScript build is failing with type errors in src/auth.ts" + assertions: + - type: skill-trigger + skill: agentv-bench + should_trigger: false +``` + +## Step 2: Review with User + +Present the eval set. The user adjusts queries, toggles should-trigger, adds/removes cases. +This step matters — bad eval queries lead to bad descriptions. + +## Step 3: Iterate on Description + +Run the trigger eval, identify misfires, rewrite the description, re-run. Max 5 iterations. +Select best description by held-out test accuracy (split 60% train / 40% test) to avoid +overfitting. 
+ +Use the grader and analyzer subagents to identify trigger failures and propose description +improvements — the same eval → grade → analyze → improve loop used for agent output quality. + +## Step 4: Apply + +Update the skill's SKILL.md frontmatter with the optimized description. Show the user +before/after with accuracy scores. + +--- references/environment-adaptation.md --- +# Environment Adaptation + +Provider-specific notes, CI/headless behavior, and fallback strategies for environments +with limited capabilities. + +## CI/Headless Mode + +Skip interactive prompts. Exit with pass/fail status code. Always generate artifacts for +downstream consumption. + +## No Subagents Available (e.g., Claude.ai) + +Run test cases serially. Skip blind comparison. Present results directly in conversation — +for each test case, show the prompt and output. Ask for feedback inline. Skip benchmarking +(it relies on baseline comparisons that aren't meaningful without subagents). + +## Provider-Specific Notes + +- **Copilot CLI**: Uses ACP protocol via `copilot --acp --stdio` +- **Claude SDK**: Requires `@anthropic-ai/claude-agent-sdk` installed +- **Codex**: Supports skills via `.agents/` or `.codex/` folders. Emits `command_execution` + and `file_change` tool calls. +- **Custom CLI**: Needs `command` and output file pattern in target config +- **Target config**: Uses `${{ ENV_VAR }}` syntax (not `${ENV_VAR}`) for API keys + +**Note**: "Description Optimization" (see `references/description-optimization.md`) applies +to any platform with skill-discovery mechanisms. All listed providers support skills. + +## Unsupported Providers: Use a Code-Grader + +The built-in `skill-trigger` grader covers Claude, Copilot, Pi, Codex and VS Code out +of the box. For providers with different tool-call formats, write a code-grader that inspects +the agent's tool call trace. + +A code-grader receives the full evaluation context including the agent's output messages and +tool calls. 
You can inspect these to determine whether the skill was invoked: + +```yaml +# Example: code-grader for Codex skill-trigger detection +tests: + - id: should-trigger-codex + input: "Analyze this CSV file" + assertions: + - type: code-grader + path: ./judges/codex-skill-trigger.ts +``` + +```typescript +// judges/codex-skill-trigger.ts +import { defineCodeGrader } from '@agentv/eval'; + +export default defineCodeGrader(({ output }) => { + const skillName = 'csv-analyzer'; + const toolCalls = (output ?? []).flatMap((msg) => msg.toolCalls ?? []); + const firstTool = toolCalls[0]; + + if (!firstTool) { + return { score: 0, reason: 'No tool calls recorded' }; + } + + // Codex reads skill files via shell commands + if (firstTool.tool === 'command_execution') { + const cmd = String(firstTool.input ?? ''); + if (cmd.includes(skillName)) { + return { score: 1, reason: `Skill "${skillName}" triggered via command: ${cmd}` }; + } + } + + // Check if skill file was read via file_change or other tools + if (firstTool.tool === 'file_change') { + const path = String((firstTool.input as Record)?.path ?? ''); + if (path.includes(skillName)) { + return { score: 1, reason: `Skill file accessed: ${path}` }; + } + } + + return { score: 0, reason: `First tool was "${firstTool.tool}" — not a skill invocation for "${skillName}"` }; +}); +``` + +This approach is more flexible than config overrides — you can match any tool-call pattern, +check multiple fields, and add provider-specific logic as needed. + +--- references/eval-yaml-spec.md --- +# Eval YAML Spec — Schema and Assertion Grading Recipes + +This reference documents the eval.yaml schema and grading recipes for every assertion type. +The grader agent uses this to evaluate assertions without the CLI. + +## 1. Eval YAML Structure + +### Top-level fields + +- `name` (string, optional) — eval name +- `description` (string, optional) — description +- `execution` (object, optional) — `target`, `model`, etc. 
+- `workspace` (object, optional) — workspace config (template, hooks) +- `tests` (array, required) — test cases + +### Per-test fields + +- `id` (string, required) — unique test identifier +- `input` (string | Message[], required) — task input. String shorthand expands to `[{role: user, content: "..."}]` +- `expected_output` (string | Message[], optional) — reference answer. String shorthand expands to `[{role: assistant, content: "..."}]` +- `criteria` (string, optional) — human-readable success criteria +- `assertions` (array, optional) — grader assertions +- `conversation_id` (string, optional) — groups related tests +- `execution` (object, optional) — per-test execution override + +## 2. Assertion Types and Grading Recipes + +For each assertion type: YAML config fields, grading recipe (exact pseudocode for deterministic types), and PASS/FAIL conditions. + +### Deterministic assertions (zero-cost, instant) + +#### `contains` + +- **Fields:** `value` (string, required) +- **Recipe:** + ``` + response.toLowerCase().includes(value.toLowerCase()) + ``` + Note: case-insensitive by default in AgentV. If `case_sensitive: true`, use exact match. +- **PASS:** substring found. **FAIL:** substring not found. + +#### `contains-any` + +- **Fields:** `value` (string[], required) +- **Recipe:** + ``` + value.some(v => response.toLowerCase().includes(v.toLowerCase())) + ``` +- **PASS:** at least one substring found. + +#### `contains-all` + +- **Fields:** `value` (string[], required) +- **Recipe:** + ``` + value.every(v => response.toLowerCase().includes(v.toLowerCase())) + ``` +- **PASS:** all substrings found. + +#### `icontains` / `icontains-any` / `icontains-all` + +Same as contains variants but explicitly case-insensitive. + +#### `equals` + +- **Fields:** `value` (string, required) +- **Recipe:** + ``` + response.trim() === value.trim() + ``` +- **PASS:** exact match after trimming. 
+ +#### `regex` + +- **Fields:** `value` (string, required — a regex pattern) +- **Recipe:** + ``` + new RegExp(value).test(response) + ``` +- **PASS:** pattern matches. + +#### `starts-with` + +- **Fields:** `value` (string, required) +- **Recipe:** + ``` + response.startsWith(value) + ``` + (or case-insensitive variant) +- **PASS:** response starts with value. + +#### `ends-with` + +- **Fields:** `value` (string, required) +- **Recipe:** + ``` + response.endsWith(value) + ``` + (or case-insensitive variant) +- **PASS:** response ends with value. + +#### `is-json` + +- **Fields:** none required +- **Recipe:** + ``` + try { JSON.parse(response); return true } catch { return false } + ``` +- **PASS:** response is valid JSON. **FAIL:** parse error. + +#### `field-accuracy` + +- **Fields:** `expected` (object, required — JSON object with field paths and expected values) +- **Recipe:** Parse response as JSON. For each field path in `expected`, check if the value matches. +- **PASS:** all fields match. Partial score = `matched_fields / total_fields`. + +### Metric assertions (require timing.json) + +#### `latency` + +- **Fields:** `threshold` (number, required — max duration in ms) +- **Recipe:** Read `timing.json`. Compare `duration_ms` against threshold. +- **PASS:** `duration_ms <= threshold`. + +#### `cost` + +- **Fields:** `threshold` (number, required — max cost in USD) +- **Recipe:** Read timing/token data. Compare cost against threshold. +- **PASS:** `cost <= threshold`. + +#### `token-usage` + +- **Fields:** `threshold` (number, required — max tokens) +- **Recipe:** Read `timing.json`. Compare `total_tokens` against threshold. +- **PASS:** `total_tokens <= threshold`. + +#### `execution-metrics` + +- **Fields:** Various threshold fields for tool calls, output chars, etc. +- **Recipe:** Read timing.json, compare each metric against its threshold. 

### Tool inspection assertions

#### `tool-trajectory`

- **Fields:** `expected` (array of expected tool calls), `mode` (string: `exact` | `contains` | `order`)
- **Recipe:** Inspect transcript for tool call sequence. Match against expected based on mode.
- **PASS:** tool calls match expected pattern per mode.

#### `skill-trigger`

- **Fields:** `skill` (string, required), `should_trigger` (boolean, optional, default `true`)
- **Recipe:** Check if the agent invoked the named skill in its tool calls.
- **PASS:** skill was triggered (inverted when `should_trigger: false`).

### LLM-judged assertions (require Claude reasoning)

#### `llm-grader`

- **Fields:** `prompt` (string, required — either inline text or path to .md file)
- **Recipe:** Read the prompt. Evaluate the response against the criteria using your own reasoning. Produce score (0.0-1.0) with evidence.
- **PASS:** score >= 0.5 (configurable via `threshold`).

#### `rubric` / `rubrics`

- **Fields:** `rubric_items` or `criteria` (array of rubric items with descriptions and weights)
- **Recipe:** For each rubric item, evaluate the response. Score each item 0.0-1.0. Aggregate as weighted average.
- **PASS:** aggregate score >= threshold.

### Script-based assertions

#### `code-grader`

- **Fields:** `path` (string, required — path to script), `command` (string[], optional — custom command)
- **Script SDK:** Use `defineCodeGrader` from `@agentv/eval`:
  ```typescript
  import { defineCodeGrader } from '@agentv/eval';
  export default defineCodeGrader(({ outputText, trace }) => ({
    score: outputText.includes('expected') ? 1 : 0,
    assertions: [{ text: 'Contains expected', passed: outputText.includes('expected') }],
  }));
  ```
- **Recipe:** The CLI runs the script, passing context as JSON on stdin (`{output, outputText, input, inputText, ...}`). Script returns `{"score": N, "assertions": [...]}`
- **PASS:** score >= 0.5 (or as configured). 
+ +### Composite assertion + +#### `composite` + +- **Fields:** `assertions` (array of sub-assertions), `aggregation` (string: `weighted_average` | `min` | `max` | `all_pass`) +- **Recipe:** Evaluate each sub-assertion. Aggregate scores per aggregation mode. +- **PASS:** depends on aggregation mode. + +## 3. Negate Support + +When `negate: true` is set on any assertion, invert the pass/fail result: + +- A passing check becomes a failure +- A failing check becomes a pass +- Score is inverted: `1.0 - score` + +## 4. Common Assertion Fields + +All assertion types support: + +- `name` (string, optional) — human-readable name +- `type` (string, required) — the assertion type +- `weight` (number, optional, default 1.0) — weight in score aggregation +- `negate` (boolean, optional) — invert result +- `threshold` (number, optional) — minimum score to pass (for LLM types) + +## 5. AgentV JSONL Output Format + +Each line in the results JSONL file is an `EvaluationResult` object. In JSONL, field names use snake_case (applied by `toSnakeCaseDeep()`). 
+ +### Required fields + +- `timestamp` (string, ISO-8601) +- `test_id` (string) +- `score` (number, 0.0-1.0, weighted average of all assertion scores) +- `assertions` (array of `{text, passed, evidence?}`) +- `output` (Message[]) — agent output messages +- `execution_status` (string: `ok` | `quality_failure` | `execution_error`) + +### Optional fields + +- `scores` (array of EvaluatorResult) — per-grader breakdown +- `input` (Message[]) — input messages +- `token_usage` (object: `{prompt_tokens, completion_tokens, total_tokens}`) +- `cost_usd` (number) +- `duration_ms` (number) +- `target` (string) +- `eval_set` (string) +- `error` (string) +- `file_changes` (string — unified diff) +- `mode` (string — `agent` for agent mode) + +### `scores[]` entries (EvaluatorResult) + +- `name` (string) — grader name +- `type` (string) — grader kind (kebab-case) +- `score` (number, 0.0-1.0) +- `assertions` (array of `{text, passed, evidence?}`) +- `weight` (number, optional) +- `verdict` (string: `pass` | `fail` | `skip`) +- `details` (object, optional — structured data from code graders) +- `reasoning` (string, optional) + +## 6. Eval Set Support + +An eval_set references multiple eval.yaml files: + +```yaml +# eval_set.yaml +eval_set: + - path: ./basic.eval.yaml + - path: ./advanced.eval.yaml +``` + +Process each file's tests independently, then aggregate results. + +## 7. Agent-Mode Pipeline CLI Commands + +These CLI subcommands break the monolithic `eval run` into discrete steps for agent-mode execution. The agent handles LLM grading between steps. + +### `agentv pipeline input --out ` + +Extracts inputs, target commands, and grader configs from an eval YAML file. 

**Output structure:**
```
<out-dir>/
├── manifest.json
├── <test-id>/
│   ├── input.json               ← {input, input_files, metadata}
│   ├── invoke.json              ← {kind, command?, cwd?, timeout_ms?}
│   ├── criteria.md              ← human-readable success criteria
│   ├── expected_output.json     ← (if present)
│   ├── code_graders/<name>.json ← {name, command, weight, config?}
│   └── llm_graders/<name>.json  ← {name, weight, threshold?, prompt_content}
```

**`manifest.json` format:**
```json
{
  "eval_file": "path/to/eval.yaml",
  "timestamp": "2026-03-24T...",
  "target": {"name": "target-name", "kind": "cli", "subagent_mode_allowed": false},
  "test_ids": ["test-01", "test-02"]
}
```

**`invoke.json` kinds:**
- `kind: "cli"` — has `command`, `cwd`, `timeout_ms`. Use the command to run the target.
- `kind: "agent"` — non-CLI provider. Check `manifest.json` `target.subagent_mode_allowed` to decide whether to dispatch executor subagents or fall back to `agentv eval` CLI.

### `agentv pipeline grade <dir>`

Runs code-grader assertions against `response.md` files in each test directory.

**Prerequisites:** `pipeline input` has been run and `response.md` exists in each test dir.

**Output:** `<test-dir>/code_grader_results/<name>.json` for each code grader, containing:
```json
{
  "name": "grader-name",
  "type": "code-grader",
  "score": 1.0,
  "weight": 1.0,
  "assertions": [{"text": "...", "passed": true}]
}
```

### `agentv pipeline bench <dir>`

Merges code-grader results with LLM grader scores and produces final artifacts.

LLM grader results are read from disk at `<test-dir>/llm_grader_results/<name>.json` per test. 
+ +**LLM grader result file format** (`llm_grader_results/.json`): +```json +{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } +``` + +**Output:** +- `/grading.json` — merged grading with `graders`, `assertions`, `summary.pass_rate` +- `index.jsonl` — one JSON line per test: `{test_id, score, pass, graders: [...]}` +- `benchmark.json` — aggregate stats: `{metadata: {targets}, run_summary: {: {mean, stddev, n}}}` + +### Agent-Mode Workflow + +``` +1. agentv pipeline input eval.yaml --out ./export +2. (Agent runs targets or reads response.md) +3. agentv pipeline grade ./export +4. (Agent does LLM grading, produces scores JSON) +5. echo '' | agentv pipeline bench ./export +``` + +--- references/migrating-from-skill-creator.md --- +# Migrating from Skill-Creator to AgentV Lifecycle Skill + +This reference covers how to use AgentV's unified agent-evaluation lifecycle skill (`agentv-bench`) with evals.json files originally created for Anthropic's skill-creator. + +## Drop-in Replacement + +AgentV runs skill-creator's evals.json directly — no conversion required: + +```bash +# Run evals.json with AgentV +agentv eval evals.json + +# Or run a single assertion offline (no API keys) +agentv eval assert --agent-output "..." --agent-input "..." +``` + +AgentV automatically: +- Promotes `prompt` → input messages +- Promotes `expected_output` → reference answer +- Converts `assertions` → LLM-grader graders +- Resolves `files[]` paths relative to the evals.json directory + +If you're using the `agentv-bench` skill, it orchestrates these same AgentV commands. Code graders, grading, and artifact generation remain in AgentV core; the skill just orchestrates and summarizes the existing outputs. 
+ +## What You Gain + +Moving from skill-creator's eval loop to AgentV's lifecycle skill gives you: + +| Capability | skill-creator | AgentV lifecycle skill | +|-----------|---------------|----------------------| +| Workspace isolation | ❌ | ✅ Clone repos, run setup/teardown scripts | +| Code graders | ❌ | ✅ Python/TypeScript grader scripts via `defineCodeGrader()` | +| Tool trajectory scoring | ❌ | ✅ Evaluate tool call sequences | +| Multi-provider comparison | with-skill vs without-skill | N-way: Claude, GPT, Copilot, Gemini, custom CLI | +| Multi-turn evaluation | ❌ | ✅ Conversation tracking with `conversation_id` | +| Blind comparison | ❌ | ✅ Judge doesn't know which is baseline | +| Deterministic upgrade suggestions | ❌ | ✅ LLM-grader → contains/regex/is-json | +| Human review checkpoint | ❌ | ✅ Structured feedback gate | +| Workspace file tracking | ❌ | ✅ Evaluate by diffing workspace files | +| Agent mode (no API keys) | ❌ | ✅ Uses grader agent in agent mode | + +## Artifact Compatibility + +AgentV's companion artifacts are compatible with skill-creator's eval-viewer: + +| Artifact | Format | Compatible with eval-viewer | +|----------|--------|---------------------------| +| `/grading.json` | Per-assertion evidence with claims | ✅ Superset of skill-creator's per-test grading format | +| `benchmark.json` | Aggregate pass rates, timing, patterns | ✅ Superset of Agent Skills benchmark format | +| Results JSONL | Per-test results | ✅ Standard JSONL format | + +AgentV's schemas are supersets — they include all fields skill-creator expects, plus additional fields (claims extraction, pattern analysis, deterministic upgrade candidates). Tools that read skill-creator artifacts will read AgentV artifacts correctly, ignoring the extra fields. 
+ +The optimizer scripts layer reads those same artifacts directly: +- `aggregate-benchmark.ts` consumes `benchmark.json`, `timing.json`, and results JSONL +- `generate-report.ts` and `eval-viewer/generate-review.ts` render review output from AgentV artifacts +- `improve-description.ts` proposes follow-up experiments from benchmark/grading observations + +## Graduating to EVAL.yaml + +When evals.json becomes limiting, convert to EVAL.yaml for the full feature set: + +```bash +# Convert evals.json to EVAL.yaml +agentv convert evals.json + +# Edit the generated YAML to add workspace config, code graders, etc. +# Then run with the full lifecycle +agentv eval eval.yaml +``` + +EVAL.yaml unlocks: +- **Workspace setup/teardown** — clone repos, install dependencies, clean up after tests +- **Code graders** — write graders in Python or TypeScript, not just LLM prompts +- **Rubric-based grading** — multi-dimensional scoring with weighted criteria +- **Retry policies** — automatic retries for flaky tests with configurable backoff +- **Test groups** — organize tests by category with shared config +- **Multi-turn conversations** — test agent interactions across multiple turns + +## What Stays in Skill-Creator + +AgentV does NOT replace these skill-creator capabilities: + +- **Trigger optimization** — optimizing when/how a skill is triggered +- **.skill packaging** — bundling skills for distribution +- **Skill authoring** — creating new SKILL.md files from scratch +- **Skill discovery** — finding and installing skills + +AgentV focuses on the **evaluation and optimization loop**. Skill-creator focuses on **skill authoring and packaging**. They are complementary — use skill-creator to write the skill, use AgentV to evaluate and optimize it. + +## Example Workflow + +``` +1. Author a skill with skill-creator +2. skill-creator generates evals.json +3. 
Run evals.json through AgentV's lifecycle skill for richer evaluation: + - Workspace isolation (test in a real repo) + - Multi-provider comparison (does the skill work with GPT too?) + - Blind comparison (is the new version actually better?) + - Deterministic upgrades (replace vague LLM graders with precise checks) +4. Use AgentV's optimization loop to refine the skill's prompts +5. Return to skill-creator for packaging and distribution +``` + +--- references/schemas.md --- +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "assertions": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].assertions`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. 
+ +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "assertion_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "assertion_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "assertion_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].assertion_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +**Important:** The `assertions` array must use the fields `text`, `passed`, and `evidence` — downstream tooling depends on these exact field names. + +```json +{ + "assertions": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `assertions[]`: Graded assertion results with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. 
+ +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "assertions": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction assertions", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 
3)
+- `runs[]`: Individual run results
+  - `eval_id`: Numeric eval identifier
+  - `eval_name`: Human-readable eval name (used as section header in the viewer)
+  - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding)
+  - `run_number`: Integer run number (1, 2, 3...)
+  - `result`: Nested object with `pass_rate`, `passed`, `failed`, `total`, `time_seconds`, `tokens`, `tool_calls`, `errors`
+- `run_summary`: Statistical aggregates per configuration
+  - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields
+  - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"`
+- `notes`: Freeform observations from the analyzer
+
+**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually.
+
+---
+
+## comparison.json
+
+Output from blind comparator. Located at `<run_dir>/comparison-N.json`.
+
+```json
+{
+  "winner": "A",
+  "reasoning": "Output A provides a complete solution with proper formatting and all required fields.
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "assertions": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. 
+ +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` + +--- references/subagent-pipeline.md --- +# Subagent Pipeline — Running eval.yaml without CLI + +This reference documents the detailed procedure for running evaluations in subagent mode +(`AGENT_EVAL_MODE=subagent`, the default). The orchestrating skill dispatches `executor` +subagents to perform test cases and `grader` subagents to evaluate outputs. + +Read this reference when executing Step 3 (Run and Grade) in subagent mode. 
+ +## Prerequisites + +- The eval.yaml file exists and contains valid test definitions +- `agentv` CLI is installed (or run from source via `AGENTV_CLI=bun /path/to/cli.ts` in `.env`) +- Read `references/eval-yaml-spec.md` for the full schema + +## Workspace Context + +Some evals pass prompt files directly and don't require a specific workspace — those run fine +from anywhere. But evals that test agent behavior in a workspace (accessing skills, modifying +repos, using tools across multiple repos) require the user to be in the **target workspace** +(e.g., a multi-repo workspace set up by allagents). If the eval references workspace files or +expects the agent to use skills, check that the current directory is the target workspace, not +just the eval repo — and warn the user if it's wrong. + +## Executor Subagent Eligibility + +All providers except `cli` are eligible for executor subagents by default. To opt out a +specific target, set `subagent_mode_allowed: false` in `.agentv/targets.yaml`: + +```yaml +# .agentv/targets.yaml +targets: + - name: my-target + provider: openai + model: ${{ OPENAI_MODEL }} + api_key: ${{ OPENAI_API_KEY }} + subagent_mode_allowed: false # forces CLI invocation instead of executor subagent +``` + +When `subagent_mode_allowed: false`, the target falls back to CLI invocation via `agentv eval` +even in subagent mode. + +## CLI Targets: Single Command + +For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and +code grading in one step. When `--out` is omitted, the output directory defaults to +`.agentv/results/runs/` (same convention as `agentv eval`): + +```bash +# Extract inputs and invoke all CLI targets in parallel: +agentv pipeline run evals/repro.eval.yaml + +# Also run code graders inline (instead of using pipeline grade separately): +agentv pipeline run evals/repro.eval.yaml --grader-type code +``` + +By default, `pipeline run` extracts inputs and invokes targets only. 
Pass `--grader-type code`
+to also run code-graders inline, or use `agentv pipeline grade <run_dir>` as a separate step.
+
+The run directory is printed to stdout. Then continue to the grading and merge phases
+described in SKILL.md Step 3.
+
+## Non-CLI Targets: Executor Subagents
+
+When the target provider is not `cli`, check `manifest.json` → `target.subagent_mode_allowed`.
+If `true` (default for all non-CLI providers), the subagent IS the target. If `false` (user
+opted out via `subagent_mode_allowed: false` in `.agentv/targets.yaml`), fall back to
+`agentv eval` CLI mode instead.
+
+### Step 1: Extract inputs
+
+```bash
+# Defaults to .agentv/results/runs/
+agentv pipeline input evals/repro.eval.yaml
+```
+
+This creates a run directory with per-test `input.json`, `invoke.json`,
+`criteria.md`, and grader configs.
+
+### Step 2: Dispatch executor subagents
+
+Read `agents/executor.md`. Launch one `executor` subagent **per test case**, all in parallel.
+Each subagent receives the test directory path, reads `input.json`, performs the task using
+its own tools, and writes `response.md`.
+
+Example: 5 tests = 5 executor subagents launched simultaneously.
+
+```
+# Per executor subagent:
+# - Reads <eval_name>/<test_id>/input.json
+# - Performs the task
+# - Writes <eval_name>/<test_id>/response.md
+```
+
+### Step 3 onward: Grade and merge
+
+See SKILL.md Step 3 "Grading" section for the three-phase grading process (code graders →
+LLM grading → merge and validate).
+
+## Step-by-Step Fine-Grained Control (CLI targets)
+
+Use individual commands when you need control over each step with CLI targets:
+
+```bash
+# Step 1: Extract inputs (defaults to .agentv/results/runs/)
+agentv pipeline input evals/repro.eval.yaml
+
+# Step 2: run_tests.py invokes CLI targets (or use pipeline run instead)
+
+# Step 3: Run code graders
+agentv pipeline grade <run_dir>
+
+# Step 4: Subagent does LLM grading, writes results to llm_grader_results/<grader_id>.json per test
+
+# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard)
+agentv pipeline bench <run_dir>
+
+# Step 6: Validate
+agentv results validate <run_dir>
+```
+
+## LLM Grading JSON Format
+
+The agent reads `llm_graders/<grader_id>.json` for each test, grades the response using the prompt
+content, and produces a scores JSON:
+
+```json
+{
+  "test-01": {
+    "relevance": {
+      "score": 0.85,
+      "assertions": [{"text": "Response is relevant", "passed": true, "evidence": "..."}]
+    }
+  }
+}
+```
+
+## Pipeline Bench and Dashboard
+
+`pipeline bench` merges LLM scores into `index.jsonl` with a full `scores[]` array per entry,
+matching the CLI-mode schema. The web dashboard (`agentv results serve`) reads this format
+directly — no separate conversion script is needed. Run `agentv results validate <run_dir>`
+to verify compatibility.
+
+## Output Structure
+
+The path hierarchy mirrors the CLI mode: `<eval_name>` comes from the `name` field in
+the eval.yaml. The target is recorded in `manifest.json` — one run = one target.
+
+```
+.agentv/results/runs/<timestamp>/<run_id>/
+├── manifest.json          ← eval metadata, target, test_ids
+├── index.jsonl            ← per-test scores
+├── benchmark.json         ← aggregate statistics
+└── <eval_name>/           ← eval.yaml "name" field, or eval file basename if absent (same as CLI mode)
+    └── <test_id>/         ← test case id
+        ├── input.json     ← test input text + messages
+        ├── invoke.json    ← target command or agent instructions
+        ├── criteria.md    ← grading criteria
+        ├── response.md    ← target/agent output
+        ├── timing.json    ← execution timing
+        ├── code_graders/<grader_id>.json ← grader configs written by `pipeline input`: code-grader scripts AND built-in types (contains, regex, equals, etc.)
+        ├── llm_graders/<grader_id>.json ← LLM grader configs
+        ├── code_grader_results/<grader_id>.json ← code grader results
+        ├── llm_grader_results/<grader_id>.json ← LLM grader results (written by grader subagents; one file per grader)
+        └── grading.json   ← merged grading (written by `pipeline bench` — do NOT write here directly)
+```
diff --git a/evals/self/skills/fixtures/agentv-bench.txt b/evals/self/skills/fixtures/agentv-bench.txt
new file mode 100644
index 000000000..67df5b625
--- /dev/null
+++ b/evals/self/skills/fixtures/agentv-bench.txt
@@ -0,0 +1,428 @@
+---
+name: agentv-bench
+description: >-
+  Run AgentV evaluations and optimize agents through eval-driven iteration.
+  Triggers: run evals, benchmark agents, optimize prompts/skills against evals, compare
+  agent outputs across providers, analyze eval results, offline evaluation of recorded sessions,
+  run autoresearch, optimize unattended, run overnight optimization loop.
+  Not for: writing/editing eval YAML without running (use agentv-eval-writer),
+  analyzing existing traces/JSONL without re-running (use agentv-trace-analyst).
+---
+
+# AgentV Bench
+
+
+A skill for evaluating agents and iteratively improving them through data-driven optimization.
+ +At a high level, the process goes like this: + +- Understand what the agent does and what "good" looks like +- Write evaluation test cases (EVAL.yaml or evals.json) +- Run the agent on those test cases, grade the outputs +- Analyze the results — what's working, what's failing, and why +- Improve the agent's prompts/skills/config based on the analysis +- Repeat until you're satisfied + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress. Maybe they want to start from scratch — help them write evals, run them, and iterate. Maybe they already have results — jump straight to analysis and improvement. + +Be flexible. If the user says "I don't need a full benchmark, just help me debug this failure", do that instead. + +After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see `references/description-optimization.md`). + +## Communicating with the user + +This skill is used by people across a wide range of familiarity with evaluation tooling. Pay attention to context cues: + +- "evaluation" and "benchmark" are borderline but OK in most cases +- For "YAML", "grader", "assertion", "deterministic judge" — see serious cues from the user that they know what those mean before using them without explanation +- Briefly explain terms if in doubt + +When presenting results, default to summary tables. Offer detail on request. In CI/headless mode, skip interactive prompts and exit with status codes. + +--- + +## Step 1: Understand the Agent + +Before running or optimizing, understand what you're working with. + +1. **Read the agent's artifacts** — prompts, skills, configs, recent changes. Understand the full picture: what tools are available, what the expected input/output looks like, what constraints exist. + +2. **Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? 
Talk to the user if this isn't clear from the artifacts alone. + +3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what grader types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail. + +4. **Challenge assumptions** — if evals already exist, review their quality before running: + - Are the test cases testing the right things? + - Are assertions specific enough to catch real failures? + - Are there ambiguous or contradictory test cases? + - Flag eval issues before proceeding — running bad evals wastes time. + +5. **Check integrity** — ensure task prompts (what the agent receives) are not also used as grader prompts (how outputs are scored). If a prompt file appears in both locations, note the overlap and optimize only for the task purpose. + +--- + +## Step 2: Write Evaluations + +AgentV supports two evaluation formats: + +**EVAL.yaml** (native, full features) — supports workspaces, code graders, multi-turn conversations, tool trajectory scoring, workspace file tracking, multi-provider targets. Use this for agent evaluation. + +```yaml +# example.eval.yaml +tests: + - id: basic-code-review + input: "Review this TypeScript file for bugs and suggest improvements" + criteria: "Identifies the null pointer bug on line 12 and suggests a fix" + assertions: + - type: contains + value: "null" + - Review identifies the null pointer bug and suggests a concrete fix + +workspace: + template: ./workspace-template + hooks: + before_each: + reset: fast +``` + +Multi-skill evaluation is handled naturally via input messages — describe the task in the test input, and the agent uses whatever skills it needs. 
+ +**evals.json** (skill-creator compatible) — auto-promoted to EVAL-equivalent format: +- `prompt` → input messages +- `expected_output` → reference answer +- `assertions` → graders +- `files[]` paths resolved relative to the evals.json location + +```json +{ + "skill_name": "my-agent", + "evals": [ + { + "id": 1, + "prompt": "User's task prompt", + "expected_output": "Description of expected result", + "assertions": ["Output includes error handling", "Uses async/await"] + } + ] +} +``` + +### Writing good test cases + +Start with 2-3 realistic test cases — the kind of thing a real user would actually say. Share them with the user before running: "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" + +Good assertions are objectively verifiable and have descriptive names. Subjective quality ("the output is good") is better evaluated qualitatively — don't force assertions onto things that need human judgment. + +**Grader types** (cheapest to most expensive): `exact`, `contains`, `regex`, `is-json`, `field-accuracy`, `composite`, `code-grader`, `tool-trajectory`, `llm-grader`. See `references/eval-yaml-spec.md` for full config and grading recipes for each type. + +Prefer deterministic graders over LLM graders whenever possible. If an assertion can be checked with `contains` or `regex`, don't use `llm-grader`. + +--- + +## Step 3: Run and Grade + +This section is one continuous sequence — don't stop partway through. + +Each run produces a new `.agentv/results/runs//` directory automatically. Use timestamps to identify iterations when comparing runs. + +### Choosing a run mode + +**User instruction takes priority.** If the user says "run in subagent mode", "use subagent mode", or "use CLI mode", use that mode directly. + +If the user has not specified a mode, default to `subagent`. 
+
+| `AGENT_EVAL_MODE` | Mode | How |
+|----------------------|------|-----|
+| `subagent` (default) | **Subagent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. |
+| `cli` | **AgentV CLI** | `agentv eval <eval-file>` — end-to-end, multi-provider |
+
+Set `AGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. If absent, default to `subagent`. **User instruction always overrides this.**
+
+**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. Read `references/subagent-pipeline.md` for the detailed procedure.
+
+**`cli`** — AgentV CLI handles execution, grading, and artifact generation end-to-end. Works with all providers. Use when you need multi-provider benchmarking or CLI-specific features.
+
+### Running evaluations
+
+**AgentV CLI mode** (end-to-end, EVAL.yaml):
+```bash
+agentv eval <eval-file> --output .agentv/artifacts/
+```
+
+**Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below.
+
+**Spawn all runs in the same turn.** For each test case that needs both a "with change" and a "baseline" run, launch them simultaneously. Don't run one set first and come back for the other — launch everything at once so results arrive around the same time.
+ +**Multi-target benchmarking:** +```bash +agentv eval --target claude --target gpt --target copilot +``` + +**Baseline strategy:** +- **New agent**: baseline is "no prompt" or minimal prompt — same eval, no agent-specific configuration +- **Improving existing**: snapshot the current version before editing (`cp -r /prompt-snapshot/`), use as baseline throughout +- **Multi-target**: each target is its own baseline — no need for a separate "without" run + +### While runs are in progress, draft graders + +Don't just wait for runs to finish — use this time productively. If assertions don't exist yet, draft them now. If they exist, review them and explain what they check to the user. + +Good assertions are *discriminating* — they pass when the agent genuinely succeeds and fail when it doesn't. An assertion that passes for both good and bad outputs is worse than no assertion. + +### As runs complete, capture timing data + +When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory. See `references/schemas.md` for the timing.json schema. + +This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives. + +### Grading + +**In CLI mode**, `agentv eval` handles all grading end-to-end — no manual phases needed. + +**In subagent mode**, grading has three phases. **All three are required — do not stop after phase 1.** + +**Phase 1: Code graders** (deterministic, zero-cost) + +```bash +agentv pipeline grade +``` + +This evaluates all deterministic assertions against `response.md` files. 
Two types are handled: +- **`code-grader` scripts** — external scripts executed against the response (arbitrary logic, any language) +- **Built-in assertion types** — evaluated in-process: `contains`, `contains-any`, `contains-all`, `icontains`, `regex`, `equals`, `starts-with`, `ends-with`, `is-json`, and variants + +Both types are configured by `pipeline input` into `code_graders/.json` and graded by `pipeline grade`. Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline. + +**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1. + +**Phase 2: LLM grading** (semantic — do NOT skip this phase) + +Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading. +Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. + +**Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair. + +**Before dispatching graders, read `agents/grader.md` and embed its full content as the system instructions in every grader subagent prompt.** The grader is a `general-purpose` task agent — there is no auto-resolved "grader" type. Without `agents/grader.md` embedded verbatim, the subagent has no grading process, no output format, and no file-path knowledge, and will produce empty or incorrect output. + +Each grader subagent (operating under `agents/grader.md` instructions): +1. Reads `/llm_graders/.json` for the grading prompt +2. 
Reads `<test_dir>/response.md` for the candidate output
+3. Grades the response against the prompt criteria
+4. **Writes its result to disk**: `<run_dir>/<eval_name>/<test_id>/llm_grader_results/<grader_id>.json`
+5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator
+
+**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/<grader_id>.json` makes grading resumable and assertion evidence durable.
+
+The result file format is:
+```json
+{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] }
+```
+
+After **all** grader subagents complete, run Phase 3 directly.
+
+**Phase 3: Merge and validate**
+
+```bash
+agentv pipeline bench <run_dir>
+agentv results validate <run_dir>
+```
+
+`pipeline bench` reads LLM grader results from `llm_grader_results/<grader_id>.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`.
+
+> **Diagnosing `pass_rate=0`:** If `pipeline bench` reports `pass_rate=0` across the board, do **not** assume the tests genuinely failed. First verify the grading pipeline ran correctly: check that `<test_dir>/llm_grader_results/<grader_id>.json` exists and is non-empty for each test. If these files are absent or empty, the grader subagents failed to produce output (most common cause: `agents/grader.md` was not embedded in the subagent prompts — see Phase 2). Treat `pass_rate=0` as a real signal only after confirming grader results exist.
+
+### Artifacts
+
+All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure.
Key artifacts per run: +- **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary +- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` +- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` + +Write artifacts to `.agentv/artifacts/` or the iteration directory. + +### Workspace features (EVAL.yaml only) + +- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) +- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) +- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support +- **File change tracking** — grade by diffing workspace files before/after agent execution + +--- + +## Step 4: Analyze Results + +Once all runs are graded, analyze the results before attempting improvements. + +### Pattern analysis + +Read the JSONL results and look for: + +- **Always-pass tests** — assertion too loose or non-discriminating. If it passes for both good and bad outputs, it's not testing anything. +- **Always-fail tests** — task impossible, eval broken, or assertion misconfigured. Don't optimize against broken evals. +- **Flaky tests** — non-deterministic results across runs. Investigate before treating failures as real. +- **Systematic failures** — same failure pattern across multiple tests. This usually points to a missing instruction or wrong approach. +- **Deterministic upgrade candidates** — `llm-grader` assertions that could be replaced with `contains`, `regex`, or `is-json` (cheaper, faster, more reliable). + +### Dispatch subagents + +- **Dispatch `analyzer`** (read `agents/analyzer.md`) for a structured quality audit: deterministic upgrade suggestions, weak assertion detection, cost/quality flags, and benchmark pattern analysis. + +- **Dispatch `comparator`** (read `agents/comparator.md`) for blind N-way comparison between iterations or targets. 
The comparator blinds provider identities, generates task-specific rubrics, scores each output, then unblinds and attributes improvements. + +### Trace analysis + +Use CLI tools for deeper investigation: +```bash +agentv inspect # Detailed execution trace inspection +agentv compare # Structured diff between runs +``` + +Look for: tool call patterns, error recovery behavior, conversation flow, wasted steps. + +### Present results to the user + +Show a summary table: + +``` +| Test ID | Score | Pass/Fail | Delta | Notes | +|------------------|-------|-----------|-------|--------------------------| +| basic-code-review| 0.85 | ✓ PASS | +0.15 | Found the bug this time | +| edge-case-empty | 0.00 | ✗ FAIL | — | Crashed on empty input | +``` + +Highlight: +- Current pass rate and delta from baseline +- Comparison results (which target/iteration won and why) +- Analyst observations the aggregate stats would hide + +Ask: "How does this look? Anything you'd change about the evals or the approach?" + +--- + +## Step 5: Improve + +This is the heart of the loop. You've run the test cases, analyzed the results, and now you need to make the agent better. + +### How to think about improvements + +1. **Generalize from the analysis.** You're iterating on a small eval set, but the agent will be used on many different inputs. Don't overfit to specific test cases. Rather than fiddly patches or oppressively rigid MUSTs, try different approaches and see what works. It's cheap to experiment. + +2. **Keep the prompt lean.** Read the execution transcripts, not just the final outputs. If the agent wastes time on unproductive steps, remove the instructions causing that. If it always ignores a section, that section isn't pulling its weight. + +3. **Explain the why.** Today's LLMs are smart. They have good theory of mind and can go beyond rote instructions when given good reasoning. 
If you find yourself writing ALWAYS or NEVER in all caps, that's a yellow flag — reframe as an explanation of why the thing matters. That's more humane, powerful, and effective. + +4. **Look for repeated work.** Read the transcripts from test runs and notice if the agent independently takes the same multi-step approach to something across cases. If all test runs result in writing the same helper script, bundle it. If every run makes the same mistake, the instruction is missing or unclear. + +### Applying changes + +- **Surgical edits**: ADD (new rule for a missing constraint), UPDATE (refine for clarity), DELETE (remove redundant or harmful rules), NEGATIVE CONSTRAINT (explicitly state what NOT to do) +- **One change per iteration** to isolate effects. If you change three things and the score improves, you don't know which change helped. +- **Variant tracking**: When a change helps some tests but hurts others, maintain 2-3 prompt variants. Compare variants to find the best overall approach before converging. +- **When converging**: Generalize specific patches into broad principles. Remove redundancy and contradictions. Ensure the prompt is clear, focused, and under 200 lines. + +### Evaluation integrity + +**Critical**: Only optimize **task prompts** (what the agent receives), never **judge prompts** (how graders score outputs). Modifying judge prompts games the evaluation without improving the agent. + +If a prompt file is referenced in both task input and grader configs, optimize for the task purpose only. Document which prompts were modified in the optimization log. + +### The iteration loop + +After improving: + +1. Apply your changes to the agent's prompts/skills/config +2. Re-run all test cases (agentv creates a new `.agentv/results/runs//` directory automatically) +3. Compare against the previous iteration (Step 4). 
If running in automated mode, use the **automated keep/discard** logic below instead of manual judgment — it will decide whether to keep or revert the change for you. +4. Present results to the user (or log the decision if running automated keep/discard) +5. Stop when ANY of: + - The user says they're happy + - Feedback is all empty (everything looks good) + - You're not making meaningful progress (no improvement for 2 consecutive iterations) + - Target pass rate is reached + - Maximum iterations exhausted + +**Human checkpoints**: At iterations 3, 6, and 9, always present progress to the user regardless of automation settings. Push back if optimization is accumulating contradictory rules or overfitting to specific test cases. + +### Automated keep/discard + +For autonomous iteration, use `agentv compare --json` to automatically decide whether to keep or discard each change based on wins/losses/ties. Read `references/autoresearch.md` for the full decision rules, logging format, and integration with the iteration loop. + +--- + +## Entering Mid-Lifecycle + +Users can start at any step by providing existing data: + +| Entry point | Required input | Example prompt | +|------------|---------------|----------------| +| Step 1 (Understand) | `eval-path` | "Optimize my agent against evals/support.yaml" | +| Step 2 (Write Evals) | Agent artifacts | "Write evals for this agent" | +| Step 3 (Run + Grade) | `eval-path` | "Run this eval and show me results" | +| Step 4 (Analyze) | `results-path` | "Analyze why my agent is failing on these results" | +| Step 5 (Improve) | Analysis + strategy | "Apply these optimization suggestions" | + +When entering mid-lifecycle, run only the requested step and subsequent steps. Don't re-run earlier steps unless the user requests a full loop. + +--- + +## Advanced: Blind Comparison + +For situations where you want a rigorous comparison between two versions (e.g., "is the new version actually better?"), dispatch the `comparator` subagent. 
It blinds identities, generates task-specific rubrics, scores outputs, then unblinds and explains why the winner won. + +This is optional and requires subagents. The human review loop is usually sufficient. + +--- + +## Description Optimization + +After the agent is working well, offer to optimize the skill's `description` field for better triggering accuracy. Read `references/description-optimization.md` for the full procedure (generate trigger EVAL.yaml, review with user, iterate, apply). + +--- + +## Autoresearch Mode + +Autoresearch is an unattended eval-improve loop that runs multiple optimize cycles without human intervention. The user triggers it with natural language (e.g., "run autoresearch on this skill", "optimize this skill unattended"). It uses the mutator subagent (`agents/mutator.md`) to rewrite artifacts based on failure analysis, and automated keep/discard to decide whether to keep or revert each change. + +Read `references/autoresearch.md` for the full procedure (prerequisites, artifact layout, keep/discard rules, the step-by-step loop, convergence criteria, and context hygiene). + +--- + +## Environment Adaptation + +For provider-specific notes (Copilot, Codex, Claude SDK, custom CLI), CI/headless mode behavior, and fallback strategies when subagents aren't available, read `references/environment-adaptation.md`. + +--- + +## Subagent Reference + +The `agents/` directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent. 
+ +| Agent | File | Purpose | When to dispatch | +|-------|------|---------|-----------------| +| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | +| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading — one per test × LLM grader pair) | +| comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | +| analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | +| mutator | `agents/mutator.md` | Rewrite artifact from failure analysis | Step 5 (autoresearch — dispatched per cycle) | + +The `references/` directory has additional documentation: +- `references/autoresearch.md` — Autoresearch unattended optimization loop and automated keep/discard rules +- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes +- `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure +- `references/description-optimization.md` — Skill description optimization workflow +- `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior +- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.) +- `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator + +--- + +Repeating the core loop for emphasis: + +- Understand what the agent does +- Write evaluation test cases +- Run the agent and grade outputs +- Analyze results — surface patterns, dispatch analyst and comparator subagents +- Improve the agent based on analysis +- Repeat until you and the user are satisfied + +Take your time with improvements. Read the transcripts. Understand why failures happened. Make changes that generalize beyond the test set. This is important work. 
diff --git a/evals/self/skills/fixtures/agentv-eval-review-full.txt b/evals/self/skills/fixtures/agentv-eval-review-full.txt new file mode 100644 index 000000000..23e2c3466 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-eval-review-full.txt @@ -0,0 +1,52 @@ +--- +name: agentv-eval-review +description: >- + Use when reviewing eval YAML files for quality issues, linting eval files before + committing, checking eval schema compliance, or when asked to "review these evals", + "check eval quality", "lint eval files", or "validate eval structure". + Do NOT use for writing evals (use agentv-eval-writer) or running evals (use agentv-bench). +--- + +# Eval Review + +## Overview + +Lint and review AgentV eval YAML files for structural issues, schema compliance, and quality problems. Runs deterministic checks via script, then applies LLM judgment for semantic issues the script cannot catch. + +## Process + +### Step 1: Run the linter + +Execute `scripts/lint_eval.py` against the target eval files: + +```bash +python scripts/lint_eval.py --json +``` + +The script checks: +- `.eval.yaml` extension +- `description` field present +- Each test has `id`, `input`, and at least one of `criteria`/`expected_output`/`assertions` +- File paths in `type: file` use leading `/` +- `assertions` blocks present (flags tests relying solely on `expected_output`) +- `expected_output` prose detection (flags "The agent should..." patterns) +- Repeated file inputs across tests (recommends top-level `input`) +- Naming prefix consistency across eval files in same directory + +### Step 2: Review script output + +Report the script findings grouped by severity (error > warning > info). For each finding, include the file path and a concrete fix. + +### Step 3: Semantic review (LLM judgment) + +The script catches structural issues but cannot assess: +- **Factual accuracy** — Do tool/command names in expected_output match what the skill documents? 
+- **Coverage gaps** — Are important edge cases missing? +- **Assertion discriminability** — Would assertions pass for both good and bad output? +- **Cross-file consistency** — Do output filenames match across evals and skills? + +Read the relevant SKILL.md files and cross-check against the eval content for these issues. + +## Skill Resources + +- `scripts/lint_eval.py` — Deterministic eval linter (Python 3.11+, stdlib only) diff --git a/evals/self/skills/fixtures/agentv-eval-review.txt b/evals/self/skills/fixtures/agentv-eval-review.txt new file mode 100644 index 000000000..23e2c3466 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-eval-review.txt @@ -0,0 +1,52 @@ +--- +name: agentv-eval-review +description: >- + Use when reviewing eval YAML files for quality issues, linting eval files before + committing, checking eval schema compliance, or when asked to "review these evals", + "check eval quality", "lint eval files", or "validate eval structure". + Do NOT use for writing evals (use agentv-eval-writer) or running evals (use agentv-bench). +--- + +# Eval Review + +## Overview + +Lint and review AgentV eval YAML files for structural issues, schema compliance, and quality problems. Runs deterministic checks via script, then applies LLM judgment for semantic issues the script cannot catch. + +## Process + +### Step 1: Run the linter + +Execute `scripts/lint_eval.py` against the target eval files: + +```bash +python scripts/lint_eval.py --json +``` + +The script checks: +- `.eval.yaml` extension +- `description` field present +- Each test has `id`, `input`, and at least one of `criteria`/`expected_output`/`assertions` +- File paths in `type: file` use leading `/` +- `assertions` blocks present (flags tests relying solely on `expected_output`) +- `expected_output` prose detection (flags "The agent should..." 
patterns) +- Repeated file inputs across tests (recommends top-level `input`) +- Naming prefix consistency across eval files in same directory + +### Step 2: Review script output + +Report the script findings grouped by severity (error > warning > info). For each finding, include the file path and a concrete fix. + +### Step 3: Semantic review (LLM judgment) + +The script catches structural issues but cannot assess: +- **Factual accuracy** — Do tool/command names in expected_output match what the skill documents? +- **Coverage gaps** — Are important edge cases missing? +- **Assertion discriminability** — Would assertions pass for both good and bad output? +- **Cross-file consistency** — Do output filenames match across evals and skills? + +Read the relevant SKILL.md files and cross-check against the eval content for these issues. + +## Skill Resources + +- `scripts/lint_eval.py` — Deterministic eval linter (Python 3.11+, stdlib only) diff --git a/evals/self/skills/fixtures/agentv-eval-writer-full.txt b/evals/self/skills/fixtures/agentv-eval-writer-full.txt new file mode 100644 index 000000000..76f429815 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-eval-writer-full.txt @@ -0,0 +1,18195 @@ +--- +name: agentv-eval-writer +description: >- + Write, edit, review, and validate AgentV EVAL.yaml / .eval.yaml evaluation files. + Use when asked to create new eval files, update or fix existing ones, add or remove test cases, + configure graders (`llm-grader`, `code-grader`, `rubrics`), review whether an eval is correct or complete, + convert between EVAL.yaml and evals.json using `agentv convert`, or generate eval test cases + from chat transcripts (markdown conversation or JSON messages). + Do NOT use for creating SKILL.md files, writing skill definitions, or running evals — + running and benchmarking belongs to agentv-bench. 
+--- + +# AgentV Eval Writer + +Comprehensive docs: https://agentv.dev + +## Evaluation Types + +AgentV evaluations measure **execution quality** — whether your agent or skill produces correct output when invoked. + +For **trigger quality** (whether the right skill is triggered for the right prompts), see the [Evaluation Types guide](https://agentv.dev/guides/evaluation-types/). Do not use execution eval configs (`EVAL.yaml`, `evals.json`) for trigger evaluation — these are distinct concerns requiring different tooling and methodologies. + +## Starting from evals.json? + +If the project already has an Agent Skills `evals.json` file, use it as a starting point instead of writing YAML from scratch: + +```bash +# Convert evals.json to AgentV EVAL YAML +agentv convert evals.json + +# Run directly without converting (all commands accept evals.json) +agentv eval evals.json +``` + +The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`llm-grader`), and resolves `files[]` paths. The generated YAML includes TODO comments for AgentV features to add (workspace setup, code graders, rubrics, required gates). + +After converting, enhance the YAML with AgentV-specific capabilities shown below. + +## From Chat Transcript + +Convert a chat conversation into eval test cases without starting from scratch. + +**Input formats:** + +Markdown conversation: +``` +User: How do I reset my password? +Assistant: Go to Settings > Security > Reset Password... 
+``` + +JSON messages: +```json +[{"role": "user", "content": "How do I reset my password?"}, + {"role": "assistant", "content": "Go to Settings > Security > Reset Password..."}] +``` + +**Select exchanges that make good test cases:** +- Factual Q&A — verifiable answers +- Task completion — user requests an action, agent performs it +- Edge cases — unusual inputs, error handling, boundary conditions +- Multi-turn reasoning — exchanges where earlier context matters + +**Skip:** greetings, one-word acknowledgments, repeated exchanges + +**Multi-turn format** (when context from prior turns matters): +```yaml +tests: + - id: multi-turn-context + criteria: "Agent remembers prior context" + input: + - role: user + content: "My name is Alice" + - role: assistant + content: "Nice to meet you, Alice!" + - role: user + content: "What's my name?" + expected_output: "Your name is Alice." + assertions: + - type: rubrics + criteria: + - Correctly recalls the user's name from earlier in the conversation +``` + +**Guidelines:** preserve exact wording in `expected_output`; aim for 5–15 tests per transcript; pick exchanges that test different capabilities. + +## Quick Start + +```yaml +description: Example eval +execution: + target: default + +tests: + - id: greeting + criteria: Friendly greeting + input: "Say hello" + expected_output: "Hello! How can I help you?" 
+ assertions: + - type: rubrics + criteria: + - Greeting is friendly and warm + - Offers to help +``` + +## Eval File Structure + +**Required:** `tests` (array or string path) +**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `suite`, `workspace`, `assertions`, `input` + +**Test fields:** + +| Field | Required | Description | +|-------|----------|-------------| +| `id` | yes | Unique identifier | +| `criteria` | yes | What the response should accomplish | +| `input` / `input` | yes | Input to the agent | +| `expected_output` / `expected_output` | no | Gold-standard reference answer | +| `assertions` | no | Graders: deterministic checks, rubrics, and LLM/code graders | +| `rubrics` | no | **Deprecated** — use `assertions: [{type: rubrics, criteria: [...]}]` instead | +| `execution` | no | Per-case execution overrides | +| `workspace` | no | Per-case workspace config (overrides suite-level) | +| `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts | +| `conversation_id` | no | Thread grouping | + +**Shorthand aliases:** +- `input` (string) expands to `[{role: "user", content: "..."}]` +- `expected_output` (string/object) expands to `[{role: "assistant", content: ...}]` +- Canonical `input` / `expected_output` take precedence when both present + +**Message format:** `{role, content}` where role is `system`, `user`, `assistant`, or `tool` +**Content types:** inline text, `{type: "file", value: "./path.md"}` +**File paths:** relative from eval file dir, or absolute with `/` prefix from repo root +**File handling by provider type:** LLM providers receive file content inlined in XML tags. Agent providers receive a preread block with `file://` URIs and must read files themselves. See [Coding Agents > Prompt format](https://agentv.dev/targets/coding-agents#prompt-format). + +**JSONL format:** One test per line as JSON. Optional `.yaml` sidecar for shared defaults. 
See `examples/features/basic-jsonl/`. + +**Environment variables:** All string fields support `${{ VAR }}` interpolation. Missing vars resolve to empty string. Works in eval files, external case files, and workspace configs. `.env` files are loaded automatically. + +## Metadata + +When `name` is present, the suite is parsed as a metadata-bearing eval: + +```yaml +name: export-screening # required, lowercase/hyphens, max 64 chars +description: Evaluates export control screening accuracy +version: "1.0" +author: acme-compliance +tags: [compliance, agents] +license: Apache-2.0 +requires: + agentv: ">=0.30.0" +``` + +## Suite-level Input + +Prepend shared input messages to every test (like suite-level `assertions`). Avoids repeating the same prompt file in each test: + +```yaml +input: + - role: user + content: + - type: file + value: ./system-prompt.md + +tests: ./cases.yaml + +# cases.yaml — each test only needs its own query +# - id: test-1 +# criteria: ... +# input: "User question here" +``` + +Effective input: `[...suite input, ...test input]`. Skipped when `execution.skip_defaults: true`. +Accepts same formats as test `input` (string or message array). + +## Tests as String Path + +Point `tests` to an external file instead of inlining: + +```yaml +name: my-eval +description: My evaluation suite +tests: ./cases.yaml # relative to eval file dir +``` + +The external file can be YAML (array of test objects) or JSONL. + +## Assertions Field + +`assertions` defines graders at the suite level or per-test level. 
It is the canonical field for all graders: + +```yaml +# Suite-level (appended to every test) +assertions: + - type: is-json + required: true + - type: contains + value: "status" + +tests: + - id: test-1 + criteria: Returns JSON + input: Get status + # Per-test assertions (runs before suite-level) + assertions: + - type: equals + value: '{"status": "ok"}' +``` + +## How `criteria` and `assertions` Interact + +`criteria` is a **data field** — it describes what the response should accomplish. It is **not** a grader. How it gets evaluated depends on whether `assertions` is present: + +| Scenario | What happens | Warning? | +|----------|-------------|----------| +| `criteria` + **no `assertions`** | Implicit `llm-grader` runs automatically against `criteria` | No | +| `criteria` + **`assertions` with only deterministic graders** (contains, regex, etc.) | Only declared graders run. `criteria` is **not evaluated**. | Yes — warns that no grader will consume criteria | +| `criteria` + **`assertions` with a grader** (`llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No | + +### No assertions → implicit llm-grader + +The simplest path. `criteria` is automatically evaluated by the default `llm-grader`: + +```yaml +tests: + - id: simple-eval + criteria: Assistant correctly explains the bug and proposes a fix + input: "Debug this function..." + # No assertions → default llm-grader evaluates against criteria +``` + +### assertions present → no implicit grader + +When `assertions` is defined, **only the declared graders run**. If you want an LLM grader alongside deterministic checks, declare it explicitly: + +```yaml +tests: + - id: mixed-eval + criteria: Response is helpful and mentions the fix + input: "Debug this function..." + assertions: + - type: llm-grader # must be explicit when assertions is present + - type: contains + value: "fix" +``` + +**Common mistake:** defining `criteria` with only deterministic graders. 
The criteria will be ignored and a warning is emitted: + +```yaml +tests: + - id: bad-example + criteria: Gives a thoughtful answer # ⚠ NOT evaluated — no grader in assertions + input: "What is 2+2?" + assertions: + - type: contains + value: "4" + # Warning: criteria is defined but no grader in assertions will evaluate it. +``` + +## Required Gates + +Any grader can be marked `required` to enforce a minimum score: + +```yaml +assertions: + - type: contains + value: "DENIED" + required: true # must score >= 0.8 (default) + - type: rubrics + required: 0.6 # must score >= 0.6 (custom threshold) + criteria: + - id: accuracy + outcome: Identifies the denied party + weight: 5.0 +``` + +If a required grader scores below its threshold, the overall verdict is forced to `fail`. + +## Workspace Setup/Teardown + +Run scripts before/after each test. Define at suite level or override per case: + +```yaml +workspace: + template: ./workspace-templates/my-project + setup: + command: ["bun", "run", "setup.ts"] + timeout_ms: 120000 + teardown: + command: ["bun", "run", "teardown.ts"] + +tests: + - id: case-1 + input: Fix the bug + criteria: Bug is fixed + metadata: + repo: sympy/sympy + workspace: + repos: + - path: /testbed + source: + type: git + url: https://github.com/sympy/sympy.git + checkout: + base_commit: "abc123" + docker: + image: swebench/sweb.eval.django__django:latest +``` + +**Lifecycle:** template copy → repo clone → setup → git baseline → agent → file changes → teardown → repo reset → cleanup +**Merge:** Case-level fields replace suite-level fields. +**Commands receive stdin JSON:** `{workspace_path, test_id, eval_run_id, case_input, case_metadata}` +**Setup failure:** aborts case. **Teardown failure:** non-fatal (warning). +For SWE-bench-style evals, keep operational checkout state under `workspace.repos[].checkout.base_commit`; treat `metadata.base_commit` as informational only. + +### Repository Lifecycle + +Clone repos into workspace automatically. 
For shared repo workspaces, pooling is the default: + +```yaml +workspace: + repos: + - path: ./repo + source: + type: git + url: https://github.com/org/repo.git + checkout: + ref: main + ancestor: 1 # parent commit + clone: + depth: 10 + hooks: + after_each: + reset: fast # none | fast | strict + isolation: shared # shared | per_test + mode: pooled # pooled | temp | static + hooks: + enabled: true # set false to skip all hooks +``` + +- `source.type`: `git` (URL) or `local` (path) +- `checkout.resolve`: `remote` (ls-remote) or `local` +- `clone.depth`: shallow clone depth +- `clone.filter`: partial clone filter (e.g., `blob:none`) +- `clone.sparse`: sparse checkout paths array +- `mode`: `pooled` (default for shared repos), `temp`, or `static` +- `path`: workspace path used when `mode: static`; when empty/missing the workspace is auto-materialised (template copied + repos cloned); populated dirs are reused as-is +- `hooks.enabled`: boolean (default `true`); set `false` to skip all lifecycle hooks +- Pool reset defaults to `fast` (`git clean -fd`); use `--workspace-clean full` for strict reset (`git clean -fdx`) +- Pool entries are managed separately via `agentv workspace list` and `agentv workspace clean` +- `agentv workspace deps ` scans eval files and outputs a JSON manifest of required git repos (useful for CI pre-cloning) + +See https://agentv.dev/targets/configuration/#repository-lifecycle + +## Grader Types + +Configure via `assertions` array. Multiple graders produce a weighted average score. 
+ +### code_grader +```yaml +- name: format_check + type: code-grader + command: [uv, run, validate.py] + cwd: ./scripts # optional working directory + target: {} # optional: enable LLM target proxy (max_calls: 50) +``` +Contract: stdin JSON -> stdout JSON `{score, assertions: [{text, passed, evidence?}], reasoning}` +Input includes: `question`, `criteria`, `answer`, `reference_answer`, `output`, `trace`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config` +When a workspace is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace). +See docs at https://agentv.dev/graders/code-graders/ + +### llm_grader +```yaml +- name: quality + type: llm-grader + prompt: ./prompts/eval.md # markdown template or command config + target: grader_gpt_5_mini # optional: override the grader target for this grader + model: gpt-5-chat # optional model override + config: # passed to prompt templates as context.config + strictness: high +``` +Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, `{{input}}`, `{{expected_output}}`, `{{output}}`, `{{file_changes}}` +- Markdown templates: use `{{variable}}` syntax +- TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config` +- Use `target:` to run different `llm-grader` graders against different named LLM targets in the same eval (useful for grader panels / ensembles) + +### composite +```yaml +- name: gate + type: composite + assertions: + - name: safety + type: llm-grader + prompt: ./safety.md + - name: quality + type: llm-grader + aggregator: + type: weighted_average + weights: { safety: 0.3, quality: 0.7 } +``` +Aggregator types: `weighted_average`, `all_or_nothing`, `minimum`, `maximum`, `safety_gate` +- `safety_gate`: fails immediately if 
the named gate grader scores below threshold (default 1.0) + +### tool_trajectory +```yaml +- name: tool_check + type: tool-trajectory + mode: any_order # any_order | in_order | exact + minimums: # for any_order + knowledgeSearch: 2 + expected: # for in_order/exact + - tool: knowledgeSearch + args: { query: "search term" } # partial deep equality match + - tool: documentRetrieve + args: any # any arguments accepted + max_duration_ms: 5000 # per-tool latency assertion + - tool: summarize # omit args to skip argument checking +``` + +### field_accuracy +```yaml +- name: fields + type: field-accuracy + match_type: exact # exact | date | numeric_tolerance + numeric_tolerance: 0.01 # for numeric_tolerance match_type + aggregation: weighted_average # weighted_average | all_or_nothing +``` +Compares `output` fields against `expected_output` fields. + +### latency +```yaml +- name: speed + type: latency + max_ms: 5000 +``` + +### cost +```yaml +- name: budget + type: cost + max_usd: 0.10 +``` + +### token_usage +```yaml +- name: tokens + type: token-usage + max_total_tokens: 4000 +``` + +### execution_metrics +```yaml +- name: efficiency + type: execution-metrics + max_tool_calls: 10 # Maximum tool invocations + max_llm_calls: 5 # Maximum LLM calls (assistant messages) + max_tokens: 5000 # Maximum total tokens (input + output) + max_cost_usd: 0.05 # Maximum cost in USD + max_duration_ms: 30000 # Maximum execution duration + target_exploration_ratio: 0.6 # Target ratio of read-only tool calls + exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2) +``` +Declarative threshold-based checks on execution metrics. Only specified thresholds are checked. +Score is proportional: `passed / total` assertions. Missing data counts as a failed assertion. + +### contains +```yaml +- type: contains + value: "DENIED" + required: true +``` +Binary check: does output contain the substring? Name auto-generated if omitted. 
+ +### regex +```yaml +- type: regex + value: "\\d{3}-\\d{2}-\\d{4}" +``` +Binary check: does output match the regex pattern? + +### equals +```yaml +- type: equals + value: "42" +``` +Binary check: does output exactly equal the value (both trimmed)? + +### is_json +```yaml +- type: is-json + required: true +``` +Binary check: is the output valid JSON? + +### rubrics +```yaml +- type: rubrics + criteria: + - id: accuracy + outcome: Correctly identifies the denied party + weight: 5.0 + - id: reasoning + outcome: Provides clear reasoning + weight: 3.0 +``` +LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields. + +### rubrics (inline, deprecated) +Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assertions` instead. +See `references/rubric-grader.md` for score-range mode and scoring formula. + +## Execution Error Tolerance + +Control how the runner handles execution errors (infrastructure failures, not quality failures): + +```yaml +execution: + fail_on_error: false # never halt (default) + # fail_on_error: true # halt on first execution error +``` + +When halted, remaining tests get `executionStatus: 'execution_error'` with `failureReasonCode: 'error_threshold_exceeded'`. + +## Suite-Level Quality Threshold + +Set a minimum mean score for the eval suite. If the mean quality score falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. + +```yaml +execution: + threshold: 0.8 +``` + +CLI flag `--threshold 0.8` overrides the YAML value. Must be a number between 0 and 1. Mean score is computed from quality results only (execution errors excluded). + +The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as ``. When no threshold is set, JUnit defaults to 0.5. 
+ +## CLI Commands + +```bash +# Run evaluation (requires API keys) +agentv eval [--test-id ] [--target ] [--dry-run] [--threshold <0-1>] + +# Run with OTLP JSON file (importable by OTel backends) +agentv eval --otel-file traces/eval.otlp.json + +# Run a single assertion in isolation (no API keys needed) +agentv eval assert --agent-output "..." --agent-input "..." + +# Import agent transcripts for offline grading +agentv import claude --session-id + +# Re-run only execution errors from a previous run +agentv eval --retry-errors .agentv/results/runs//index.jsonl + +# Validate eval file +agentv validate + +# Compare results — N-way matrix from a canonical run manifest +agentv compare .agentv/results/runs//index.jsonl +agentv compare .agentv/results/runs//index.jsonl --baseline # CI regression gate +agentv compare .agentv/results/runs//index.jsonl --baseline --candidate # pairwise +agentv compare .agentv/results/runs//index.jsonl .agentv/results/runs//index.jsonl + +# Author assertions directly in the eval file +# Prefer simple assertions when they fit the criteria; use deterministic or LLM-based graders when needed +agentv validate +``` + +## Code Judge SDK + +Use `@agentv/eval` to build custom graders in TypeScript/JavaScript: + +### defineAssertion (recommended for custom checks) +```typescript +#!/usr/bin/env bun +import { defineAssertion } from '@agentv/eval'; + +export default defineAssertion(({ answer, trace }) => ({ + pass: answer.length > 0 && (trace?.eventCount ?? 0) <= 10, + reasoning: 'Checks content exists and is efficient', +})); +``` + +Assertions support both `pass: boolean` and `score: number` (0-1). If only `pass` is given, score is 1 (pass) or 0 (fail). + +### defineCodeGrader (full control) +```typescript +#!/usr/bin/env bun +import { defineCodeGrader } from '@agentv/eval'; + +export default defineCodeGrader(({ trace, answer }) => ({ + score: trace?.eventCount <= 5 ? 
1.0 : 0.5, + assertions: [ + { text: 'Efficient tool usage', passed: (trace?.eventCount ?? 0) <= 5 }, + ], +})); +``` + +Both are used via `type: code-grader` in YAML with `command: [bun, run, grader.ts]`. + +### Convention-Based Discovery + +Place assertion files in `.agentv/assertions/` — they auto-register by filename: + +``` +.agentv/assertions/word-count.ts → type: word-count +.agentv/assertions/sentiment.ts → type: sentiment +``` + +No `command:` needed in YAML — just use `type: `. + +## Programmatic API + +Use `evaluate()` from `@agentv/core` to run evals as a library: + +```typescript +import { evaluate } from '@agentv/core'; + +const { results, summary } = await evaluate({ + tests: [ + { + id: 'greeting', + input: 'Say hello', + assertions: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { provider: 'mock_agent' }, +}); +console.log(`${summary.passed}/${summary.total} passed`); +``` + +Supports inline tests (no YAML) or file-based via `specFile`. + +## defineConfig + +Type-safe project configuration in `agentv.config.ts`: + +```typescript +import { defineConfig } from '@agentv/core'; + +export default defineConfig({ + execution: { workers: 5, maxRetries: 2 }, + output: { format: 'jsonl', dir: './results' }, + limits: { maxCostUsd: 10.0 }, +}); +``` + +Auto-discovered from project root. Validated with Zod. + +## Scaffold Commands + +```bash +agentv create assertion # → .agentv/assertions/.ts +agentv create eval # → evals/.eval.yaml + .cases.jsonl +``` + +## Skill Improvement Workflow + +For a complete guide to iterating on skills using evaluations — writing scenarios, running baselines, comparing results, and improving — see the [Skill Improvement Workflow](https://agentv.dev/guides/skill-improvement-workflow/) guide. +## Human Review Checkpoint + +After running evals, perform a human review before iterating. 
Create `feedback.json` in the results directory: + +```json +{ + "run_id": "2026-03-14T10-32-00_claude", + "reviewer": "engineer-name", + "timestamp": "2026-03-14T12:00:00Z", + "overall_notes": "Summary of observations", + "per_case": [ + { + "test_id": "test-id", + "verdict": "acceptable | needs_improvement | incorrect | flaky", + "notes": "Why this verdict", + "evaluator_overrides": { "code-grader:name": "Override note" }, + "workspace_notes": "Workspace state observations" + } + ] +} +``` + +Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "code-grader was too strict"). Use `workspace_notes` for observations about workspace state. + +Review workflow: run evals → inspect results (`agentv inspect show`) → write feedback → tune prompts/graders → re-run. + +Full guide: https://agentv.dev/guides/human-review/ + +## Schemas + +- Eval file: `references/eval-schema.json` +- Config: `references/config-schema.json` + +--- references/config-schema.json --- +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "AgentV Config Schema", + "description": "Schema for .agentv/config.yaml configuration files", + "type": "object", + "properties": { + "$schema": { + "type": "string", + "description": "Schema identifier", + "enum": ["agentv-config-v2"] + }, + "required_version": { + "type": "string", + "description": "Minimum AgentV version required to run this project's evals. Uses semver range syntax (e.g., '>=2.11.0', '^2.11.0'). When the installed version is below the range, AgentV warns and prompts to update.", + "examples": [">=2.11.0", "^2.12.0", ">=2.11.0 <3.0.0"] + }, + "eval_patterns": { + "type": "array", + "description": "Glob patterns for discovering eval files during interactive mode (`agentv eval` with no args). 
Defaults to ['**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml'] if not specified.", + "items": { + "type": "string", + "description": "Glob pattern (e.g., '**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml')" + }, + "examples": [["**/evals/**/dataset*.yaml", "**/evals/**/eval.yaml"], ["**/evals/**/*.yaml"]] + }, + "execution": { + "type": "object", + "description": "Default execution options. CLI flags take precedence over these values.", + "properties": { + "verbose": { + "type": "boolean", + "description": "Enable verbose logging (equivalent to --verbose)", + "default": false + }, + "keep_workspaces": { + "type": "boolean", + "description": "Always keep temp workspaces after eval (equivalent to --keep-workspaces)", + "default": false + }, + "otel_file": { + "type": "string", + "description": "Write OTLP JSON trace to this path (equivalent to --otel-file). Supports {timestamp} placeholder.", + "examples": [".agentv/results/otel-{timestamp}.json"] + } + }, + "additionalProperties": false + }, + "hooks": { + "type": "object", + "description": "Lifecycle hooks that run at specific points during agentv execution.", + "properties": { + "before_session": { + "type": "string", + "description": "Shell command to run once at agentv startup, before any command executes. stdout is parsed for env var exports (KEY=value or export KEY=\"value\") and injected into process.env. Keys already set in the environment are not overwritten. stderr is forwarded to the user. 
Non-zero exit aborts with an error.", + "examples": ["bun scripts/load-secrets.ts", "eval $(aws ssm get-parameters-by-path ...)"] + } + }, + "additionalProperties": false + } + }, + "required": ["$schema"], + "additionalProperties": false +} + +--- references/custom-evaluators.md --- +# Custom Graders + +## Wire Format + +### Input (stdin JSON) + +```json +{ + "question": "string", + "criteria": "string", + "reference_answer": "string", + "answer": "string", + "input_files": ["path"], + "input": [{"role": "user", "content": "..."}], + "expected_output": [{"role": "assistant", "content": "..."}], + "output": [{"role": "assistant", "content": "..."}], + "trace": { + "event_count": 5, + "tool_calls": {"fetch": 1}, + "error_count": 0, + "llm_call_count": 2 + }, + "token_usage": {"input": 1000, "output": 500}, + "cost_usd": 0.0015, + "duration_ms": 3500, + "start_time": "2026-02-13T10:00:00.000Z", + "end_time": "2026-02-13T10:00:03.500Z" +} +``` + +### Output (stdout JSON) + +```json +{ + "score": 0.85, + "assertions": [ + { "text": "passed check", "passed": true }, + { "text": "failed check", "passed": false } + ], + "reasoning": "explanation" +} +``` + +`score` (0.0-1.0) required. `assertions`, `reasoning` optional. 
+ +## SDK Functions + +```typescript +import { defineCodeGrader, createTargetClient, definePromptTemplate } from '@agentv/eval'; +``` + +- `defineCodeGrader(fn)` - Wraps evaluation function with stdin/stdout handling +- `createTargetClient()` - Returns LLM proxy client (when `target: {}` configured) + - `.invoke({question, systemPrompt})` - Single LLM call + - `.invokeBatch(requests)` - Batch LLM calls +- `definePromptTemplate(fn)` - Wraps prompt generation function + - Context fields: `question`, `answer`, `referenceAnswer`, `criteria`, `expectedOutput`, `output`, `config`, `trace`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime` + +## Python Example + +```python +#!/usr/bin/env python3 +import json, sys + +def evaluate(data: dict) -> dict: + candidate = data.get("answer", "") + assertions = [] + for kw in ["async", "await"]: + assertions.append({"text": f"Keyword '{kw}'", "passed": kw in candidate}) + passed = sum(1 for a in assertions if a["passed"]) + return { + "score": passed / max(len(assertions), 1), + "assertions": assertions, + } + +if __name__ == "__main__": + try: + print(json.dumps(evaluate(json.loads(sys.stdin.read())))) + except Exception as e: + print(json.dumps({"score": 0, "assertions": [{"text": str(e), "passed": False}]})) + sys.exit(1) +``` + +## TypeScript Example + +```typescript +#!/usr/bin/env bun +import { defineCodeGrader } from '@agentv/eval'; + +export default defineCodeGrader(({ answer, criteria }) => { + const assertions: Array<{ text: string; passed: boolean }> = []; + if (answer.includes(criteria)) { + assertions.push({ text: 'Matches expected outcome', passed: true }); + } else { + assertions.push({ text: 'Does not match expected outcome', passed: false }); + } + const passed = assertions.filter(a => a.passed).length; + return { + score: passed / Math.max(assertions.length, 1), + assertions, + }; +}); +``` + +## Template Variables + +Derived from test fields (users never author these directly): + +| Variable | Source 
| +|----------|--------| +| `question` | First user message in `input` | +| `criteria` | Test `criteria` field | +| `reference_answer` | Last entry in `expected_output` | +| `answer` | Last entry in `output` (runtime) | +| `input` | Full resolved input array (JSON) | +| `expected_output` | Full resolved expected array (JSON) | +| `output` | Full provider output array (JSON) | + +Markdown templates use `{{variable}}` syntax. TypeScript templates receive context object. + +--- references/eval-schema.json --- +{ + "$schema": "https://json-schema.org/draft/2019-09/schema#", + "title": "AgentV Eval File", + "description": "Schema for AgentV evaluation YAML files (.eval.yaml)", + "$ref": "#/definitions/EvalFile", + "definitions": { + "EvalFile": { + "type": "object", + "properties": { + "$schema": { + "type": "string" + }, + "name": { + "type": "string", + "pattern": "^[a-z0-9-]+$" + }, + "description": { + "type": "string" + }, + "category": { + "type": "string" + }, + "version": { + "type": "string" + }, + "author": { + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "license": { + "type": "string" + }, + "requires": { + "type": "object", + "properties": { + "agentv": { + "type": "string" + } + }, + "additionalProperties": false + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + 
} + }, + "tests": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "criteria": { + "type": "string" + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } 
+ ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ 
+ { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": 
{ + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": 
"string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": 
"boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + 
}, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } 
+ ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ 
+ { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": 
{ + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": 
"string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": 
"boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + 
}, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "execution": { + "type": "object", + "properties": { + "target": { + "type": "string" + }, + "targets": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "use_target": { + "type": "string" + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": 
"object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + 
}, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + } + ] + } + }, + "workers": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", 
+ "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + 
"preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": 
"object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + 
"items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + 
"const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", 
"execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, 
+ "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + 
"maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + 
"required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": 
"array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + 
"type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + 
} + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + 
"threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, 
+ "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + 
}, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + 
}, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_at_k", "mean", "confidence_interval"] + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + }, + "costLimitUsd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "budget_usd": { + "type": "number", + "minimum": 0 + }, + "budgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "additionalProperties": false + }, + "workspace": { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_test"] + }, + "repos": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "source": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "git" + }, + "url": { + "type": "string", + "format": "uri" + } + }, + "required": ["type", "url"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "local" + }, + "path": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + } + ] + }, + "checkout": { + "type": "object", + "properties": { + "ref": { + "type": "string" + }, + "base_commit": { + "type": "string", + "minLength": 1 + }, + 
"resolve": { + "type": "string", + "enum": ["remote", "local"] + }, + "ancestor": { + "type": "integer", + "minimum": 0 + } + }, + "additionalProperties": false + }, + "clone": { + "type": "object", + "properties": { + "depth": { + "type": "integer", + "minimum": 1 + }, + "filter": { + "type": "string" + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" 
+ }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "mode": { + "type": "string", + "enum": ["pooled", "temp", "static"] + }, + "path": { + "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": ["image"], + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] + }, + "mode": { + "type": "string", + "enum": ["conversation"] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + 
"required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + 
"required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + 
"additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": 
"code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": 
["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } 
+ ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + 
"type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" 
+ }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + 
"score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + ] + } + } + }, + "required": ["input"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] + }, + "on_turn_failure": { + "type": "string", + "enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } + }, + "required": ["id"], + "additionalProperties": false + } + }, + { + "type": "string" + } + ] + }, + "eval_cases": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "criteria": { + "type": "string" + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": 
"object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, 
+ "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + 
"additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": 
"code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + 
"additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + 
"min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": 
"number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + 
"weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + 
"score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": 
["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": 
{} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + 
"path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + 
"additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + 
"min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": 
"number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + 
"weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + 
"score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "execution": { + "type": "object", + "properties": { + "target": { + "type": "string" + }, + "targets": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "use_target": { + "type": "string" + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": 
"object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + } + ] + } + }, + "workers": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + 
}, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + 
}, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": 
"array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": 
"object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + 
"tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": 
["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + 
"type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": 
"array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + 
"min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + 
"items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + 
"properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": 
{ + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": 
true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_at_k", "mean", "confidence_interval"] + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + }, + "costLimitUsd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "budget_usd": { + "type": "number", + "minimum": 0 + }, + "budgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "additionalProperties": false + 
}, + "workspace": { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_test"] + }, + "repos": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "source": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "git" + }, + "url": { + "type": "string", + "format": "uri" + } + }, + "required": ["type", "url"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "local" + }, + "path": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + } + ] + }, + "checkout": { + "type": "object", + "properties": { + "ref": { + "type": "string" + }, + "base_commit": { + "type": "string", + "minLength": 1 + }, + "resolve": { + "type": "string", + "enum": ["remote", "local"] + }, + "ancestor": { + "type": "integer", + "minimum": 0 + } + }, + "additionalProperties": false + }, + "clone": { + "type": "object", + "properties": { + "depth": { + "type": "integer", + "minimum": 1 + }, + "filter": { + "type": "string" + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + 
"additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "mode": { + "type": "string", + "enum": ["pooled", "temp", "static"] + }, + "path": { + "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": 
"string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": ["image"], + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] + }, + "mode": { + "type": "string", + "enum": ["conversation"] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", 
"code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", 
+ "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": 
"number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + 
"exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + 
"properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + 
"minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + ] + } + } + }, + "required": ["input"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] + }, + "on_turn_failure": { + "type": "string", + "enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } + }, + "required": ["id"], + "additionalProperties": false + } + }, + { + "type": "string" + } + ] + }, + "target": { + "type": "string" + }, + "execution": { + "type": "object", + "properties": { + "target": { + "type": "string" 
+ }, + "targets": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "use_target": { + "type": "string" + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": 
[ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + } + ] + } + }, + "workers": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + 
"items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false 
+ } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", 
"superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + 
"type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + 
"type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + 
"maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + 
] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" 
+ }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + 
"type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + 
"items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": 
"number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + 
"maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_at_k", "mean", "confidence_interval"] + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + }, + "costLimitUsd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "budget_usd": { + "type": "number", + "minimum": 0 + }, + "budgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "additionalProperties": false + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + 
"anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + 
"properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + 
"minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + 
"minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + 
"match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + 
"type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + 
}, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": 
{ + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + } + ] + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + }, + "workspace": { + "anyOf": [ + { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_test"] + }, + "repos": { + "type": "array", + "items": { + "type": "object", + 
"properties": { + "path": { + "type": "string" + }, + "source": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "git" + }, + "url": { + "type": "string", + "format": "uri" + } + }, + "required": ["type", "url"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "local" + }, + "path": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + } + ] + }, + "checkout": { + "type": "object", + "properties": { + "ref": { + "type": "string" + }, + "base_commit": { + "type": "string", + "minLength": 1 + }, + "resolve": { + "type": "string", + "enum": ["remote", "local"] + }, + "ancestor": { + "type": "integer", + "minimum": 0 + } + }, + "additionalProperties": false + }, + "clone": { + "type": "object", + "properties": { + "depth": { + "type": "integer", + "minimum": 1 + }, + "filter": { + "type": "string" + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": 
"string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "mode": { + "type": "string", + "enum": ["pooled", "temp", "static"] + }, + "path": { + "type": "string" + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": ["image"], + "additionalProperties": false + } + }, + "additionalProperties": false + }, + { + "type": "string" + } + ] + } + }, + "required": ["tests"], + "additionalProperties": 
false + } + } +} + +--- references/rubric-evaluator.md --- +# Rubric Grader + +Rubrics are defined as `assertions` entries with `type: rubrics`. They support binary checklist grading and score-range analytic grading. + +## Field Reference + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `type` | string | required | Must be `rubrics` | +| `criteria` | array | required | List of criterion strings or objects | +| `required` | boolean or number | - | Gate: `true` requires score >= 0.8; a number (0–1) sets a custom threshold | + +### Criterion Object Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `id` | string | auto-generated | Unique identifier | +| `outcome` | string | required* | Criterion being evaluated (*optional if `score_ranges` used) | +| `weight` | number | 1.0 | Relative importance | +| `required` | boolean | true | Failing forces verdict to 'fail' (checklist mode) | +| `min_score` | number | - | Minimum score (0–1) to pass this criterion | +| `required_min_score` | integer | - | **Deprecated.** Use `min_score` instead. Legacy 0–10 scale. | +| `score_ranges` | map or array | - | Score range definitions for analytic scoring | + +## String Shorthand (Recommended) + +Plain strings in `assertions` are automatically treated as rubric criteria: + +```yaml +assertions: + - Mentions divide-and-conquer approach + - Explains partition step + - States time complexity +``` + +Equivalent to the full form with `type: rubrics`. Use the full form only when you need weights, `required: false`, or `score_ranges`. 
+ +Mixed strings and objects are supported in `assertions` — strings are grouped into a single rubrics grader at the position of the first string: + +```yaml +assertions: + - Mentions divide-and-conquer approach # grouped into rubrics + - type: code-grader + command: [check_syntax.py] + - States time complexity # grouped into rubrics +``` + +## Checklist Mode + +```yaml +assertions: + - type: rubrics + criteria: + - Mentions divide-and-conquer approach + - id: complexity + outcome: States time complexity correctly + weight: 2.0 + required: true + - id: examples + outcome: Includes code examples + weight: 1.0 + required: false +``` + +## Score-Range Mode + +Shorthand map format (recommended): + +```yaml +assertions: + - type: rubrics + criteria: + - id: correctness + weight: 2.0 + min_score: 0.7 + score_ranges: + 0: Critical bugs + 3: Minor bugs + 6: Correct with minor issues + 9: Fully correct +``` + +Map keys are lower bounds (0-10). Each range extends from its key to (next key - 1), with the last extending to 10. Must start at 0. + +Array format is also accepted: + +```yaml + score_ranges: + - score_range: [0, 2] + outcome: Critical bugs + - score_range: [3, 5] + outcome: Minor bugs + - score_range: [6, 8] + outcome: Correct with minor issues + - score_range: [9, 10] + outcome: Fully correct +``` + +Ranges must be integers 0-10, non-overlapping, covering all values 0-10. + +## Scoring + +**Checklist:** `score = sum(satisfied weights) / sum(all weights)` + +**Score-range:** `score = weighted_average(raw_score / 10)` per criterion + +## Verdicts + +| Verdict | Condition | +|---------|-----------| +| `pass` | score >= 0.8 AND all gating criteria satisfied | +| `fail` | score < 0.8 OR any gating criterion failed | + +Gating: checklist uses `required: true`, score-range uses `min_score: N` (0–1 scale). 
diff --git a/evals/self/skills/fixtures/agentv-eval-writer.txt b/evals/self/skills/fixtures/agentv-eval-writer.txt new file mode 100644 index 000000000..19a0ff385 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-eval-writer.txt @@ -0,0 +1,691 @@ +--- +name: agentv-eval-writer +description: >- + Write, edit, review, and validate AgentV EVAL.yaml / .eval.yaml evaluation files. + Use when asked to create new eval files, update or fix existing ones, add or remove test cases, + configure graders (`llm-grader`, `code-grader`, `rubrics`), review whether an eval is correct or complete, + convert between EVAL.yaml and evals.json using `agentv convert`, or generate eval test cases + from chat transcripts (markdown conversation or JSON messages). + Do NOT use for creating SKILL.md files, writing skill definitions, or running evals — + running and benchmarking belongs to agentv-bench. +--- + +# AgentV Eval Writer + +Comprehensive docs: https://agentv.dev + +## Evaluation Types + +AgentV evaluations measure **execution quality** — whether your agent or skill produces correct output when invoked. + +For **trigger quality** (whether the right skill is triggered for the right prompts), see the [Evaluation Types guide](https://agentv.dev/guides/evaluation-types/). Do not use execution eval configs (`EVAL.yaml`, `evals.json`) for trigger evaluation — these are distinct concerns requiring different tooling and methodologies. + +## Starting from evals.json? + +If the project already has an Agent Skills `evals.json` file, use it as a starting point instead of writing YAML from scratch: + +```bash +# Convert evals.json to AgentV EVAL YAML +agentv convert evals.json + +# Run directly without converting (all commands accept evals.json) +agentv eval evals.json +``` + +The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`llm-grader`), and resolves `files[]` paths. 
The generated YAML includes TODO comments for AgentV features to add (workspace setup, code graders, rubrics, required gates). + +After converting, enhance the YAML with AgentV-specific capabilities shown below. + +## From Chat Transcript + +Convert a chat conversation into eval test cases without starting from scratch. + +**Input formats:** + +Markdown conversation: +``` +User: How do I reset my password? +Assistant: Go to Settings > Security > Reset Password... +``` + +JSON messages: +```json +[{"role": "user", "content": "How do I reset my password?"}, + {"role": "assistant", "content": "Go to Settings > Security > Reset Password..."}] +``` + +**Select exchanges that make good test cases:** +- Factual Q&A — verifiable answers +- Task completion — user requests an action, agent performs it +- Edge cases — unusual inputs, error handling, boundary conditions +- Multi-turn reasoning — exchanges where earlier context matters + +**Skip:** greetings, one-word acknowledgments, repeated exchanges + +**Multi-turn format** (when context from prior turns matters): +```yaml +tests: + - id: multi-turn-context + criteria: "Agent remembers prior context" + input: + - role: user + content: "My name is Alice" + - role: assistant + content: "Nice to meet you, Alice!" + - role: user + content: "What's my name?" + expected_output: "Your name is Alice." + assertions: + - type: rubrics + criteria: + - Correctly recalls the user's name from earlier in the conversation +``` + +**Guidelines:** preserve exact wording in `expected_output`; aim for 5–15 tests per transcript; pick exchanges that test different capabilities. + +## Quick Start + +```yaml +description: Example eval +execution: + target: default + +tests: + - id: greeting + criteria: Friendly greeting + input: "Say hello" + expected_output: "Hello! How can I help you?" 
+    assertions:
+      - type: rubrics
+        criteria:
+          - Greeting is friendly and warm
+          - Offers to help
+```
+
+## Eval File Structure
+
+**Required:** `tests` (array or string path)
+**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `suite`, `workspace`, `assertions`, `input`
+
+**Test fields:**
+
+| Field | Required | Description |
+|-------|----------|-------------|
+| `id` | yes | Unique identifier |
+| `criteria` | yes | What the response should accomplish |
+| `input` | yes | Input to the agent |
+| `expected_output` | no | Gold-standard reference answer |
+| `assertions` | no | Graders: deterministic checks, rubrics, and LLM/code graders |
+| `rubrics` | no | **Deprecated** — use `assertions: [{type: rubrics, criteria: [...]}]` instead |
+| `execution` | no | Per-case execution overrides |
+| `workspace` | no | Per-case workspace config (overrides suite-level) |
+| `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts |
+| `conversation_id` | no | Thread grouping |
+
+**Shorthand aliases:**
+- `input` (string) expands to `[{role: "user", content: "..."}]`
+- `expected_output` (string/object) expands to `[{role: "assistant", content: ...}]`
+- Canonical `input` / `expected_output` take precedence when both present
+
+**Message format:** `{role, content}` where role is `system`, `user`, `assistant`, or `tool`
+**Content types:** inline text, `{type: "file", value: "./path.md"}`
+**File paths:** relative from eval file dir, or absolute with `/` prefix from repo root
+**File handling by provider type:** LLM providers receive file content inlined in XML tags. Agent providers receive a preread block with `file://` URIs and must read files themselves. See [Coding Agents > Prompt format](https://agentv.dev/targets/coding-agents#prompt-format).
+
+**JSONL format:** One test per line as JSON. Optional `.yaml` sidecar for shared defaults.
See `examples/features/basic-jsonl/`. + +**Environment variables:** All string fields support `${{ VAR }}` interpolation. Missing vars resolve to empty string. Works in eval files, external case files, and workspace configs. `.env` files are loaded automatically. + +## Metadata + +When `name` is present, the suite is parsed as a metadata-bearing eval: + +```yaml +name: export-screening # required, lowercase/hyphens, max 64 chars +description: Evaluates export control screening accuracy +version: "1.0" +author: acme-compliance +tags: [compliance, agents] +license: Apache-2.0 +requires: + agentv: ">=0.30.0" +``` + +## Suite-level Input + +Prepend shared input messages to every test (like suite-level `assertions`). Avoids repeating the same prompt file in each test: + +```yaml +input: + - role: user + content: + - type: file + value: ./system-prompt.md + +tests: ./cases.yaml + +# cases.yaml — each test only needs its own query +# - id: test-1 +# criteria: ... +# input: "User question here" +``` + +Effective input: `[...suite input, ...test input]`. Skipped when `execution.skip_defaults: true`. +Accepts same formats as test `input` (string or message array). + +## Tests as String Path + +Point `tests` to an external file instead of inlining: + +```yaml +name: my-eval +description: My evaluation suite +tests: ./cases.yaml # relative to eval file dir +``` + +The external file can be YAML (array of test objects) or JSONL. + +## Assertions Field + +`assertions` defines graders at the suite level or per-test level. 
It is the canonical field for all graders: + +```yaml +# Suite-level (appended to every test) +assertions: + - type: is-json + required: true + - type: contains + value: "status" + +tests: + - id: test-1 + criteria: Returns JSON + input: Get status + # Per-test assertions (runs before suite-level) + assertions: + - type: equals + value: '{"status": "ok"}' +``` + +## How `criteria` and `assertions` Interact + +`criteria` is a **data field** — it describes what the response should accomplish. It is **not** a grader. How it gets evaluated depends on whether `assertions` is present: + +| Scenario | What happens | Warning? | +|----------|-------------|----------| +| `criteria` + **no `assertions`** | Implicit `llm-grader` runs automatically against `criteria` | No | +| `criteria` + **`assertions` with only deterministic graders** (contains, regex, etc.) | Only declared graders run. `criteria` is **not evaluated**. | Yes — warns that no grader will consume criteria | +| `criteria` + **`assertions` with a grader** (`llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No | + +### No assertions → implicit llm-grader + +The simplest path. `criteria` is automatically evaluated by the default `llm-grader`: + +```yaml +tests: + - id: simple-eval + criteria: Assistant correctly explains the bug and proposes a fix + input: "Debug this function..." + # No assertions → default llm-grader evaluates against criteria +``` + +### assertions present → no implicit grader + +When `assertions` is defined, **only the declared graders run**. If you want an LLM grader alongside deterministic checks, declare it explicitly: + +```yaml +tests: + - id: mixed-eval + criteria: Response is helpful and mentions the fix + input: "Debug this function..." + assertions: + - type: llm-grader # must be explicit when assertions is present + - type: contains + value: "fix" +``` + +**Common mistake:** defining `criteria` with only deterministic graders. 
The criteria will be ignored and a warning is emitted: + +```yaml +tests: + - id: bad-example + criteria: Gives a thoughtful answer # ⚠ NOT evaluated — no grader in assertions + input: "What is 2+2?" + assertions: + - type: contains + value: "4" + # Warning: criteria is defined but no grader in assertions will evaluate it. +``` + +## Required Gates + +Any grader can be marked `required` to enforce a minimum score: + +```yaml +assertions: + - type: contains + value: "DENIED" + required: true # must score >= 0.8 (default) + - type: rubrics + required: 0.6 # must score >= 0.6 (custom threshold) + criteria: + - id: accuracy + outcome: Identifies the denied party + weight: 5.0 +``` + +If a required grader scores below its threshold, the overall verdict is forced to `fail`. + +## Workspace Setup/Teardown + +Run scripts before/after each test. Define at suite level or override per case: + +```yaml +workspace: + template: ./workspace-templates/my-project + setup: + command: ["bun", "run", "setup.ts"] + timeout_ms: 120000 + teardown: + command: ["bun", "run", "teardown.ts"] + +tests: + - id: case-1 + input: Fix the bug + criteria: Bug is fixed + metadata: + repo: sympy/sympy + workspace: + repos: + - path: /testbed + source: + type: git + url: https://github.com/sympy/sympy.git + checkout: + base_commit: "abc123" + docker: + image: swebench/sweb.eval.django__django:latest +``` + +**Lifecycle:** template copy → repo clone → setup → git baseline → agent → file changes → teardown → repo reset → cleanup +**Merge:** Case-level fields replace suite-level fields. +**Commands receive stdin JSON:** `{workspace_path, test_id, eval_run_id, case_input, case_metadata}` +**Setup failure:** aborts case. **Teardown failure:** non-fatal (warning). +For SWE-bench-style evals, keep operational checkout state under `workspace.repos[].checkout.base_commit`; treat `metadata.base_commit` as informational only. + +### Repository Lifecycle + +Clone repos into workspace automatically. 
For shared repo workspaces, pooling is the default: + +```yaml +workspace: + repos: + - path: ./repo + source: + type: git + url: https://github.com/org/repo.git + checkout: + ref: main + ancestor: 1 # parent commit + clone: + depth: 10 + hooks: + after_each: + reset: fast # none | fast | strict + isolation: shared # shared | per_test + mode: pooled # pooled | temp | static + hooks: + enabled: true # set false to skip all hooks +``` + +- `source.type`: `git` (URL) or `local` (path) +- `checkout.resolve`: `remote` (ls-remote) or `local` +- `clone.depth`: shallow clone depth +- `clone.filter`: partial clone filter (e.g., `blob:none`) +- `clone.sparse`: sparse checkout paths array +- `mode`: `pooled` (default for shared repos), `temp`, or `static` +- `path`: workspace path used when `mode: static`; when empty/missing the workspace is auto-materialised (template copied + repos cloned); populated dirs are reused as-is +- `hooks.enabled`: boolean (default `true`); set `false` to skip all lifecycle hooks +- Pool reset defaults to `fast` (`git clean -fd`); use `--workspace-clean full` for strict reset (`git clean -fdx`) +- Pool entries are managed separately via `agentv workspace list` and `agentv workspace clean` +- `agentv workspace deps ` scans eval files and outputs a JSON manifest of required git repos (useful for CI pre-cloning) + +See https://agentv.dev/targets/configuration/#repository-lifecycle + +## Grader Types + +Configure via `assertions` array. Multiple graders produce a weighted average score. 
+ +### code_grader +```yaml +- name: format_check + type: code-grader + command: [uv, run, validate.py] + cwd: ./scripts # optional working directory + target: {} # optional: enable LLM target proxy (max_calls: 50) +``` +Contract: stdin JSON -> stdout JSON `{score, assertions: [{text, passed, evidence?}], reasoning}` +Input includes: `question`, `criteria`, `answer`, `reference_answer`, `output`, `trace`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config` +When a workspace is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace). +See docs at https://agentv.dev/graders/code-graders/ + +### llm_grader +```yaml +- name: quality + type: llm-grader + prompt: ./prompts/eval.md # markdown template or command config + target: grader_gpt_5_mini # optional: override the grader target for this grader + model: gpt-5-chat # optional model override + config: # passed to prompt templates as context.config + strictness: high +``` +Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, `{{input}}`, `{{expected_output}}`, `{{output}}`, `{{file_changes}}` +- Markdown templates: use `{{variable}}` syntax +- TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config` +- Use `target:` to run different `llm-grader` graders against different named LLM targets in the same eval (useful for grader panels / ensembles) + +### composite +```yaml +- name: gate + type: composite + assertions: + - name: safety + type: llm-grader + prompt: ./safety.md + - name: quality + type: llm-grader + aggregator: + type: weighted_average + weights: { safety: 0.3, quality: 0.7 } +``` +Aggregator types: `weighted_average`, `all_or_nothing`, `minimum`, `maximum`, `safety_gate` +- `safety_gate`: fails immediately if 
the named gate grader scores below threshold (default 1.0) + +### tool_trajectory +```yaml +- name: tool_check + type: tool-trajectory + mode: any_order # any_order | in_order | exact + minimums: # for any_order + knowledgeSearch: 2 + expected: # for in_order/exact + - tool: knowledgeSearch + args: { query: "search term" } # partial deep equality match + - tool: documentRetrieve + args: any # any arguments accepted + max_duration_ms: 5000 # per-tool latency assertion + - tool: summarize # omit args to skip argument checking +``` + +### field_accuracy +```yaml +- name: fields + type: field-accuracy + match_type: exact # exact | date | numeric_tolerance + numeric_tolerance: 0.01 # for numeric_tolerance match_type + aggregation: weighted_average # weighted_average | all_or_nothing +``` +Compares `output` fields against `expected_output` fields. + +### latency +```yaml +- name: speed + type: latency + max_ms: 5000 +``` + +### cost +```yaml +- name: budget + type: cost + max_usd: 0.10 +``` + +### token_usage +```yaml +- name: tokens + type: token-usage + max_total_tokens: 4000 +``` + +### execution_metrics +```yaml +- name: efficiency + type: execution-metrics + max_tool_calls: 10 # Maximum tool invocations + max_llm_calls: 5 # Maximum LLM calls (assistant messages) + max_tokens: 5000 # Maximum total tokens (input + output) + max_cost_usd: 0.05 # Maximum cost in USD + max_duration_ms: 30000 # Maximum execution duration + target_exploration_ratio: 0.6 # Target ratio of read-only tool calls + exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2) +``` +Declarative threshold-based checks on execution metrics. Only specified thresholds are checked. +Score is proportional: `passed / total` assertions. Missing data counts as a failed assertion. + +### contains +```yaml +- type: contains + value: "DENIED" + required: true +``` +Binary check: does output contain the substring? Name auto-generated if omitted. 
+
+### regex
+```yaml
+- type: regex
+  value: "\\d{3}-\\d{2}-\\d{4}"
+```
+Binary check: does output match the regex pattern?
+
+### equals
+```yaml
+- type: equals
+  value: "42"
+```
+Binary check: does output exactly equal the value (both trimmed)?
+
+### is_json
+```yaml
+- type: is-json
+  required: true
+```
+Binary check: is the output valid JSON?
+
+### rubrics
+```yaml
+- type: rubrics
+  criteria:
+    - id: accuracy
+      outcome: Correctly identifies the denied party
+      weight: 5.0
+    - id: reasoning
+      outcome: Provides clear reasoning
+      weight: 3.0
+```
+LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields.
+
+### rubrics (inline, deprecated)
+Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assertions` instead.
+See `references/rubric-evaluator.md` for score-range mode and scoring formula.
+
+## Execution Error Tolerance
+
+Control how the runner handles execution errors (infrastructure failures, not quality failures):
+
+```yaml
+execution:
+  fail_on_error: false # never halt (default)
+  # fail_on_error: true # halt on first execution error
+```
+
+When halted, remaining tests get `executionStatus: 'execution_error'` with `failureReasonCode: 'error_threshold_exceeded'`.
+
+## Suite-Level Quality Threshold
+
+Set a minimum mean score for the eval suite. If the mean quality score falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates.
+
+```yaml
+execution:
+  threshold: 0.8
+```
+
+CLI flag `--threshold 0.8` overrides the YAML value. Must be a number between 0 and 1. Mean score is computed from quality results only (execution errors excluded).
+
+The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as `<failure>`. When no threshold is set, JUnit defaults to 0.5.
+
+## CLI Commands
+
+```bash
+# Run evaluation (requires API keys)
+agentv eval <file> [--test-id <id>] [--target <name>] [--dry-run] [--threshold <0-1>]
+
+# Run with OTLP JSON file (importable by OTel backends)
+agentv eval <file> --otel-file traces/eval.otlp.json
+
+# Run a single assertion in isolation (no API keys needed)
+agentv eval assert <file> --agent-output "..." --agent-input "..."
+
+# Import agent transcripts for offline grading
+agentv import claude --session-id <session-id>
+
+# Re-run only execution errors from a previous run
+agentv eval <file> --retry-errors .agentv/results/runs/<run-id>/index.jsonl
+
+# Validate eval file
+agentv validate <file>
+
+# Compare results — N-way matrix from a canonical run manifest
+agentv compare .agentv/results/runs/<run-id>/index.jsonl
+agentv compare .agentv/results/runs/<run-id>/index.jsonl --baseline <target> # CI regression gate
+agentv compare .agentv/results/runs/<run-id>/index.jsonl --baseline <target> --candidate <target> # pairwise
+agentv compare .agentv/results/runs/<run-a>/index.jsonl .agentv/results/runs/<run-b>/index.jsonl
+
+# Author assertions directly in the eval file
+# Prefer simple assertions when they fit the criteria; use deterministic or LLM-based graders when needed
+agentv validate <file>
+```
+
+## Code Judge SDK
+
+Use `@agentv/eval` to build custom graders in TypeScript/JavaScript:
+
+### defineAssertion (recommended for custom checks)
+```typescript
+#!/usr/bin/env bun
+import { defineAssertion } from '@agentv/eval';
+
+export default defineAssertion(({ answer, trace }) => ({
+  pass: answer.length > 0 && (trace?.eventCount ?? 0) <= 10,
+  reasoning: 'Checks content exists and is efficient',
+}));
+```
+
+Assertions support both `pass: boolean` and `score: number` (0-1). If only `pass` is given, score is 1 (pass) or 0 (fail).
+
+### defineCodeGrader (full control)
+```typescript
+#!/usr/bin/env bun
+import { defineCodeGrader } from '@agentv/eval';
+
+export default defineCodeGrader(({ trace, answer }) => ({
+  score: trace?.eventCount <= 5 ?
1.0 : 0.5,
+  assertions: [
+    { text: 'Efficient tool usage', passed: (trace?.eventCount ?? 0) <= 5 },
+  ],
+}));
+```
+
+Both are used via `type: code-grader` in YAML with `command: [bun, run, grader.ts]`.
+
+### Convention-Based Discovery
+
+Place assertion files in `.agentv/assertions/` — they auto-register by filename:
+
+```
+.agentv/assertions/word-count.ts → type: word-count
+.agentv/assertions/sentiment.ts → type: sentiment
+```
+
+No `command:` needed in YAML — just use `type: <filename>`.
+
+## Programmatic API
+
+Use `evaluate()` from `@agentv/core` to run evals as a library:
+
+```typescript
+import { evaluate } from '@agentv/core';
+
+const { results, summary } = await evaluate({
+  tests: [
+    {
+      id: 'greeting',
+      input: 'Say hello',
+      assertions: [{ type: 'contains', value: 'hello' }],
+    },
+  ],
+  target: { provider: 'mock_agent' },
+});
+console.log(`${summary.passed}/${summary.total} passed`);
+```
+
+Supports inline tests (no YAML) or file-based via `specFile`.
+
+## defineConfig
+
+Type-safe project configuration in `agentv.config.ts`:
+
+```typescript
+import { defineConfig } from '@agentv/core';
+
+export default defineConfig({
+  execution: { workers: 5, maxRetries: 2 },
+  output: { format: 'jsonl', dir: './results' },
+  limits: { maxCostUsd: 10.0 },
+});
+```
+
+Auto-discovered from project root. Validated with Zod.
+
+## Scaffold Commands
+
+```bash
+agentv create assertion <name> # → .agentv/assertions/<name>.ts
+agentv create eval <name> # → evals/<name>.eval.yaml + <name>.cases.jsonl
+```
+
+## Skill Improvement Workflow
+
+For a complete guide to iterating on skills using evaluations — writing scenarios, running baselines, comparing results, and improving — see the [Skill Improvement Workflow](https://agentv.dev/guides/skill-improvement-workflow/) guide.
+## Human Review Checkpoint
+
+After running evals, perform a human review before iterating.
Create `feedback.json` in the results directory: + +```json +{ + "run_id": "2026-03-14T10-32-00_claude", + "reviewer": "engineer-name", + "timestamp": "2026-03-14T12:00:00Z", + "overall_notes": "Summary of observations", + "per_case": [ + { + "test_id": "test-id", + "verdict": "acceptable | needs_improvement | incorrect | flaky", + "notes": "Why this verdict", + "evaluator_overrides": { "code-grader:name": "Override note" }, + "workspace_notes": "Workspace state observations" + } + ] +} +``` + +Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "code-grader was too strict"). Use `workspace_notes` for observations about workspace state. + +Review workflow: run evals → inspect results (`agentv inspect show`) → write feedback → tune prompts/graders → re-run. + +Full guide: https://agentv.dev/guides/human-review/ + +## Schemas + +- Eval file: `references/eval-schema.json` +- Config: `references/config-schema.json` diff --git a/evals/self/skills/fixtures/agentv-governance-full.txt b/evals/self/skills/fixtures/agentv-governance-full.txt new file mode 100644 index 000000000..87ccc2095 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-governance-full.txt @@ -0,0 +1,545 @@ +--- +name: agentv-governance +description: >- + Author, edit, and lint `governance:` blocks in `*.eval.yaml` files. + Use when creating or updating evaluation suites that carry AI-governance metadata + (OWASP LLM Top 10, OWASP Agentic Top 10, MITRE ATLAS, EU AI Act, ISO 42001). + Also use non-interactively (e.g., from a GitHub Action) to lint changed eval files + and report violations against the rules in `references/lint-rules.md`. + Do NOT use for running evals or benchmarking — that belongs to agentv-bench. +--- + +# AgentV Compliance Skill + +Teaches AI agents how to author syntactically correct `governance:` blocks in AgentV +eval files, and how to lint them against known vocabulary rules. 
+ +## Dual mode + +**Authoring (interactive):** When a human or AI agent is editing a `*.eval.yaml` file +that contains or should contain a `governance:` block, this skill provides vocabulary, +valid values, and example shapes. Load it alongside `agentv-eval-writer` when building +red-team or compliance suites. + +**Linting (non-interactive / CI):** When invoked from a GitHub Action (see +`examples/governance/compliance-lint/`), this skill lints each changed `*.eval.yaml` file +against the rules in `references/lint-rules.md` and returns a structured JSON report. +The expected output format is: +```json +{ + "pass": true, + "violations": [ + { + "rule": "known_key", + "key": "risk_level", + "value": "high", + "message": "Unknown governance key 'risk_level'. Did you mean 'risk_tier'?", + "suggestion": "Replace 'risk_level' with 'risk_tier'." + } + ] +} +``` +`pass` is `true` when `violations` is empty. + +## Reference files + +| File | Purpose | +|------|---------| +| `references/governance-yaml-shape.md` | YAML shape, merge semantics, worked examples | +| `references/lint-rules.md` | Machine-readable rules applied during lint | +| `references/owasp-llm-top-10-2025.md` | LLM01–LLM10 canonical IDs and descriptions | +| `references/owasp-agentic-top-10-2025.md` | T01–T10 agentic-AI categories | +| `references/mitre-atlas.md` | Common AML.Txxxx technique IDs | +| `references/eu-ai-act-risk-tiers.md` | Four risk tiers + article references | +| `references/iso-42001-controls.md` | Curated ISO/IEC 42001:2023 controls for AI eval | + +## Quick authoring guide + +1. Check which risks this eval exercises using the reference files above. +2. Pick IDs from the relevant frameworks (`owasp_llm_top_10_2025`, `mitre_atlas`, etc.). +3. Set `risk_tier` using EU AI Act vocabulary (`prohibited | high | limited | minimal`). +4. Add `controls` as `-:` strings (e.g. `EU-AI-ACT-2024:Art.55`). +5. Run the lint rules from `references/lint-rules.md` against your block before committing. +6. 
See `references/governance-yaml-shape.md` for complete examples copied from real suites. + +--- references/eu-ai-act-risk-tiers.md --- +# EU AI Act — Risk Tiers + +**Valid values for the `risk_tier:` field.** + +Official source: Regulation (EU) 2024/1689 on Artificial Intelligence (EU AI Act) +Full text: https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689 + +## Allowed values + +| Value | EU AI Act category | Key articles | Description | +|-------|-------------------|-------------|-------------| +| `prohibited` | Prohibited AI practices | Art. 5 | AI systems whose risks are deemed unacceptable — banned outright. Examples: social scoring by public authorities, real-time remote biometric surveillance in public spaces, AI that exploits vulnerabilities of specific groups. | +| `high` | High-risk AI systems | Art. 6, Annex I–III | AI systems subject to mandatory conformity assessments, transparency, and human oversight. Examples: biometric identification, critical infrastructure, employment screening, access to education or essential services, law enforcement. | +| `limited` | Limited-risk AI systems | Art. 50 | AI systems with transparency obligations only. Examples: chatbots must disclose they are AI; deep-fake generators must mark synthetic media. | +| `minimal` | Minimal-risk AI systems | — | No mandatory obligations. Examples: spam filters, AI in video games. Voluntary codes of conduct encouraged. | + +## Usage notes + +- `risk_tier` is a scalar; only one value per governance block. +- The vocabulary is anchored to EU AI Act terminology. Some organizations use different + risk scales (e.g. NIST SP 800-30 `low | moderate | high | very_high`). When mapping + from another framework, choose the EU AI Act equivalent that best matches the impact. 
+- Combine `risk_tier: high` with `controls` referencing EU AI Act articles:
+  ```yaml
+  risk_tier: high
+  controls:
+    - EU-AI-ACT-2024:Art.55
+    - EU-AI-ACT-2024:Art.6
+  ```
+- `prohibited` tier should accompany test cases that specifically probe prohibited behaviors.
+  This does NOT mean the eval suite is itself prohibited — it means the suite tests whether
+  the system correctly refuses to engage in prohibited behaviors.
+
+## Article reference format
+
+Use `EU-AI-ACT-2024:Art.<n>
` in the `controls` array, e.g. `EU-AI-ACT-2024:Art.55`. +Article 55 covers general-purpose AI (GPAI) model obligations and transparency requirements. + +--- references/governance-yaml-shape.md --- +# Governance Block — YAML Shape and Examples + +## Field reference + +```yaml +governance: + schema_version: "1.0" # string, optional — version of this block's schema + owasp_llm_top_10_2025: [LLM01] # string[], optional — OWASP LLM Top 10 v2025 IDs + owasp_agentic_top_10_2025: [T01, T06] # string[], optional — OWASP Agentic AI Top 10 v2025 IDs + mitre_atlas: [AML.T0051] # string[], optional — MITRE ATLAS technique IDs + controls: [] # string[], optional — -: strings + risk_tier: high # string, optional — EU AI Act tier (see eu-ai-act-risk-tiers.md) + owner: security-team # string, optional — owning team or person +``` + +All fields are optional. Unknown keys pass through to JSONL output unchanged. + +## Control ID format + +The `controls` array accepts any string matching the pattern `-:`. +Custom organizational prefixes are valid: + +``` +NIST-AI-RMF-1.0:MEASURE-2.7 +EU-AI-ACT-2024:Art.55 +ISO-42001-2023:6.1.2 +INTERNAL-AI-POLICY-3.2:CTRL-7 +``` + +## Placement in eval files + +Governance blocks live in two places and are merged automatically: + +### 1. Suite-level (top-level key) + +Define once at the suite level and it will be merged into every case's `metadata.governance`: + +```yaml +name: redteam-llm01-prompt-injection +governance: &gov # YAML anchor for reuse in per-case overrides + schema_version: "1.0" + owasp_llm_top_10_2025: [LLM01] + mitre_atlas: [AML.T0051] + controls: + - NIST-AI-RMF-1.0:MEASURE-2.7 + - EU-AI-ACT-2024:Art.55 + risk_tier: high + owner: security-team + +tests: + - id: direct-ignore-previous + metadata: + governance: *gov # reference the anchor — identical to suite-level + ... +``` + +### 2. Per-case override with merge-key (`<<:`) + +Use YAML merge keys to inherit suite-level governance and add case-specific overrides. 
+Arrays from both sides are concatenated and deduplicated; scalar fields on the case win: + +```yaml + - id: indirect-tool-output + metadata: + governance: + <<: *gov + owasp_llm_top_10_2025: [LLM01, LLM06] # extends — case adds LLM06 to the inherited [LLM01] +``` + +## Merge semantics (how suite + case are combined) + +| Field type | Merge behavior | +|-----------|----------------| +| Arrays (`owasp_llm_top_10_2025`, `mitre_atlas`, `controls`) | Concatenate suite + case, deduplicate | +| Scalars (`risk_tier`, `owner`, `schema_version`) | Case value overrides suite value | + +## Complete example — from `examples/red-team/suites/llm01-prompt-injection.eval.yaml` + +```yaml +name: redteam-llm01-prompt-injection +governance: &gov + schema_version: "1.0" + owasp_llm_top_10_2025: [LLM01] + mitre_atlas: [AML.T0051] + controls: + - NIST-AI-RMF-1.0:MEASURE-2.7 + - EU-AI-ACT-2024:Art.55 + risk_tier: high + owner: security-team + +tests: + - id: direct-ignore-previous + metadata: + governance: *gov + ... + + - id: indirect-tool-output-document + metadata: + governance: + <<: *gov + owasp_llm_top_10_2025: [LLM01, LLM06] # case adds LLM06 + ... +``` + +## Complete example — from `examples/red-team/archetypes/coding-agent/suites/destructive-git.eval.yaml` + +```yaml +name: redteam-coder-destructive-git +governance: &gov + schema_version: "1.0" + owasp_llm_top_10_2025: [LLM06] + owasp_agentic_top_10_2025: [T01, T06] + mitre_atlas: [AML.T0051, AML.T0075] + controls: + - NIST-AI-RMF-1.0:MEASURE-2.7 + - EU-AI-ACT-2024:Art.55 + risk_tier: high + owner: security-team +``` + +## JSONL output + +The merged `governance` block is passed through verbatim to the JSONL result file under each +result's `metadata.governance` key. Downstream tools (jq pipelines, `.ai-register.yaml` +aggregators) consume it from there. The eval engine does not validate or transform the values. 
+
+--- references/iso-42001-controls.md ---
+# ISO/IEC 42001:2023 — AI Management System Controls
+
+**Curated subset of controls relevant to AI evaluation suites.**
+
+Official source: ISO/IEC 42001:2023 — Information technology — Artificial intelligence —
+Management system. Full standard available at https://www.iso.org/standard/81230.html
+
+ISO 42001 is a management-system standard (like ISO 27001 for information security) covering
+the governance, risk management, and operational controls for organizations that develop or
+deploy AI systems.
+
+## Control reference format
+
+Use `ISO-42001-2023:<clause>` in the `controls` array.
+
+## Relevant control areas for eval suites
+
+| Clause | Title | Relevance to evals |
+|--------|-------|-------------------|
+| 6.1 | Actions to address risks and opportunities | Risk identification for AI systems — align `risk_tier` with documented risk assessments. |
+| 6.1.2 | AI risk assessment | Formal risk assessment process; eval suites serve as evidence of risk measurement. |
+| 8.4 | AI system impact assessment | Assess potential societal impacts before deployment; red-team evals provide evidence. |
+| 8.5 | AI system life cycle | Controls for data, model, and deployment stages — align with suite test coverage. |
+| 9.1 | Monitoring, measurement, analysis and evaluation | Periodic eval runs as evidence of continuous monitoring. |
+| 9.1.1 | AI performance evaluation | Systematic measurement of AI output quality and safety properties. |
+| 10.2 | Nonconformity and corrective action | Failing evals trigger corrective action processes. |
+| A.2 | Policies for AI (Annex A) | Organizational AI use policies — `owner` field maps to the responsible team. |
+| A.5 | AI risk assessment (Annex A) | Documented risk assessment for each AI application. |
+| A.6 | AI system impact assessment (Annex A) | Broader societal-impact documentation.
| + +## Usage example + +```yaml +controls: + - ISO-42001-2023:6.1.2 # AI risk assessment + - ISO-42001-2023:9.1.1 # AI performance evaluation + - EU-AI-ACT-2024:Art.55 # GPAI transparency obligations +``` + +## Notes + +- ISO 42001 is certification-oriented; most teams will reference only a subset. + The clauses above are the ones most directly evidenced by running and storing eval results. +- For pure LLM / red-team suites, clauses 6.1.2, 8.4, and 9.1.1 are the most common references. +- Combine with NIST AI RMF controls (e.g. `NIST-AI-RMF-1.0:MEASURE-2.7`) when the organization + uses both frameworks. + +--- references/lint-rules.md --- +# Governance Block Lint Rules + +Rules applied when linting a `governance:` block in a `*.eval.yaml` file. +The CI Action (see `examples/governance/compliance-lint/`) passes this file to Claude +together with the governance block to extract and returns a structured report. + +## How to apply these rules + +For each `governance:` block found in a changed eval file: + +1. Extract the block (top-level `governance:` key, or `metadata.governance` in a test case). +2. Apply each rule below in order. +3. Collect all violations. +4. Return the structured JSON report described in `SKILL.md`. + +A block with zero violations produces `{ "pass": true, "violations": [] }`. + +--- + +## Rule 1 — known_key + +**What:** Every key in the `governance:` object must be in the allowed-key list. + +**Allowed keys:** `schema_version`, `owasp_llm_top_10_2025`, `owasp_agentic_top_10_2025`, +`mitre_atlas`, `controls`, `risk_tier`, `owner` + +**On violation:** +```json +{ + "rule": "known_key", + "key": "", + "value": "", + "message": "Unknown governance key ''. Did you mean ''?", + "suggestion": "Replace '' with ''." 
+} +``` + +Common typos and their corrections: +- `risk_level` → `risk_tier` +- `owasp_top_10` → `owasp_llm_top_10_2025` +- `owasp_llm` → `owasp_llm_top_10_2025` +- `atlas` → `mitre_atlas` +- `mitre` → `mitre_atlas` +- `control` (singular) → `controls` + +--- + +## Rule 2 — owasp_llm_ids + +**What:** Every string in `owasp_llm_top_10_2025` must match the pattern `LLM\d{2}` (LLM01–LLM10). + +**On violation:** +```json +{ + "rule": "owasp_llm_ids", + "key": "owasp_llm_top_10_2025", + "value": "", + "message": "Invalid OWASP LLM ID ''. Expected LLM01–LLM10.", + "suggestion": "Use a valid ID from references/owasp-llm-top-10-2025.md." +} +``` + +--- + +## Rule 3 — owasp_agentic_ids + +**What:** Every string in `owasp_agentic_top_10_2025` must match the pattern `T\d{2}` (T01–T10). + +**On violation:** +```json +{ + "rule": "owasp_agentic_ids", + "key": "owasp_agentic_top_10_2025", + "value": "", + "message": "Invalid OWASP Agentic ID ''. Expected T01–T10.", + "suggestion": "Use a valid ID from references/owasp-agentic-top-10-2025.md." +} +``` + +--- + +## Rule 4 — mitre_atlas_ids + +**What:** Every string in `mitre_atlas` must match the pattern `AML\.T\d{4}(\.\d{3})?`. + +**On violation:** +```json +{ + "rule": "mitre_atlas_ids", + "key": "mitre_atlas", + "value": "", + "message": "Invalid MITRE ATLAS ID ''. Expected AML.Txxxx or AML.Txxxx.xxx.", + "suggestion": "Check https://atlas.mitre.org/techniques/ for valid IDs." +} +``` + +--- + +## Rule 5 — control_id_format + +**What:** Every string in `controls` must match the pattern `^[A-Z0-9][A-Z0-9_-]+-[A-Z0-9._-]+:[A-Z0-9._-]+$` +(i.e. `-:` where all three parts are present and non-empty). + +Examples of valid control IDs: +- `NIST-AI-RMF-1.0:MEASURE-2.7` +- `EU-AI-ACT-2024:Art.55` +- `ISO-42001-2023:6.1.2` +- `INTERNAL-POLICY-2.1:CTRL-99` + +**On violation:** +```json +{ + "rule": "control_id_format", + "key": "controls", + "value": "", + "message": "Malformed control ID ''. 
Expected format: -:.", + "suggestion": "Use the format -:, e.g. 'EU-AI-ACT-2024:Art.55'." +} +``` + +--- + +## Rule 6 — risk_tier_value + +**What:** `risk_tier`, when present, must be one of: +`prohibited`, `high`, `limited`, `minimal` + +**On violation:** +```json +{ + "rule": "risk_tier_value", + "key": "risk_tier", + "value": "", + "message": "Unknown risk_tier value ''. Allowed: prohibited, high, limited, minimal.", + "suggestion": "Use one of the EU AI Act risk tiers from references/eu-ai-act-risk-tiers.md." +} +``` + +Common mistakes: +- `high_risk` → `high` +- `limited_risk` → `limited` +- `minimal_risk` → `minimal` +- `low` → `minimal` (not an EU AI Act term) + +--- + +## Rule 7 — array_not_empty + +**What:** If a framework array key is present (`owasp_llm_top_10_2025`, `owasp_agentic_top_10_2025`, +`mitre_atlas`, `controls`), it must not be an empty array. + +**On violation:** +```json +{ + "rule": "array_not_empty", + "key": "", + "value": [], + "message": "Empty array for ''. Either populate it or remove the key.", + "suggestion": "Add at least one ID, or remove the key entirely." +} +``` + +--- + +## Severity + +All rules above are **errors** (contribute to `pass: false`). There are no warnings in this +schema — an unknown key is always wrong, and empty arrays are always wrong. This matches the +intent: the block should only be present when it contains real, validated tags. + +--- references/mitre-atlas.md --- +# MITRE ATLAS — AI/ML Threat Techniques + +**Canonical IDs for use in `mitre_atlas:` arrays.** + +Official source: https://atlas.mitre.org/ + +MITRE ATLAS (Adversarial Threat Landscape for Artificial-Intelligence Systems) documents +adversarial ML and AI attack techniques using the same taxonomy style as MITRE ATT&CK. +IDs follow the pattern `AML.Txxxx` for techniques and `AML.Txxxx.xxx` for sub-techniques. 
+ +## Techniques most relevant to LLM / agentic-AI evaluation + +| ID | Name | Relevant OWASP IDs | +|----|------|-------------------| +| AML.T0051 | LLM Prompt Injection | LLM01, T01 | +| AML.T0054 | LLM Jailbreak | LLM01 | +| AML.T0056 | LLM Meta Prompt Extraction | LLM07 | +| AML.T0057 | LLM Plugin Compromise | LLM03, T09 | +| AML.T0058 | LLM Data Leakage | LLM02 | +| AML.T0068 | Training Data Poisoning | LLM04 | +| AML.T0075 | Manipulate LLM Inputs | LLM01, T01 | + +## Sub-techniques + +Sub-techniques extend a base ID with a period-separated suffix, e.g.: +- `AML.T0051.000` — Direct Prompt Injection +- `AML.T0051.001` — Indirect Prompt Injection + +Use the base ID if the test covers the whole technique class; use sub-techniques for +more precise tagging when the attack method is specific. + +## Usage notes + +- List IDs as strings in an array: `mitre_atlas: [AML.T0051, AML.T0075]` +- Cross-reference with OWASP IDs when both frameworks cover the same attack: + a suite testing indirect prompt injection via tool output should tag + `owasp_llm_top_10_2025: [LLM01]` and `mitre_atlas: [AML.T0051]`. +- For the full technique catalog, browse https://atlas.mitre.org/techniques/ + +--- references/owasp-agentic-top-10-2025.md --- +# OWASP Top 10 for Agentic AI v2025 + +**Canonical IDs for use in `owasp_agentic_top_10_2025:` arrays.** + +Official source: https://owasp.org/www-project-top-10-for-large-language-model-applications/ +(Agentic AI supplement — see the "Agentic AI" section of the OWASP LLM project) + +| ID | Name | One-line description | +|----|------|----------------------| +| T01 | Prompt Injection for Agentic Systems | Attacker plants instructions in agent inputs, tool results, or retrieved content to redirect agent behavior. | +| T02 | Memory Poisoning | Adversarial content is written to agent memory (short- or long-term) to influence future decisions. 
| +| T03 | Data Exfiltration | Agent is manipulated into leaking sensitive data through tool calls, network requests, or outputs. | +| T04 | Privilege Escalation | Agent acquires or is tricked into using permissions beyond its intended scope. | +| T05 | Misconfigured Agent Networks | Overly permissive trust between orchestrating and sub-agents enables abuse. | +| T06 | Tool and Plugin Misuse | Agent uses legitimate tools (bash, file I/O, API calls) outside their intended purpose or without authorization. | +| T07 | Insecure Credential Storage | Agent stores or transmits credentials in memory, files, or outputs where they can be captured. | +| T08 | Unsafe Agent-to-Agent Communication | Messages between agents are unvalidated, unencrypted, or susceptible to injection. | +| T09 | Supply Chain Compromise | Malicious code in agent plugins, dependencies, or retrieved skill definitions. | +| T10 | Lack of Accountability | Agent actions are not logged or attributable, making audit and incident response impossible. | + +## Usage notes + +- Combine with `owasp_llm_top_10_2025` IDs for cases that bridge both lists. + Example: an indirect-prompt-injection attack is LLM01 + T01 + T06 (tool misuse). +- `T01` (Prompt Injection) and `LLM01` (Prompt Injection) are closely related but distinct: + LLM01 covers LLM-level injection; T01 covers the agent-orchestration dimension. +- List multiple IDs when a test case exercises more than one category: + `owasp_agentic_top_10_2025: [T01, T06]` + +--- references/owasp-llm-top-10-2025.md --- +# OWASP LLM Top 10 v2025 + +**Canonical IDs for use in `owasp_llm_top_10_2025:` arrays.** + +Official source: https://owasp.org/www-project-top-10-for-large-language-model-applications/ + +| ID | Name | One-line description | +|----|------|----------------------| +| LLM01 | Prompt Injection | Attacker manipulates LLM behavior via crafted inputs (direct or indirect). 
| +| LLM02 | Sensitive Information Disclosure | LLM reveals confidential data, system prompts, or PII in its output. | +| LLM03 | Supply Chain | Compromised components — plugins, datasets, pre-trained weights — affect the LLM pipeline. | +| LLM04 | Data and Model Poisoning | Training or fine-tuning data is tampered with to alter model behavior. | +| LLM05 | Improper Output Handling | LLM output is passed unsanitized to downstream systems (XSS, SSRF, code injection). | +| LLM06 | Excessive Agency | LLM acts on permissions or capabilities beyond what the task requires. | +| LLM07 | System Prompt Leakage | The system prompt or internal context is exposed to the user or a third party. | +| LLM08 | Vector and Embedding Weaknesses | Adversarial manipulation of embedding stores used for retrieval (RAG poisoning). | +| LLM09 | Misinformation | LLM generates plausible but factually incorrect content that causes harm. | +| LLM10 | Unbounded Consumption | LLM use is abused to exhaust resources — tokens, cost, rate limits, or compute. | + +## Usage notes + +- Use as many IDs as apply; list them in an array: `owasp_llm_top_10_2025: [LLM01, LLM06]` +- IDs are version-anchored. When OWASP releases a new version, a new field + (`owasp_llm_top_10_2026`) will be added rather than redefining these IDs. +- Combine with `mitre_atlas` IDs for technique-level tagging. diff --git a/evals/self/skills/fixtures/agentv-governance.txt b/evals/self/skills/fixtures/agentv-governance.txt new file mode 100644 index 000000000..1b942c1f6 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-governance.txt @@ -0,0 +1,63 @@ +--- +name: agentv-governance +description: >- + Author, edit, and lint `governance:` blocks in `*.eval.yaml` files. + Use when creating or updating evaluation suites that carry AI-governance metadata + (OWASP LLM Top 10, OWASP Agentic Top 10, MITRE ATLAS, EU AI Act, ISO 42001). 
+ Also use non-interactively (e.g., from a GitHub Action) to lint changed eval files + and report violations against the rules in `references/lint-rules.md`. + Do NOT use for running evals or benchmarking — that belongs to agentv-bench. +--- + +# AgentV Compliance Skill + +Teaches AI agents how to author syntactically correct `governance:` blocks in AgentV +eval files, and how to lint them against known vocabulary rules. + +## Dual mode + +**Authoring (interactive):** When a human or AI agent is editing a `*.eval.yaml` file +that contains or should contain a `governance:` block, this skill provides vocabulary, +valid values, and example shapes. Load it alongside `agentv-eval-writer` when building +red-team or compliance suites. + +**Linting (non-interactive / CI):** When invoked from a GitHub Action (see +`examples/governance/compliance-lint/`), this skill lints each changed `*.eval.yaml` file +against the rules in `references/lint-rules.md` and returns a structured JSON report. +The expected output format is: +```json +{ + "pass": true, + "violations": [ + { + "rule": "known_key", + "key": "risk_level", + "value": "high", + "message": "Unknown governance key 'risk_level'. Did you mean 'risk_tier'?", + "suggestion": "Replace 'risk_level' with 'risk_tier'." + } + ] +} +``` +`pass` is `true` when `violations` is empty. 
+ +## Reference files + +| File | Purpose | +|------|---------| +| `references/governance-yaml-shape.md` | YAML shape, merge semantics, worked examples | +| `references/lint-rules.md` | Machine-readable rules applied during lint | +| `references/owasp-llm-top-10-2025.md` | LLM01–LLM10 canonical IDs and descriptions | +| `references/owasp-agentic-top-10-2025.md` | T01–T10 agentic-AI categories | +| `references/mitre-atlas.md` | Common AML.Txxxx technique IDs | +| `references/eu-ai-act-risk-tiers.md` | Four risk tiers + article references | +| `references/iso-42001-controls.md` | Curated ISO/IEC 42001:2023 controls for AI eval | + +## Quick authoring guide + +1. Check which risks this eval exercises using the reference files above. +2. Pick IDs from the relevant frameworks (`owasp_llm_top_10_2025`, `mitre_atlas`, etc.). +3. Set `risk_tier` using EU AI Act vocabulary (`prohibited | high | limited | minimal`). +4. Add `controls` as `-:` strings (e.g. `EU-AI-ACT-2024:Art.55`). +5. Run the lint rules from `references/lint-rules.md` against your block before committing. +6. See `references/governance-yaml-shape.md` for complete examples copied from real suites. diff --git a/evals/self/skills/fixtures/agentv-onboarding-full.txt b/evals/self/skills/fixtures/agentv-onboarding-full.txt new file mode 100644 index 000000000..a042d4721 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-onboarding-full.txt @@ -0,0 +1,63 @@ +--- +name: agentv-onboarding +description: Bootstrap AgentV in the current workspace after plugin-manager install. Ensures CLI availability, runs workspace init, and verifies setup artifacts. +--- + +# AgentV Onboarding + +Use this skill when the user asks to set up AgentV in a repository. + +## Goal + +Set up AgentV in the current workspace: +- ensure the `agentv` CLI is available (install if needed) +- initialize workspace files +- verify setup artifacts and report status + +## Workflow + +### 1. 
Resolve Script Path + +Find the directory that contains this `SKILL.md`, then resolve script paths relative to it. + +Packaged scripts: +- `scripts/onboard-agentv.sh` for bash/zsh +- `scripts/onboard-agentv.ps1` for PowerShell + +### 2. Run the Platform Script + +Run from the repository root where AgentV should be initialized. + +POSIX shells: + +```bash +bash /scripts/onboard-agentv.sh +``` + +PowerShell: + +```powershell +pwsh -File /scripts/onboard-agentv.ps1 +``` + +If `pwsh` is unavailable on Windows: + +```powershell +powershell -ExecutionPolicy Bypass -File /scripts/onboard-agentv.ps1 +``` + +### 3. Handle Errors + +If the script fails, report the exact error and stop. Do not claim setup succeeded. + +### 4. Report Outcome Clearly + +Summarize: +- `agentv` version in use +- whether CLI was installed during this run +- whether `agentv init` completed +- whether setup verification passed + +## Re-run Behavior + +Re-running is safe. The scripts run `agentv init`, and if setup artifacts are still missing they rerun once automatically before failing. diff --git a/evals/self/skills/fixtures/agentv-onboarding.txt b/evals/self/skills/fixtures/agentv-onboarding.txt new file mode 100644 index 000000000..a042d4721 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-onboarding.txt @@ -0,0 +1,63 @@ +--- +name: agentv-onboarding +description: Bootstrap AgentV in the current workspace after plugin-manager install. Ensures CLI availability, runs workspace init, and verifies setup artifacts. +--- + +# AgentV Onboarding + +Use this skill when the user asks to set up AgentV in a repository. + +## Goal + +Set up AgentV in the current workspace: +- ensure the `agentv` CLI is available (install if needed) +- initialize workspace files +- verify setup artifacts and report status + +## Workflow + +### 1. Resolve Script Path + +Find the directory that contains this `SKILL.md`, then resolve script paths relative to it. 
+ +Packaged scripts: +- `scripts/onboard-agentv.sh` for bash/zsh +- `scripts/onboard-agentv.ps1` for PowerShell + +### 2. Run the Platform Script + +Run from the repository root where AgentV should be initialized. + +POSIX shells: + +```bash +bash /scripts/onboard-agentv.sh +``` + +PowerShell: + +```powershell +pwsh -File /scripts/onboard-agentv.ps1 +``` + +If `pwsh` is unavailable on Windows: + +```powershell +powershell -ExecutionPolicy Bypass -File /scripts/onboard-agentv.ps1 +``` + +### 3. Handle Errors + +If the script fails, report the exact error and stop. Do not claim setup succeeded. + +### 4. Report Outcome Clearly + +Summarize: +- `agentv` version in use +- whether CLI was installed during this run +- whether `agentv init` completed +- whether setup verification passed + +## Re-run Behavior + +Re-running is safe. The scripts run `agentv init`, and if setup artifacts are still missing they rerun once automatically before failing. diff --git a/evals/self/skills/fixtures/agentv-trace-analyst-full.txt b/evals/self/skills/fixtures/agentv-trace-analyst-full.txt new file mode 100644 index 000000000..6205f85e0 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-trace-analyst-full.txt @@ -0,0 +1,145 @@ +--- +name: agentv-trace-analyst +description: >- + Analyze AgentV evaluation traces and result JSONL files using `agentv inspect` and `agentv compare` CLI commands. + Use when asked to inspect AgentV eval results, find regressions between AgentV evaluation runs, + identify failure patterns in AgentV trace data, analyze tool trajectories, or compute cost/latency/score statistics + from AgentV result files. + Do NOT use for benchmarking skill trigger accuracy, analyzing skill-creator eval performance, + or measuring skill description quality — those tasks belong to the skill-creator skill. +--- + +# AgentV Trace Analyst + +Analyze evaluation traces headlessly using `agentv inspect` primitives and `jq`. 
+ +## Primitives + +```bash +# List result files (most recent first) +agentv inspect list [--limit N] [--format json|table] + +# Show results with trace details +agentv inspect show [--test-id ] [--tree] [--format json|table] + +# Percentile statistics +agentv inspect stats [--group-by target|suite|test-id] [--format json|table] + +# A/B comparison between runs +agentv compare [--threshold 0.1] [--format json|table] +``` + +## Analysis Workflow + +### 1. Discover results + +```bash +agentv inspect list +``` + +Pick the result file to analyze. Most recent is first. + +### 2. Get overview + +```bash +agentv inspect stats +``` + +Read the percentile table. Key signals: +- **score p50 < 0.8**: Significant quality issues +- **latency p90 > 30s**: Performance bottleneck +- **cost p99 spike**: Outlier cost tests to investigate +- **tool_calls p90 >> p50**: Some tests are much chattier + +### 3. Investigate failures + +```bash +agentv inspect show --format json | jq '[.[] | select(.score < 0.8) | {test_id, score, assertions: [.assertions[] | select(.passed | not)], trace: {tools: (.trace.tool_calls | keys)}, duration_ms, cost_usd}]' +``` + +For each failing test, examine: +- **assertions (failed)**: What criteria were not met? (filter for `passed: false`) +- **trace.tool_calls**: Did the agent use expected tools? +- **duration_ms**: Did it time out or run too long? +- **reasoning**: Why did the grader score it low? + +### 4. Inspect specific tests + +```bash +# Flat view with trace summary +agentv inspect show --test-id + +# Tree view (if output messages available) +agentv inspect show --test-id --tree +``` + +The tree view shows the agent's execution path — LLM calls interspersed with tool invocations. Look for: +- **Excessive tool calls**: Agent looping or exploring unnecessarily +- **Missing tools**: Expected tool not called +- **Long durations**: Specific tool calls that are slow + +### 5. 
Compare runs + +```bash +agentv compare +``` + +Look for: +- **Wins vs losses**: Net improvement or regression? +- **Mean delta**: Overall direction of change +- **Per-test deltas**: Which tests regressed? + +### 6. Group analysis + +```bash +# By target provider +agentv inspect stats --group-by target + +# By suite +agentv inspect stats --group-by suite +``` + +Compare providers side-by-side: which is cheaper, faster, more accurate? + +## Advanced Queries with jq + +All commands support `--format json` for piping to `jq`: + +```bash +# Top 3 most expensive tests +agentv inspect show --format json \ + | jq 'sort_by(-.cost_usd) | .[0:3] | .[] | {test_id, cost: .cost_usd, score}' + +# Tests where token usage exceeds 10k +agentv inspect show --format json \ + | jq '[.[] | select(.token_usage.input + .token_usage.output > 10000) | {test_id, tokens: (.token_usage.input + .token_usage.output)}]' + +# Score distribution by suite +agentv inspect show --format json \ + | jq 'group_by(.suite) | .[] | {suite: .[0].suite, count: length, avg_score: ([.[].score] | add / length)}' + +# Tool usage frequency across all tests +agentv inspect show --format json \ + | jq '[.[].trace.tool_calls // {} | to_entries[]] | group_by(.key) | .[] | {tool: .[0].key, total_calls: ([.[].value] | add)}' + +# Find regressions > 0.1 between two runs +agentv compare baseline.jsonl candidate.jsonl --format json \ + | jq '.matched[] | select(.delta < -0.1) | {test_id: .testId, delta, from: .score1, to: .score2}' +``` + +## Reasoning Patterns + +When analyzing traces, think about: + +1. **Efficiency**: Are tool calls/tokens proportional to task complexity? High tokens-per-tool may indicate verbose prompts or unnecessary context. + +2. **Error patterns**: Do failures cluster by target, suite, or tool usage? Common patterns: + - Tool errors → agent can't access required resources + - High LLM calls with low tool calls → agent stuck in reasoning loop + - Missing tool calls → wrong tool routing + +3. 
**Cost optimization**: Identify tests with high cost but acceptable scores — can they use a cheaper model? Compare `--group-by target` stats. + +4. **Latency distribution**: P50 vs P99 spread indicates consistency. Large spread means unpredictable performance — investigate P99 outliers. + +5. **Regression detection**: After a prompt/config change, compare before/after. Mean delta > 0 is good, but check individual test regressions — a few large losses can hide behind many small wins. diff --git a/evals/self/skills/fixtures/agentv-trace-analyst.txt b/evals/self/skills/fixtures/agentv-trace-analyst.txt new file mode 100644 index 000000000..6205f85e0 --- /dev/null +++ b/evals/self/skills/fixtures/agentv-trace-analyst.txt @@ -0,0 +1,145 @@ +--- +name: agentv-trace-analyst +description: >- + Analyze AgentV evaluation traces and result JSONL files using `agentv inspect` and `agentv compare` CLI commands. + Use when asked to inspect AgentV eval results, find regressions between AgentV evaluation runs, + identify failure patterns in AgentV trace data, analyze tool trajectories, or compute cost/latency/score statistics + from AgentV result files. + Do NOT use for benchmarking skill trigger accuracy, analyzing skill-creator eval performance, + or measuring skill description quality — those tasks belong to the skill-creator skill. +--- + +# AgentV Trace Analyst + +Analyze evaluation traces headlessly using `agentv inspect` primitives and `jq`. + +## Primitives + +```bash +# List result files (most recent first) +agentv inspect list [--limit N] [--format json|table] + +# Show results with trace details +agentv inspect show [--test-id ] [--tree] [--format json|table] + +# Percentile statistics +agentv inspect stats [--group-by target|suite|test-id] [--format json|table] + +# A/B comparison between runs +agentv compare [--threshold 0.1] [--format json|table] +``` + +## Analysis Workflow + +### 1. 
Discover results

```bash
agentv inspect list
```

Pick the result file to analyze. Most recent is first.

### 2. Get overview

```bash
agentv inspect stats <results-file>
```

Read the percentile table. Key signals:
- **score p50 < 0.8**: Significant quality issues
- **latency p90 > 30s**: Performance bottleneck
- **cost p99 spike**: Outlier cost tests to investigate
- **tool_calls p90 >> p50**: Some tests are much chattier

### 3. Investigate failures

```bash
agentv inspect show <results-file> --format json | jq '[.[] | select(.score < 0.8) | {test_id, score, assertions: [.assertions[] | select(.passed | not)], trace: {tools: (.trace.tool_calls | keys)}, duration_ms, cost_usd}]'
```

For each failing test, examine:
- **assertions (failed)**: What criteria were not met? (filter for `passed: false`)
- **trace.tool_calls**: Did the agent use expected tools?
- **duration_ms**: Did it time out or run too long?
- **reasoning**: Why did the grader score it low?

### 4. Inspect specific tests

```bash
# Flat view with trace summary
agentv inspect show <results-file> --test-id <test-id>

# Tree view (if output messages available)
agentv inspect show <results-file> --test-id <test-id> --tree
```

The tree view shows the agent's execution path — LLM calls interspersed with tool invocations. Look for:
- **Excessive tool calls**: Agent looping or exploring unnecessarily
- **Missing tools**: Expected tool not called
- **Long durations**: Specific tool calls that are slow

### 5. Compare runs

```bash
agentv compare <baseline-file> <candidate-file>
```

Look for:
- **Wins vs losses**: Net improvement or regression?
- **Mean delta**: Overall direction of change
- **Per-test deltas**: Which tests regressed?

### 6. Group analysis

```bash
# By target provider
agentv inspect stats <results-file> --group-by target

# By suite
agentv inspect stats <results-file> --group-by suite
```

Compare providers side-by-side: which is cheaper, faster, more accurate?
+
## Advanced Queries with jq

All commands support `--format json` for piping to `jq`:

```bash
# Top 3 most expensive tests
agentv inspect show <results-file> --format json \
  | jq 'sort_by(-.cost_usd) | .[0:3] | .[] | {test_id, cost: .cost_usd, score}'

# Tests where token usage exceeds 10k
agentv inspect show <results-file> --format json \
  | jq '[.[] | select(.token_usage.input + .token_usage.output > 10000) | {test_id, tokens: (.token_usage.input + .token_usage.output)}]'

# Score distribution by suite
agentv inspect show <results-file> --format json \
  | jq 'group_by(.suite) | .[] | {suite: .[0].suite, count: length, avg_score: ([.[].score] | add / length)}'

# Tool usage frequency across all tests
agentv inspect show <results-file> --format json \
  | jq '[.[].trace.tool_calls // {} | to_entries[]] | group_by(.key) | .[] | {tool: .[0].key, total_calls: ([.[].value] | add)}'

# Find regressions > 0.1 between two runs
agentv compare baseline.jsonl candidate.jsonl --format json \
  | jq '.matched[] | select(.delta < -0.1) | {test_id: .testId, delta, from: .score1, to: .score2}'
```

## Reasoning Patterns

When analyzing traces, think about:

1. **Efficiency**: Are tool calls/tokens proportional to task complexity? High tokens-per-tool may indicate verbose prompts or unnecessary context.

2. **Error patterns**: Do failures cluster by target, suite, or tool usage? Common patterns:
   - Tool errors → agent can't access required resources
   - High LLM calls with low tool calls → agent stuck in reasoning loop
   - Missing tool calls → wrong tool routing

3. **Cost optimization**: Identify tests with high cost but acceptable scores — can they use a cheaper model? Compare `--group-by target` stats.

4. **Latency distribution**: P50 vs P99 spread indicates consistency. Large spread means unpredictable performance — investigate P99 outliers.

5. **Regression detection**: After a prompt/config change, compare before/after.
Mean delta > 0 is good, but check individual test regressions — a few large losses can hide behind many small wins. diff --git a/evals/self/skills/fixtures/skills-get-nonexistent.txt b/evals/self/skills/fixtures/skills-get-nonexistent.txt new file mode 100644 index 000000000..c9f192fed --- /dev/null +++ b/evals/self/skills/fixtures/skills-get-nonexistent.txt @@ -0,0 +1,2 @@ +Error: skill 'does-not-exist' not found +Available skills: agentv-bench, agentv-eval-review, agentv-eval-writer, agentv-governance, agentv-onboarding, agentv-trace-analyst diff --git a/evals/self/skills/fixtures/skills-list-all.txt b/evals/self/skills/fixtures/skills-list-all.txt new file mode 100644 index 000000000..921531f39 --- /dev/null +++ b/evals/self/skills/fixtures/skills-list-all.txt @@ -0,0 +1 @@ +{"success":true,"data":["agentv-bench","agentv-eval-review","agentv-eval-writer","agentv-governance","agentv-onboarding","agentv-trace-analyst"]} diff --git a/evals/self/skills/output-correctness.eval.yaml b/evals/self/skills/output-correctness.eval.yaml new file mode 100644 index 000000000..ce43f69c4 --- /dev/null +++ b/evals/self/skills/output-correctness.eval.yaml @@ -0,0 +1,176 @@ +description: >- + Tests whether agent-produced output is structurally and factually correct + given a bundled-skill fixture as ground truth — covers eval YAML structure, + CLI command syntax, fixture-content accuracy, and grader configuration. + +tags: [agent, skills] + +tests: + - id: yaml-valid-structure + criteria: Agent produces eval YAML with the canonical AgentV fields. + input: + - role: user + content: + - type: file + value: fixtures/agentv-eval-writer.txt + - type: text + value: | + Using the agentv-eval-writer skill above as your reference, + produce a minimal but valid `.eval.yaml` snippet (one `description` + and one test in `tests:`) for an eval that grades whether an LLM + correctly summarises a paragraph. + + Output **only** the YAML — no surrounding prose or fences. 
+ assertions: + - type: contains + value: "description:" + - type: contains + value: "tests:" + - type: regex + value: "-\\s+id:\\s+\\S" + - type: rubrics + criteria: + - "Has top-level `description:` and `tests:` keys" + - "At least one test under `tests:` has both `id:` and either `criteria:` or `input:` (or both)" + - Uses snake_case keys (no camelCase like `testId` or `expectedOutput`) + - YAML is syntactically valid (consistent indentation, no tab/space mixing) + + - id: yaml-assertion-types-valid + criteria: Agent uses canonical assertion type names from the AgentV registry. + input: + - role: user + content: + - type: file + value: fixtures/agentv-eval-writer.txt + - type: text + value: | + Using the agentv-eval-writer skill as reference, write a single + test entry under `tests:` that uses **three** assertions: + one `contains`, one `regex`, and one `rubrics` (with at least + two criteria). Output only the YAML for the test entry. + assertions: + - type: regex + value: "type:\\s+contains" + - type: regex + value: "type:\\s+regex" + - type: regex + value: "type:\\s+rubrics" + - type: rubrics + criteria: + - "Each assertion has both a `type:` and the field that type expects (`value:` for contains/regex, `criteria:` list for rubrics)" + - Does not invent assertion types that are not in the agentv-eval-writer skill (e.g., `like`, `match`, `expect`) + - Uses kebab-case (e.g., `llm-grader`, `is-json`) not snake_case (e.g., `llm_grader`) for any multi-word grader types referenced + + - id: cli-command-correct-syntax + criteria: Agent produces correct `agentv skills` CLI commands with valid subcommands and flags. + input: + - role: user + content: + - type: file + value: fixtures/agentv-bench.txt + - type: text + value: | + Give me three shell commands, one per line, that exercise the + `agentv skills` subcommand: + 1. List all bundled skills as JSON. + 2. Print the agentv-bench skill including its references and templates. + 3. 
Print the on-disk path of the agentv-bench skill. + assertions: + - type: regex + value: "agentv\\s+skills\\s+list\\s+--json" + - type: regex + value: "agentv\\s+skills\\s+get\\s+agentv-bench\\s+--full" + - type: regex + value: "agentv\\s+skills\\s+path\\s+agentv-bench" + + - id: fixture-content-accurate + criteria: Agent's description of a skill matches the skill's actual SKILL.md content. + input: + - role: user + content: + - type: file + value: fixtures/agentv-trace-analyst.txt + - type: text + value: | + Read the agentv-trace-analyst skill above. In one paragraph, + describe what this skill does and when it should be invoked. + Quote at most one short phrase from the skill text; do not + invent capabilities the skill does not mention. + assertions: + - type: rubrics + criteria: + - Description accurately reflects what agentv-trace-analyst does (analysing eval traces and failure modes) + - Does not claim the skill writes or runs evals (those are agentv-eval-writer and agentv-bench respectively) + - Any quoted phrase appears verbatim in the fixture + - Does not hallucinate sub-features not present in the SKILL.md + + - id: skill-list-complete + criteria: Agent lists all six bundled skills when given the JSON fixture. + input: + - role: user + content: + - type: file + value: fixtures/skills-list-all.txt + - type: text + value: | + The JSON above is the output of `agentv skills list --json`. + Reproduce the list of skill names as a markdown bulleted list. 
+ assertions: + - type: contains + value: agentv-bench + - type: contains + value: agentv-eval-review + - type: contains + value: agentv-eval-writer + - type: contains + value: agentv-governance + - type: contains + value: agentv-onboarding + - type: contains + value: agentv-trace-analyst + - type: rubrics + criteria: + - Lists exactly the six skill names from the JSON, no more and no fewer + - Does not invent additional skill names not present in the fixture + + - id: error-message-accurate + criteria: Agent correctly interprets the not-found error and proposes a recoverable next step. + input: + - role: user + content: + - type: file + value: fixtures/skills-get-nonexistent.txt + - type: text + value: | + The output above came from running `agentv skills get does-not-exist`. + In two sentences, explain (a) what error the user hit and + (b) the single most useful command to run next. + assertions: + - type: rubrics + criteria: + - Identifies that `does-not-exist` is not a recognised skill name + - "Proposes either `agentv skills list` or selecting one of the names from the error's \"Available skills:\" line" + - Does not blame missing installation or recommend re-running `npm install -g agentv` + + - id: grader-config-valid + criteria: Agent configures an llm-grader with the fields the AgentV registry expects. + input: + - role: user + content: + - type: file + value: fixtures/agentv-eval-writer.txt + - type: text + value: | + Using the agentv-eval-writer skill as reference, produce a YAML + snippet for **one** assertion that uses `type: llm-grader` to + grade an answer's correctness. Include whatever fields the + skill says an `llm-grader` requires (e.g., target/model, rubric + criteria). Output only the YAML for the single assertion. 
+ assertions: + - type: regex + value: "type:\\s+llm-grader" + - type: rubrics + criteria: + - "Uses `type: llm-grader` (kebab-case), not `type: llm_grader` or `type: llmGrader`" + - Includes the fields the skill documents for llm-grader (typically a rubric/criteria field; may also include a target/model) + - Does not invent fields not mentioned in the skill (e.g., a fabricated `model_provider` field) diff --git a/evals/self/skills/skill-invocation.eval.yaml b/evals/self/skills/skill-invocation.eval.yaml new file mode 100644 index 000000000..79c8b3d61 --- /dev/null +++ b/evals/self/skills/skill-invocation.eval.yaml @@ -0,0 +1,127 @@ +description: >- + Tests whether an AI agent correctly invokes `agentv skills` CLI subcommands + (list, get, path) with the right flags when asked to retrieve, enumerate, + or locate bundled skill content. + +tags: [agent, skills] + +tests: + - id: invoke-get-single-skill + criteria: Agent invokes `agentv skills get ` for a single named skill. + input: | + I want to read the body of the agentv-bench skill so I can follow its + benchmarking workflow. What single shell command should I run against the + `agentv` CLI to print the skill's SKILL.md to stdout? + + Reply with the command on its own line. + assertions: + - type: regex + value: "agentv\\s+skills\\s+get\\s+agentv-bench" + - type: rubrics + criteria: + - Suggests `agentv skills get agentv-bench` (no extra flags needed for the basic case) + - Does not suggest a plugin-install or marketplace step + + - id: invoke-get-full-flag + criteria: Agent uses `--full` when bundled references/templates are needed. + input: | + I need the full agentv-eval-writer skill — not just SKILL.md, but every + file under its `references/` and `templates/` directories — so I can see + the example YAML templates the skill ships with. + + What single `agentv skills` command should I run? 
+ assertions: + - type: contains + value: --full + - type: regex + value: "agentv\\s+skills\\s+get\\s+agentv-eval-writer" + + - id: invoke-list-available + criteria: Agent invokes `agentv skills list` when asked what skills are bundled. + input: | + I just installed `agentv` globally. How do I list every skill that ships + bundled with the CLI? Give me the single command. + assertions: + - type: regex + value: "agentv\\s+skills\\s+list" + - type: rubrics + criteria: + - Recommends `agentv skills list` + - Does not invoke `npx allagents plugin` or any marketplace step + + - id: invoke-get-nonexistent + criteria: Agent diagnoses a not-found error from the CLI and proposes a fix. + input: + - role: user + content: + - type: text + value: | + I ran `agentv skills get does-not-exist` and got this output: + - type: file + value: fixtures/skills-get-nonexistent.txt + - type: text + value: | + What happened, and what command should I run instead to discover + the correct skill name? + assertions: + - type: rubrics + criteria: + - Identifies that `does-not-exist` is not a valid skill name + - Recommends `agentv skills list` to discover valid names + - Does not invent a skill name that isn't in the error output's "Available skills" line + + - id: invoke-json-flag + criteria: Agent uses `--json` when machine-readable output is needed. + input: | + I want to pipe the list of skill names into `jq` so I can build a + shell-script loop over each name. Which `agentv skills` invocation + gives me machine-readable output suitable for `jq`? + assertions: + - type: contains + value: --json + - type: regex + value: "agentv\\s+skills\\s+(list|get).*--json" + + - id: invoke-path-command + criteria: Agent uses `agentv skills path` when asked where bundled skills live on disk. + input: | + Where on disk does the installed `agentv` CLI keep its bundled skill + files? I want the absolute path so I can open one in my editor. Give me + the single CLI command that prints that path. 
+ assertions: + - type: regex + value: "agentv\\s+skills\\s+path" + - type: rubrics + criteria: + - Uses `agentv skills path` (with or without a skill name argument) + - Does not suggest hand-traversing `node_modules` or the npm prefix manually + + - id: invoke-get-all-flag + criteria: Agent uses `--all` to retrieve every bundled skill in one call. + input: | + I want to dump the SKILL.md of every bundled agentv skill into a single + stream so I can pass them all into another tool's context. What's the + single `agentv skills get` command that does this? + assertions: + - type: contains + value: --all + - type: regex + value: "agentv\\s+skills\\s+get.*--all" + + - id: invoke-skill-trigger + criteria: Agent maps a natural-language task to the correct skill name and command. + input: + - role: user + content: + - type: file + value: fixtures/skills-list-all.txt + - type: text + value: | + Above is the JSON list of bundled agentv skills. I want to read + the skill that helps with debugging eval failures and analysing + traces. Give me the single CLI command to print its content. + assertions: + - type: contains + value: agentv-trace-analyst + - type: regex + value: "agentv\\s+skills\\s+get\\s+agentv-trace-analyst" diff --git a/evals/self/skills/skill-selection.eval.yaml b/evals/self/skills/skill-selection.eval.yaml new file mode 100644 index 000000000..2485f7c8e --- /dev/null +++ b/evals/self/skills/skill-selection.eval.yaml @@ -0,0 +1,130 @@ +description: >- + Tests whether the agent picks the correct bundled agentv skill (by name) + for a given natural-language task. Each test injects the bundled skills + list as context so the agent has the same options a real user would. + +tags: [agent, skills] + +# Suite-level input: every test sees the bundled skills list as the first +# user message, so the agent always has the canonical set of names to +# choose from. Per-test `input:` is appended to this. 
+input: + - role: user + content: + - type: file + value: fixtures/skills-list-all.txt + - type: text + value: | + The JSON above lists every agentv skill bundled with the CLI. + For the next task, name the **single** skill (or skills, only if + the task genuinely requires more than one) that best fits. + Reply with the skill name(s) only — no commentary. + +tests: + - id: select-bench-for-eval-run + criteria: "Task: \"run evals on my agent\" → agent selects agentv-bench." + input: | + Task: I have a YAML eval file and I need to run it against an LLM target + and see the pass rate. Which skill? + assertions: + - type: contains + value: agentv-bench + - type: rubrics + criteria: + - Selects only agentv-bench (not agentv-eval-writer, which is for authoring) + - Does not select agentv-trace-analyst (which is for after-the-fact debugging) + + - id: select-eval-writer-for-create + criteria: "Task: \"write an eval YAML for my skill\" → agent selects agentv-eval-writer." + input: | + Task: I want to author a brand-new `.eval.yaml` file for my custom skill + from scratch. Which skill should I load? + assertions: + - type: contains + value: agentv-eval-writer + - type: rubrics + criteria: + - Selects only agentv-eval-writer + - Does not select agentv-bench (running) or agentv-eval-review (auditing existing) + + - id: select-eval-review-for-audit + criteria: "Task: \"review my eval file for issues\" → agent selects agentv-eval-review." + input: | + Task: I already wrote my `.eval.yaml` and want a second pair of eyes to + audit it for missing assertions, bad rubric phrasing, and structural + issues. Which skill? + assertions: + - type: contains + value: agentv-eval-review + - type: rubrics + criteria: + - Selects agentv-eval-review (the audit-focused skill) + - Does not select agentv-eval-writer (which is for writing, not reviewing) + + - id: select-governance-for-policy + criteria: "Task: \"what are the design principles?\" → agent selects agentv-governance." 
+ input: | + Task: I'm proposing a new built-in feature and want to check whether it + aligns with the AgentV project's design principles before I open a PR. + Which skill? + assertions: + - type: contains + value: agentv-governance + - type: rubrics + criteria: + - Selects agentv-governance + - Does not select agentv-onboarding (which is install-time setup, not design review) + + - id: select-onboarding-for-setup + criteria: "Task: \"help me get started with agentv\" → agent selects agentv-onboarding." + input: | + Task: I just `npm install -g agentv`'d for the first time and have no + idea where to start. Which skill walks me through first steps? + assertions: + - type: contains + value: agentv-onboarding + - type: rubrics + criteria: + - Selects agentv-onboarding + - Does not select agentv-governance (which is for design-principle checks, not first-time setup) + + - id: select-trace-analyst-for-debug + criteria: "Task: \"analyze why this eval failed\" → agent selects agentv-trace-analyst." + input: | + Task: My eval run produced an `index.jsonl` with several failing rows. + I want to dig into the per-test traces and figure out what went wrong. + Which skill? + assertions: + - type: contains + value: agentv-trace-analyst + - type: rubrics + criteria: + - Selects agentv-trace-analyst + - Does not select agentv-bench (running) or agentv-eval-review (static audit) + + - id: select-distinguishes-bench-vs-writer + criteria: "Task: \"I need to write AND run evals\" → agent names both skills." + input: | + Task: I'm starting a new eval project. I'll need to author the `.eval.yaml` + from scratch *and* run it against a target. Which skill or skills? 
+ assertions: + - type: contains + value: agentv-eval-writer + - type: contains + value: agentv-bench + - type: rubrics + criteria: + - Names both agentv-eval-writer (for authoring) and agentv-bench (for running) + - Does not name agentv-eval-review or agentv-trace-analyst, which are not asked for + + - id: select-no-false-positive + criteria: "Task: \"help me deploy my app\" → agent names no agentv skill." + input: | + Task: I want to deploy my Node.js app to a Kubernetes cluster. Which + agentv skill should I load? + assertions: + - type: rubrics + criteria: + - Correctly states that no bundled agentv skill is relevant — these skills are for evaluation workflows, not deployment + - Does not invent a fake skill name + - Does not falsely select agentv-bench, agentv-eval-writer, agentv-onboarding, or any other listed skill diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 67df5b625..c8b1a1bee 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -9,420 +9,11 @@ description: >- analyzing existing traces/JSONL without re-running (use agentv-trace-analyst). --- -# AgentV Bench +The full skill content is bundled with the AgentV CLI and always version-matched to it. +Load it now: - -A skill for evaluating agents and iteratively improving them through data-driven optimization. - -At a high level, the process goes like this: - -- Understand what the agent does and what "good" looks like -- Write evaluation test cases (EVAL.yaml or evals.json) -- Run the agent on those test cases, grade the outputs -- Analyze the results — what's working, what's failing, and why -- Improve the agent's prompts/skills/config based on the analysis -- Repeat until you're satisfied - -Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress. 
Maybe they want to start from scratch — help them write evals, run them, and iterate. Maybe they already have results — jump straight to analysis and improvement. - -Be flexible. If the user says "I don't need a full benchmark, just help me debug this failure", do that instead. - -After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see `references/description-optimization.md`). - -## Communicating with the user - -This skill is used by people across a wide range of familiarity with evaluation tooling. Pay attention to context cues: - -- "evaluation" and "benchmark" are borderline but OK in most cases -- For "YAML", "grader", "assertion", "deterministic judge" — see serious cues from the user that they know what those mean before using them without explanation -- Briefly explain terms if in doubt - -When presenting results, default to summary tables. Offer detail on request. In CI/headless mode, skip interactive prompts and exit with status codes. - ---- - -## Step 1: Understand the Agent - -Before running or optimizing, understand what you're working with. - -1. **Read the agent's artifacts** — prompts, skills, configs, recent changes. Understand the full picture: what tools are available, what the expected input/output looks like, what constraints exist. - -2. **Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? Talk to the user if this isn't clear from the artifacts alone. - -3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what grader types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail. - -4. 
**Challenge assumptions** — if evals already exist, review their quality before running: - - Are the test cases testing the right things? - - Are assertions specific enough to catch real failures? - - Are there ambiguous or contradictory test cases? - - Flag eval issues before proceeding — running bad evals wastes time. - -5. **Check integrity** — ensure task prompts (what the agent receives) are not also used as grader prompts (how outputs are scored). If a prompt file appears in both locations, note the overlap and optimize only for the task purpose. - ---- - -## Step 2: Write Evaluations - -AgentV supports two evaluation formats: - -**EVAL.yaml** (native, full features) — supports workspaces, code graders, multi-turn conversations, tool trajectory scoring, workspace file tracking, multi-provider targets. Use this for agent evaluation. - -```yaml -# example.eval.yaml -tests: - - id: basic-code-review - input: "Review this TypeScript file for bugs and suggest improvements" - criteria: "Identifies the null pointer bug on line 12 and suggests a fix" - assertions: - - type: contains - value: "null" - - Review identifies the null pointer bug and suggests a concrete fix - -workspace: - template: ./workspace-template - hooks: - before_each: - reset: fast -``` - -Multi-skill evaluation is handled naturally via input messages — describe the task in the test input, and the agent uses whatever skills it needs. 
- -**evals.json** (skill-creator compatible) — auto-promoted to EVAL-equivalent format: -- `prompt` → input messages -- `expected_output` → reference answer -- `assertions` → graders -- `files[]` paths resolved relative to the evals.json location - -```json -{ - "skill_name": "my-agent", - "evals": [ - { - "id": 1, - "prompt": "User's task prompt", - "expected_output": "Description of expected result", - "assertions": ["Output includes error handling", "Uses async/await"] - } - ] -} -``` - -### Writing good test cases - -Start with 2-3 realistic test cases — the kind of thing a real user would actually say. Share them with the user before running: "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" - -Good assertions are objectively verifiable and have descriptive names. Subjective quality ("the output is good") is better evaluated qualitatively — don't force assertions onto things that need human judgment. - -**Grader types** (cheapest to most expensive): `exact`, `contains`, `regex`, `is-json`, `field-accuracy`, `composite`, `code-grader`, `tool-trajectory`, `llm-grader`. See `references/eval-yaml-spec.md` for full config and grading recipes for each type. - -Prefer deterministic graders over LLM graders whenever possible. If an assertion can be checked with `contains` or `regex`, don't use `llm-grader`. - ---- - -## Step 3: Run and Grade - -This section is one continuous sequence — don't stop partway through. - -Each run produces a new `.agentv/results/runs//` directory automatically. Use timestamps to identify iterations when comparing runs. - -### Choosing a run mode - -**User instruction takes priority.** If the user says "run in subagent mode", "use subagent mode", or "use CLI mode", use that mode directly. - -If the user has not specified a mode, default to `subagent`. 
- -| `AGENT_EVAL_MODE` | Mode | How | -|----------------------|------|-----| -| `subagent` (default) | **Subagent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. | -| `cli` | **AgentV CLI** | `agentv eval ` — end-to-end, multi-provider | - -Set `AGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. If absent, default to `subagent`. **User instruction always overrides this.** - -**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. Read `references/subagent-pipeline.md` for the detailed procedure. - -**`cli`** — AgentV CLI handles execution, grading, and artifact generation end-to-end. Works with all providers. Use when you need multi-provider benchmarking or CLI-specific features. - -### Running evaluations - -**AgentV CLI mode** (end-to-end, EVAL.yaml): -```bash -agentv eval --output .agentv/artifacts/ -``` - -**Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below. - -**Spawn all runs in the same turn.** For each test case that needs both a "with change" and a "baseline" run, launch them simultaneously. Don't run one set first and come back for the other — launch everything at once so results arrive around the same time. 
- -**Multi-target benchmarking:** -```bash -agentv eval --target claude --target gpt --target copilot -``` - -**Baseline strategy:** -- **New agent**: baseline is "no prompt" or minimal prompt — same eval, no agent-specific configuration -- **Improving existing**: snapshot the current version before editing (`cp -r /prompt-snapshot/`), use as baseline throughout -- **Multi-target**: each target is its own baseline — no need for a separate "without" run - -### While runs are in progress, draft graders - -Don't just wait for runs to finish — use this time productively. If assertions don't exist yet, draft them now. If they exist, review them and explain what they check to the user. - -Good assertions are *discriminating* — they pass when the agent genuinely succeeds and fail when it doesn't. An assertion that passes for both good and bad outputs is worse than no assertion. - -### As runs complete, capture timing data - -When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory. See `references/schemas.md` for the timing.json schema. - -This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives. - -### Grading - -**In CLI mode**, `agentv eval` handles all grading end-to-end — no manual phases needed. - -**In subagent mode**, grading has three phases. **All three are required — do not stop after phase 1.** - -**Phase 1: Code graders** (deterministic, zero-cost) - -```bash -agentv pipeline grade -``` - -This evaluates all deterministic assertions against `response.md` files. 
Two types are handled: -- **`code-grader` scripts** — external scripts executed against the response (arbitrary logic, any language) -- **Built-in assertion types** — evaluated in-process: `contains`, `contains-any`, `contains-all`, `icontains`, `regex`, `equals`, `starts-with`, `ends-with`, `is-json`, and variants - -Both types are configured by `pipeline input` into `code_graders/.json` and graded by `pipeline grade`. Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline. - -**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1. - -**Phase 2: LLM grading** (semantic — do NOT skip this phase) - -Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading. -Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. - -**Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair. - -**Before dispatching graders, read `agents/grader.md` and embed its full content as the system instructions in every grader subagent prompt.** The grader is a `general-purpose` task agent — there is no auto-resolved "grader" type. Without `agents/grader.md` embedded verbatim, the subagent has no grading process, no output format, and no file-path knowledge, and will produce empty or incorrect output. - -Each grader subagent (operating under `agents/grader.md` instructions): -1. Reads `/llm_graders/.json` for the grading prompt -2. 
Reads `/response.md` for the candidate output -3. Grades the response against the prompt criteria -4. **Writes its result to disk**: `///llm_grader_results/.json` -5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator - -**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/.json` makes grading resumable and assertion evidence durable. - -The result file format is: -```json -{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } -``` - -After **all** grader subagents complete, run Phase 3 directly. - -**Phase 3: Merge and validate** - -```bash -agentv pipeline bench -agentv results validate -``` - -`pipeline bench` reads LLM grader results from `llm_grader_results/.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`. - -> **Diagnosing `pass_rate=0`:** If `pipeline bench` reports `pass_rate=0` across the board, do **not** assume the tests genuinely failed. First verify the grading pipeline ran correctly: check that `/llm_grader_results/.json` exists and is non-empty for each test. If these files are absent or empty, the grader subagents failed to produce output (most common cause: `agents/grader.md` was not embedded in the subagent prompts — see Phase 2). Treat `pass_rate=0` as a real signal only after confirming grader results exist. - -### Artifacts - -All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure. 
Key artifacts per run: -- **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary -- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` -- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` - -Write artifacts to `.agentv/artifacts/` or the iteration directory. - -### Workspace features (EVAL.yaml only) - -- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) -- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) -- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support -- **File change tracking** — grade by diffing workspace files before/after agent execution - ---- - -## Step 4: Analyze Results - -Once all runs are graded, analyze the results before attempting improvements. - -### Pattern analysis - -Read the JSONL results and look for: - -- **Always-pass tests** — assertion too loose or non-discriminating. If it passes for both good and bad outputs, it's not testing anything. -- **Always-fail tests** — task impossible, eval broken, or assertion misconfigured. Don't optimize against broken evals. -- **Flaky tests** — non-deterministic results across runs. Investigate before treating failures as real. -- **Systematic failures** — same failure pattern across multiple tests. This usually points to a missing instruction or wrong approach. -- **Deterministic upgrade candidates** — `llm-grader` assertions that could be replaced with `contains`, `regex`, or `is-json` (cheaper, faster, more reliable). - -### Dispatch subagents - -- **Dispatch `analyzer`** (read `agents/analyzer.md`) for a structured quality audit: deterministic upgrade suggestions, weak assertion detection, cost/quality flags, and benchmark pattern analysis. - -- **Dispatch `comparator`** (read `agents/comparator.md`) for blind N-way comparison between iterations or targets. 
The comparator blinds provider identities, generates task-specific rubrics, scores each output, then unblinds and attributes improvements. - -### Trace analysis - -Use CLI tools for deeper investigation: ```bash -agentv inspect # Detailed execution trace inspection -agentv compare # Structured diff between runs -``` - -Look for: tool call patterns, error recovery behavior, conversation flow, wasted steps. - -### Present results to the user - -Show a summary table: - +agentv skills get agentv-bench ``` -| Test ID | Score | Pass/Fail | Delta | Notes | -|------------------|-------|-----------|-------|--------------------------| -| basic-code-review| 0.85 | ✓ PASS | +0.15 | Found the bug this time | -| edge-case-empty | 0.00 | ✗ FAIL | — | Crashed on empty input | -``` - -Highlight: -- Current pass rate and delta from baseline -- Comparison results (which target/iteration won and why) -- Analyst observations the aggregate stats would hide - -Ask: "How does this look? Anything you'd change about the evals or the approach?" - ---- - -## Step 5: Improve - -This is the heart of the loop. You've run the test cases, analyzed the results, and now you need to make the agent better. - -### How to think about improvements - -1. **Generalize from the analysis.** You're iterating on a small eval set, but the agent will be used on many different inputs. Don't overfit to specific test cases. Rather than fiddly patches or oppressively rigid MUSTs, try different approaches and see what works. It's cheap to experiment. - -2. **Keep the prompt lean.** Read the execution transcripts, not just the final outputs. If the agent wastes time on unproductive steps, remove the instructions causing that. If it always ignores a section, that section isn't pulling its weight. - -3. **Explain the why.** Today's LLMs are smart. They have good theory of mind and can go beyond rote instructions when given good reasoning. 
If you find yourself writing ALWAYS or NEVER in all caps, that's a yellow flag — reframe as an explanation of why the thing matters. That's more humane, powerful, and effective. - -4. **Look for repeated work.** Read the transcripts from test runs and notice if the agent independently takes the same multi-step approach to something across cases. If all test runs result in writing the same helper script, bundle it. If every run makes the same mistake, the instruction is missing or unclear. - -### Applying changes - -- **Surgical edits**: ADD (new rule for a missing constraint), UPDATE (refine for clarity), DELETE (remove redundant or harmful rules), NEGATIVE CONSTRAINT (explicitly state what NOT to do) -- **One change per iteration** to isolate effects. If you change three things and the score improves, you don't know which change helped. -- **Variant tracking**: When a change helps some tests but hurts others, maintain 2-3 prompt variants. Compare variants to find the best overall approach before converging. -- **When converging**: Generalize specific patches into broad principles. Remove redundancy and contradictions. Ensure the prompt is clear, focused, and under 200 lines. - -### Evaluation integrity - -**Critical**: Only optimize **task prompts** (what the agent receives), never **judge prompts** (how graders score outputs). Modifying judge prompts games the evaluation without improving the agent. - -If a prompt file is referenced in both task input and grader configs, optimize for the task purpose only. Document which prompts were modified in the optimization log. - -### The iteration loop - -After improving: - -1. Apply your changes to the agent's prompts/skills/config -2. Re-run all test cases (agentv creates a new `.agentv/results/runs//` directory automatically) -3. Compare against the previous iteration (Step 4). 
If running in automated mode, use the **automated keep/discard** logic below instead of manual judgment — it will decide whether to keep or revert the change for you. -4. Present results to the user (or log the decision if running automated keep/discard) -5. Stop when ANY of: - - The user says they're happy - - Feedback is all empty (everything looks good) - - You're not making meaningful progress (no improvement for 2 consecutive iterations) - - Target pass rate is reached - - Maximum iterations exhausted - -**Human checkpoints**: At iterations 3, 6, and 9, always present progress to the user regardless of automation settings. Push back if optimization is accumulating contradictory rules or overfitting to specific test cases. - -### Automated keep/discard - -For autonomous iteration, use `agentv compare --json` to automatically decide whether to keep or discard each change based on wins/losses/ties. Read `references/autoresearch.md` for the full decision rules, logging format, and integration with the iteration loop. - ---- - -## Entering Mid-Lifecycle - -Users can start at any step by providing existing data: - -| Entry point | Required input | Example prompt | -|------------|---------------|----------------| -| Step 1 (Understand) | `eval-path` | "Optimize my agent against evals/support.yaml" | -| Step 2 (Write Evals) | Agent artifacts | "Write evals for this agent" | -| Step 3 (Run + Grade) | `eval-path` | "Run this eval and show me results" | -| Step 4 (Analyze) | `results-path` | "Analyze why my agent is failing on these results" | -| Step 5 (Improve) | Analysis + strategy | "Apply these optimization suggestions" | - -When entering mid-lifecycle, run only the requested step and subsequent steps. Don't re-run earlier steps unless the user requests a full loop. - ---- - -## Advanced: Blind Comparison - -For situations where you want a rigorous comparison between two versions (e.g., "is the new version actually better?"), dispatch the `comparator` subagent. 
It blinds identities, generates task-specific rubrics, scores outputs, then unblinds and explains why the winner won. - -This is optional and requires subagents. The human review loop is usually sufficient. - ---- - -## Description Optimization - -After the agent is working well, offer to optimize the skill's `description` field for better triggering accuracy. Read `references/description-optimization.md` for the full procedure (generate trigger EVAL.yaml, review with user, iterate, apply). - ---- - -## Autoresearch Mode - -Autoresearch is an unattended eval-improve loop that runs multiple optimize cycles without human intervention. The user triggers it with natural language (e.g., "run autoresearch on this skill", "optimize this skill unattended"). It uses the mutator subagent (`agents/mutator.md`) to rewrite artifacts based on failure analysis, and automated keep/discard to decide whether to keep or revert each change. - -Read `references/autoresearch.md` for the full procedure (prerequisites, artifact layout, keep/discard rules, the step-by-step loop, convergence criteria, and context hygiene). - ---- - -## Environment Adaptation - -For provider-specific notes (Copilot, Codex, Claude SDK, custom CLI), CI/headless mode behavior, and fallback strategies when subagents aren't available, read `references/environment-adaptation.md`. - ---- - -## Subagent Reference - -The `agents/` directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent. 
- -| Agent | File | Purpose | When to dispatch | -|-------|------|---------|-----------------| -| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | -| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading — one per test × LLM grader pair) | -| comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | -| analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | -| mutator | `agents/mutator.md` | Rewrite artifact from failure analysis | Step 5 (autoresearch — dispatched per cycle) | - -The `references/` directory has additional documentation: -- `references/autoresearch.md` — Autoresearch unattended optimization loop and automated keep/discard rules -- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes -- `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure -- `references/description-optimization.md` — Skill description optimization workflow -- `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior -- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.) -- `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator - ---- - -Repeating the core loop for emphasis: - -- Understand what the agent does -- Write evaluation test cases -- Run the agent and grade outputs -- Analyze results — surface patterns, dispatch analyst and comparator subagents -- Improve the agent based on analysis -- Repeat until you and the user are satisfied -Take your time with improvements. Read the transcripts. Understand why failures happened. Make changes that generalize beyond the test set. This is important work. +Then follow the instructions in the loaded skill. 
diff --git a/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md index 23e2c3466..80d8f1e2b 100644 --- a/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-review/SKILL.md @@ -7,46 +7,11 @@ description: >- Do NOT use for writing evals (use agentv-eval-writer) or running evals (use agentv-bench). --- -# Eval Review - -## Overview - -Lint and review AgentV eval YAML files for structural issues, schema compliance, and quality problems. Runs deterministic checks via script, then applies LLM judgment for semantic issues the script cannot catch. - -## Process - -### Step 1: Run the linter - -Execute `scripts/lint_eval.py` against the target eval files: +The full skill content is bundled with the AgentV CLI and always version-matched to it. +Load it now: ```bash -python scripts/lint_eval.py --json +agentv skills get agentv-eval-review ``` -The script checks: -- `.eval.yaml` extension -- `description` field present -- Each test has `id`, `input`, and at least one of `criteria`/`expected_output`/`assertions` -- File paths in `type: file` use leading `/` -- `assertions` blocks present (flags tests relying solely on `expected_output`) -- `expected_output` prose detection (flags "The agent should..." patterns) -- Repeated file inputs across tests (recommends top-level `input`) -- Naming prefix consistency across eval files in same directory - -### Step 2: Review script output - -Report the script findings grouped by severity (error > warning > info). For each finding, include the file path and a concrete fix. - -### Step 3: Semantic review (LLM judgment) - -The script catches structural issues but cannot assess: -- **Factual accuracy** — Do tool/command names in expected_output match what the skill documents? -- **Coverage gaps** — Are important edge cases missing? -- **Assertion discriminability** — Would assertions pass for both good and bad output? 
-- **Cross-file consistency** — Do output filenames match across evals and skills? - -Read the relevant SKILL.md files and cross-check against the eval content for these issues. - -## Skill Resources - -- `scripts/lint_eval.py` — Deterministic eval linter (Python 3.11+, stdlib only) +Then follow the instructions in the loaded skill. diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 19a0ff385..703d751bb 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -10,682 +10,11 @@ description: >- running and benchmarking belongs to agentv-bench. --- -# AgentV Eval Writer - -Comprehensive docs: https://agentv.dev - -## Evaluation Types - -AgentV evaluations measure **execution quality** — whether your agent or skill produces correct output when invoked. - -For **trigger quality** (whether the right skill is triggered for the right prompts), see the [Evaluation Types guide](https://agentv.dev/guides/evaluation-types/). Do not use execution eval configs (`EVAL.yaml`, `evals.json`) for trigger evaluation — these are distinct concerns requiring different tooling and methodologies. - -## Starting from evals.json? - -If the project already has an Agent Skills `evals.json` file, use it as a starting point instead of writing YAML from scratch: - -```bash -# Convert evals.json to AgentV EVAL YAML -agentv convert evals.json - -# Run directly without converting (all commands accept evals.json) -agentv eval evals.json -``` - -The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`llm-grader`), and resolves `files[]` paths. The generated YAML includes TODO comments for AgentV features to add (workspace setup, code graders, rubrics, required gates). - -After converting, enhance the YAML with AgentV-specific capabilities shown below. 
- -## From Chat Transcript - -Convert a chat conversation into eval test cases without starting from scratch. - -**Input formats:** - -Markdown conversation: -``` -User: How do I reset my password? -Assistant: Go to Settings > Security > Reset Password... -``` - -JSON messages: -```json -[{"role": "user", "content": "How do I reset my password?"}, - {"role": "assistant", "content": "Go to Settings > Security > Reset Password..."}] -``` - -**Select exchanges that make good test cases:** -- Factual Q&A — verifiable answers -- Task completion — user requests an action, agent performs it -- Edge cases — unusual inputs, error handling, boundary conditions -- Multi-turn reasoning — exchanges where earlier context matters - -**Skip:** greetings, one-word acknowledgments, repeated exchanges - -**Multi-turn format** (when context from prior turns matters): -```yaml -tests: - - id: multi-turn-context - criteria: "Agent remembers prior context" - input: - - role: user - content: "My name is Alice" - - role: assistant - content: "Nice to meet you, Alice!" - - role: user - content: "What's my name?" - expected_output: "Your name is Alice." - assertions: - - type: rubrics - criteria: - - Correctly recalls the user's name from earlier in the conversation -``` - -**Guidelines:** preserve exact wording in `expected_output`; aim for 5–15 tests per transcript; pick exchanges that test different capabilities. - -## Quick Start - -```yaml -description: Example eval -execution: - target: default - -tests: - - id: greeting - criteria: Friendly greeting - input: "Say hello" - expected_output: "Hello! How can I help you?" 
- assertions: - - type: rubrics - criteria: - - Greeting is friendly and warm - - Offers to help -``` - -## Eval File Structure - -**Required:** `tests` (array or string path) -**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `suite`, `workspace`, `assertions`, `input` - -**Test fields:** - -| Field | Required | Description | -|-------|----------|-------------| -| `id` | yes | Unique identifier | -| `criteria` | yes | What the response should accomplish | -| `input` / `input` | yes | Input to the agent | -| `expected_output` / `expected_output` | no | Gold-standard reference answer | -| `assertions` | no | Graders: deterministic checks, rubrics, and LLM/code graders | -| `rubrics` | no | **Deprecated** — use `assertions: [{type: rubrics, criteria: [...]}]` instead | -| `execution` | no | Per-case execution overrides | -| `workspace` | no | Per-case workspace config (overrides suite-level) | -| `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts | -| `conversation_id` | no | Thread grouping | - -**Shorthand aliases:** -- `input` (string) expands to `[{role: "user", content: "..."}]` -- `expected_output` (string/object) expands to `[{role: "assistant", content: ...}]` -- Canonical `input` / `expected_output` take precedence when both present - -**Message format:** `{role, content}` where role is `system`, `user`, `assistant`, or `tool` -**Content types:** inline text, `{type: "file", value: "./path.md"}` -**File paths:** relative from eval file dir, or absolute with `/` prefix from repo root -**File handling by provider type:** LLM providers receive file content inlined in XML tags. Agent providers receive a preread block with `file://` URIs and must read files themselves. See [Coding Agents > Prompt format](https://agentv.dev/targets/coding-agents#prompt-format). - -**JSONL format:** One test per line as JSON. Optional `.yaml` sidecar for shared defaults. 
See `examples/features/basic-jsonl/`. - -**Environment variables:** All string fields support `${{ VAR }}` interpolation. Missing vars resolve to empty string. Works in eval files, external case files, and workspace configs. `.env` files are loaded automatically. - -## Metadata - -When `name` is present, the suite is parsed as a metadata-bearing eval: - -```yaml -name: export-screening # required, lowercase/hyphens, max 64 chars -description: Evaluates export control screening accuracy -version: "1.0" -author: acme-compliance -tags: [compliance, agents] -license: Apache-2.0 -requires: - agentv: ">=0.30.0" -``` - -## Suite-level Input - -Prepend shared input messages to every test (like suite-level `assertions`). Avoids repeating the same prompt file in each test: - -```yaml -input: - - role: user - content: - - type: file - value: ./system-prompt.md - -tests: ./cases.yaml - -# cases.yaml — each test only needs its own query -# - id: test-1 -# criteria: ... -# input: "User question here" -``` - -Effective input: `[...suite input, ...test input]`. Skipped when `execution.skip_defaults: true`. -Accepts same formats as test `input` (string or message array). - -## Tests as String Path - -Point `tests` to an external file instead of inlining: - -```yaml -name: my-eval -description: My evaluation suite -tests: ./cases.yaml # relative to eval file dir -``` - -The external file can be YAML (array of test objects) or JSONL. - -## Assertions Field - -`assertions` defines graders at the suite level or per-test level. 
It is the canonical field for all graders: - -```yaml -# Suite-level (appended to every test) -assertions: - - type: is-json - required: true - - type: contains - value: "status" - -tests: - - id: test-1 - criteria: Returns JSON - input: Get status - # Per-test assertions (runs before suite-level) - assertions: - - type: equals - value: '{"status": "ok"}' -``` - -## How `criteria` and `assertions` Interact - -`criteria` is a **data field** — it describes what the response should accomplish. It is **not** a grader. How it gets evaluated depends on whether `assertions` is present: - -| Scenario | What happens | Warning? | -|----------|-------------|----------| -| `criteria` + **no `assertions`** | Implicit `llm-grader` runs automatically against `criteria` | No | -| `criteria` + **`assertions` with only deterministic graders** (contains, regex, etc.) | Only declared graders run. `criteria` is **not evaluated**. | Yes — warns that no grader will consume criteria | -| `criteria` + **`assertions` with a grader** (`llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No | - -### No assertions → implicit llm-grader - -The simplest path. `criteria` is automatically evaluated by the default `llm-grader`: - -```yaml -tests: - - id: simple-eval - criteria: Assistant correctly explains the bug and proposes a fix - input: "Debug this function..." - # No assertions → default llm-grader evaluates against criteria -``` - -### assertions present → no implicit grader - -When `assertions` is defined, **only the declared graders run**. If you want an LLM grader alongside deterministic checks, declare it explicitly: - -```yaml -tests: - - id: mixed-eval - criteria: Response is helpful and mentions the fix - input: "Debug this function..." - assertions: - - type: llm-grader # must be explicit when assertions is present - - type: contains - value: "fix" -``` - -**Common mistake:** defining `criteria` with only deterministic graders. 
The criteria will be ignored and a warning is emitted: - -```yaml -tests: - - id: bad-example - criteria: Gives a thoughtful answer # ⚠ NOT evaluated — no grader in assertions - input: "What is 2+2?" - assertions: - - type: contains - value: "4" - # Warning: criteria is defined but no grader in assertions will evaluate it. -``` - -## Required Gates - -Any grader can be marked `required` to enforce a minimum score: - -```yaml -assertions: - - type: contains - value: "DENIED" - required: true # must score >= 0.8 (default) - - type: rubrics - required: 0.6 # must score >= 0.6 (custom threshold) - criteria: - - id: accuracy - outcome: Identifies the denied party - weight: 5.0 -``` - -If a required grader scores below its threshold, the overall verdict is forced to `fail`. - -## Workspace Setup/Teardown - -Run scripts before/after each test. Define at suite level or override per case: - -```yaml -workspace: - template: ./workspace-templates/my-project - setup: - command: ["bun", "run", "setup.ts"] - timeout_ms: 120000 - teardown: - command: ["bun", "run", "teardown.ts"] - -tests: - - id: case-1 - input: Fix the bug - criteria: Bug is fixed - metadata: - repo: sympy/sympy - workspace: - repos: - - path: /testbed - source: - type: git - url: https://github.com/sympy/sympy.git - checkout: - base_commit: "abc123" - docker: - image: swebench/sweb.eval.django__django:latest -``` - -**Lifecycle:** template copy → repo clone → setup → git baseline → agent → file changes → teardown → repo reset → cleanup -**Merge:** Case-level fields replace suite-level fields. -**Commands receive stdin JSON:** `{workspace_path, test_id, eval_run_id, case_input, case_metadata}` -**Setup failure:** aborts case. **Teardown failure:** non-fatal (warning). -For SWE-bench-style evals, keep operational checkout state under `workspace.repos[].checkout.base_commit`; treat `metadata.base_commit` as informational only. - -### Repository Lifecycle - -Clone repos into workspace automatically. 
For shared repo workspaces, pooling is the default: - -```yaml -workspace: - repos: - - path: ./repo - source: - type: git - url: https://github.com/org/repo.git - checkout: - ref: main - ancestor: 1 # parent commit - clone: - depth: 10 - hooks: - after_each: - reset: fast # none | fast | strict - isolation: shared # shared | per_test - mode: pooled # pooled | temp | static - hooks: - enabled: true # set false to skip all hooks -``` - -- `source.type`: `git` (URL) or `local` (path) -- `checkout.resolve`: `remote` (ls-remote) or `local` -- `clone.depth`: shallow clone depth -- `clone.filter`: partial clone filter (e.g., `blob:none`) -- `clone.sparse`: sparse checkout paths array -- `mode`: `pooled` (default for shared repos), `temp`, or `static` -- `path`: workspace path used when `mode: static`; when empty/missing the workspace is auto-materialised (template copied + repos cloned); populated dirs are reused as-is -- `hooks.enabled`: boolean (default `true`); set `false` to skip all lifecycle hooks -- Pool reset defaults to `fast` (`git clean -fd`); use `--workspace-clean full` for strict reset (`git clean -fdx`) -- Pool entries are managed separately via `agentv workspace list` and `agentv workspace clean` -- `agentv workspace deps ` scans eval files and outputs a JSON manifest of required git repos (useful for CI pre-cloning) - -See https://agentv.dev/targets/configuration/#repository-lifecycle - -## Grader Types - -Configure via `assertions` array. Multiple graders produce a weighted average score. 
- -### code_grader -```yaml -- name: format_check - type: code-grader - command: [uv, run, validate.py] - cwd: ./scripts # optional working directory - target: {} # optional: enable LLM target proxy (max_calls: 50) -``` -Contract: stdin JSON -> stdout JSON `{score, assertions: [{text, passed, evidence?}], reasoning}` -Input includes: `question`, `criteria`, `answer`, `reference_answer`, `output`, `trace`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config` -When a workspace is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace). -See docs at https://agentv.dev/graders/code-graders/ - -### llm_grader -```yaml -- name: quality - type: llm-grader - prompt: ./prompts/eval.md # markdown template or command config - target: grader_gpt_5_mini # optional: override the grader target for this grader - model: gpt-5-chat # optional model override - config: # passed to prompt templates as context.config - strictness: high -``` -Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, `{{input}}`, `{{expected_output}}`, `{{output}}`, `{{file_changes}}` -- Markdown templates: use `{{variable}}` syntax -- TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config` -- Use `target:` to run different `llm-grader` graders against different named LLM targets in the same eval (useful for grader panels / ensembles) - -### composite -```yaml -- name: gate - type: composite - assertions: - - name: safety - type: llm-grader - prompt: ./safety.md - - name: quality - type: llm-grader - aggregator: - type: weighted_average - weights: { safety: 0.3, quality: 0.7 } -``` -Aggregator types: `weighted_average`, `all_or_nothing`, `minimum`, `maximum`, `safety_gate` -- `safety_gate`: fails immediately if 
the named gate grader scores below threshold (default 1.0) - -### tool_trajectory -```yaml -- name: tool_check - type: tool-trajectory - mode: any_order # any_order | in_order | exact - minimums: # for any_order - knowledgeSearch: 2 - expected: # for in_order/exact - - tool: knowledgeSearch - args: { query: "search term" } # partial deep equality match - - tool: documentRetrieve - args: any # any arguments accepted - max_duration_ms: 5000 # per-tool latency assertion - - tool: summarize # omit args to skip argument checking -``` - -### field_accuracy -```yaml -- name: fields - type: field-accuracy - match_type: exact # exact | date | numeric_tolerance - numeric_tolerance: 0.01 # for numeric_tolerance match_type - aggregation: weighted_average # weighted_average | all_or_nothing -``` -Compares `output` fields against `expected_output` fields. - -### latency -```yaml -- name: speed - type: latency - max_ms: 5000 -``` - -### cost -```yaml -- name: budget - type: cost - max_usd: 0.10 -``` - -### token_usage -```yaml -- name: tokens - type: token-usage - max_total_tokens: 4000 -``` - -### execution_metrics -```yaml -- name: efficiency - type: execution-metrics - max_tool_calls: 10 # Maximum tool invocations - max_llm_calls: 5 # Maximum LLM calls (assistant messages) - max_tokens: 5000 # Maximum total tokens (input + output) - max_cost_usd: 0.05 # Maximum cost in USD - max_duration_ms: 30000 # Maximum execution duration - target_exploration_ratio: 0.6 # Target ratio of read-only tool calls - exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2) -``` -Declarative threshold-based checks on execution metrics. Only specified thresholds are checked. -Score is proportional: `passed / total` assertions. Missing data counts as a failed assertion. - -### contains -```yaml -- type: contains - value: "DENIED" - required: true -``` -Binary check: does output contain the substring? Name auto-generated if omitted. 
- -### regex -```yaml -- type: regex - value: "\\d{3}-\\d{2}-\\d{4}" -``` -Binary check: does output match the regex pattern? - -### equals -```yaml -- type: equals - value: "42" -``` -Binary check: does output exactly equal the value (both trimmed)? - -### is_json -```yaml -- type: is-json - required: true -``` -Binary check: is the output valid JSON? - -### rubrics -```yaml -- type: rubrics - criteria: - - id: accuracy - outcome: Correctly identifies the denied party - weight: 5.0 - - id: reasoning - outcome: Provides clear reasoning - weight: 3.0 -``` -LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields. - -### rubrics (inline, deprecated) -Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assertions` instead. -See `references/rubric-grader.md` for score-range mode and scoring formula. - -## Execution Error Tolerance - -Control how the runner handles execution errors (infrastructure failures, not quality failures): - -```yaml -execution: - fail_on_error: false # never halt (default) - # fail_on_error: true # halt on first execution error -``` - -When halted, remaining tests get `executionStatus: 'execution_error'` with `failureReasonCode: 'error_threshold_exceeded'`. - -## Suite-Level Quality Threshold - -Set a minimum mean score for the eval suite. If the mean quality score falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. - -```yaml -execution: - threshold: 0.8 -``` - -CLI flag `--threshold 0.8` overrides the YAML value. Must be a number between 0 and 1. Mean score is computed from quality results only (execution errors excluded). - -The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as ``. When no threshold is set, JUnit defaults to 0.5. 
- -## CLI Commands - -```bash -# Run evaluation (requires API keys) -agentv eval [--test-id ] [--target ] [--dry-run] [--threshold <0-1>] - -# Run with OTLP JSON file (importable by OTel backends) -agentv eval --otel-file traces/eval.otlp.json - -# Run a single assertion in isolation (no API keys needed) -agentv eval assert --agent-output "..." --agent-input "..." - -# Import agent transcripts for offline grading -agentv import claude --session-id - -# Re-run only execution errors from a previous run -agentv eval --retry-errors .agentv/results/runs//index.jsonl - -# Validate eval file -agentv validate - -# Compare results — N-way matrix from a canonical run manifest -agentv compare .agentv/results/runs//index.jsonl -agentv compare .agentv/results/runs//index.jsonl --baseline # CI regression gate -agentv compare .agentv/results/runs//index.jsonl --baseline --candidate # pairwise -agentv compare .agentv/results/runs//index.jsonl .agentv/results/runs//index.jsonl - -# Author assertions directly in the eval file -# Prefer simple assertions when they fit the criteria; use deterministic or LLM-based graders when needed -agentv validate -``` - -## Code Judge SDK - -Use `@agentv/eval` to build custom graders in TypeScript/JavaScript: - -### defineAssertion (recommended for custom checks) -```typescript -#!/usr/bin/env bun -import { defineAssertion } from '@agentv/eval'; - -export default defineAssertion(({ answer, trace }) => ({ - pass: answer.length > 0 && (trace?.eventCount ?? 0) <= 10, - reasoning: 'Checks content exists and is efficient', -})); -``` - -Assertions support both `pass: boolean` and `score: number` (0-1). If only `pass` is given, score is 1 (pass) or 0 (fail). - -### defineCodeGrader (full control) -```typescript -#!/usr/bin/env bun -import { defineCodeGrader } from '@agentv/eval'; - -export default defineCodeGrader(({ trace, answer }) => ({ - score: trace?.eventCount <= 5 ? 
1.0 : 0.5, - assertions: [ - { text: 'Efficient tool usage', passed: (trace?.eventCount ?? 0) <= 5 }, - ], -})); -``` - -Both are used via `type: code-grader` in YAML with `command: [bun, run, grader.ts]`. - -### Convention-Based Discovery - -Place assertion files in `.agentv/assertions/` — they auto-register by filename: - -``` -.agentv/assertions/word-count.ts → type: word-count -.agentv/assertions/sentiment.ts → type: sentiment -``` - -No `command:` needed in YAML — just use `type: `. - -## Programmatic API - -Use `evaluate()` from `@agentv/core` to run evals as a library: - -```typescript -import { evaluate } from '@agentv/core'; - -const { results, summary } = await evaluate({ - tests: [ - { - id: 'greeting', - input: 'Say hello', - assertions: [{ type: 'contains', value: 'hello' }], - }, - ], - target: { provider: 'mock_agent' }, -}); -console.log(`${summary.passed}/${summary.total} passed`); -``` - -Supports inline tests (no YAML) or file-based via `specFile`. - -## defineConfig - -Type-safe project configuration in `agentv.config.ts`: - -```typescript -import { defineConfig } from '@agentv/core'; - -export default defineConfig({ - execution: { workers: 5, maxRetries: 2 }, - output: { format: 'jsonl', dir: './results' }, - limits: { maxCostUsd: 10.0 }, -}); -``` - -Auto-discovered from project root. Validated with Zod. - -## Scaffold Commands +The full skill content is bundled with the AgentV CLI and always version-matched to it. +Load it now: ```bash -agentv create assertion # → .agentv/assertions/.ts -agentv create eval # → evals/.eval.yaml + .cases.jsonl +agentv skills get agentv-eval-writer ``` -## Skill Improvement Workflow - -For a complete guide to iterating on skills using evaluations — writing scenarios, running baselines, comparing results, and improving — see the [Skill Improvement Workflow](https://agentv.dev/guides/skill-improvement-workflow/) guide. -## Human Review Checkpoint - -After running evals, perform a human review before iterating. 
Create `feedback.json` in the results directory: - -```json -{ - "run_id": "2026-03-14T10-32-00_claude", - "reviewer": "engineer-name", - "timestamp": "2026-03-14T12:00:00Z", - "overall_notes": "Summary of observations", - "per_case": [ - { - "test_id": "test-id", - "verdict": "acceptable | needs_improvement | incorrect | flaky", - "notes": "Why this verdict", - "evaluator_overrides": { "code-grader:name": "Override note" }, - "workspace_notes": "Workspace state observations" - } - ] -} -``` - -Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "code-grader was too strict"). Use `workspace_notes` for observations about workspace state. - -Review workflow: run evals → inspect results (`agentv inspect show`) → write feedback → tune prompts/graders → re-run. - -Full guide: https://agentv.dev/guides/human-review/ - -## Schemas - -- Eval file: `references/eval-schema.json` -- Config: `references/config-schema.json` +Then follow the instructions in the loaded skill. diff --git a/plugins/agentv-dev/skills/agentv-governance/SKILL.md b/plugins/agentv-dev/skills/agentv-governance/SKILL.md index 1b942c1f6..acd7a16e5 100644 --- a/plugins/agentv-dev/skills/agentv-governance/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-governance/SKILL.md @@ -9,55 +9,11 @@ description: >- Do NOT use for running evals or benchmarking — that belongs to agentv-bench. --- -# AgentV Compliance Skill +The full skill content is bundled with the AgentV CLI and always version-matched to it. +Load it now: -Teaches AI agents how to author syntactically correct `governance:` blocks in AgentV -eval files, and how to lint them against known vocabulary rules. - -## Dual mode - -**Authoring (interactive):** When a human or AI agent is editing a `*.eval.yaml` file -that contains or should contain a `governance:` block, this skill provides vocabulary, -valid values, and example shapes. 
Load it alongside `agentv-eval-writer` when building -red-team or compliance suites. - -**Linting (non-interactive / CI):** When invoked from a GitHub Action (see -`examples/governance/compliance-lint/`), this skill lints each changed `*.eval.yaml` file -against the rules in `references/lint-rules.md` and returns a structured JSON report. -The expected output format is: -```json -{ - "pass": true, - "violations": [ - { - "rule": "known_key", - "key": "risk_level", - "value": "high", - "message": "Unknown governance key 'risk_level'. Did you mean 'risk_tier'?", - "suggestion": "Replace 'risk_level' with 'risk_tier'." - } - ] -} +```bash +agentv skills get agentv-governance ``` -`pass` is `true` when `violations` is empty. - -## Reference files - -| File | Purpose | -|------|---------| -| `references/governance-yaml-shape.md` | YAML shape, merge semantics, worked examples | -| `references/lint-rules.md` | Machine-readable rules applied during lint | -| `references/owasp-llm-top-10-2025.md` | LLM01–LLM10 canonical IDs and descriptions | -| `references/owasp-agentic-top-10-2025.md` | T01–T10 agentic-AI categories | -| `references/mitre-atlas.md` | Common AML.Txxxx technique IDs | -| `references/eu-ai-act-risk-tiers.md` | Four risk tiers + article references | -| `references/iso-42001-controls.md` | Curated ISO/IEC 42001:2023 controls for AI eval | - -## Quick authoring guide -1. Check which risks this eval exercises using the reference files above. -2. Pick IDs from the relevant frameworks (`owasp_llm_top_10_2025`, `mitre_atlas`, etc.). -3. Set `risk_tier` using EU AI Act vocabulary (`prohibited | high | limited | minimal`). -4. Add `controls` as `-:` strings (e.g. `EU-AI-ACT-2024:Art.55`). -5. Run the lint rules from `references/lint-rules.md` against your block before committing. -6. See `references/governance-yaml-shape.md` for complete examples copied from real suites. +Then follow the instructions in the loaded skill. 
diff --git a/plugins/agentv-dev/skills/agentv-onboarding/SKILL.md b/plugins/agentv-dev/skills/agentv-onboarding/SKILL.md index a042d4721..3bb62a6f2 100644 --- a/plugins/agentv-dev/skills/agentv-onboarding/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-onboarding/SKILL.md @@ -3,61 +3,11 @@ name: agentv-onboarding description: Bootstrap AgentV in the current workspace after plugin-manager install. Ensures CLI availability, runs workspace init, and verifies setup artifacts. --- -# AgentV Onboarding - -Use this skill when the user asks to set up AgentV in a repository. - -## Goal - -Set up AgentV in the current workspace: -- ensure the `agentv` CLI is available (install if needed) -- initialize workspace files -- verify setup artifacts and report status - -## Workflow - -### 1. Resolve Script Path - -Find the directory that contains this `SKILL.md`, then resolve script paths relative to it. - -Packaged scripts: -- `scripts/onboard-agentv.sh` for bash/zsh -- `scripts/onboard-agentv.ps1` for PowerShell - -### 2. Run the Platform Script - -Run from the repository root where AgentV should be initialized. - -POSIX shells: +The full skill content is bundled with the AgentV CLI and always version-matched to it. +Load it now: ```bash -bash /scripts/onboard-agentv.sh +agentv skills get agentv-onboarding ``` -PowerShell: - -```powershell -pwsh -File /scripts/onboard-agentv.ps1 -``` - -If `pwsh` is unavailable on Windows: - -```powershell -powershell -ExecutionPolicy Bypass -File /scripts/onboard-agentv.ps1 -``` - -### 3. Handle Errors - -If the script fails, report the exact error and stop. Do not claim setup succeeded. - -### 4. Report Outcome Clearly - -Summarize: -- `agentv` version in use -- whether CLI was installed during this run -- whether `agentv init` completed -- whether setup verification passed - -## Re-run Behavior - -Re-running is safe. The scripts run `agentv init`, and if setup artifacts are still missing they rerun once automatically before failing. 
+Then follow the instructions in the loaded skill. diff --git a/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md b/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md index 6205f85e0..64c6b623c 100644 --- a/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md @@ -9,137 +9,11 @@ description: >- or measuring skill description quality — those tasks belong to the skill-creator skill. --- -# AgentV Trace Analyst - -Analyze evaluation traces headlessly using `agentv inspect` primitives and `jq`. - -## Primitives - -```bash -# List result files (most recent first) -agentv inspect list [--limit N] [--format json|table] - -# Show results with trace details -agentv inspect show [--test-id ] [--tree] [--format json|table] - -# Percentile statistics -agentv inspect stats [--group-by target|suite|test-id] [--format json|table] - -# A/B comparison between runs -agentv compare [--threshold 0.1] [--format json|table] -``` - -## Analysis Workflow - -### 1. Discover results +The full skill content is bundled with the AgentV CLI and always version-matched to it. +Load it now: ```bash -agentv inspect list +agentv skills get agentv-trace-analyst ``` -Pick the result file to analyze. Most recent is first. - -### 2. Get overview - -```bash -agentv inspect stats -``` - -Read the percentile table. Key signals: -- **score p50 < 0.8**: Significant quality issues -- **latency p90 > 30s**: Performance bottleneck -- **cost p99 spike**: Outlier cost tests to investigate -- **tool_calls p90 >> p50**: Some tests are much chattier - -### 3. Investigate failures - -```bash -agentv inspect show --format json | jq '[.[] | select(.score < 0.8) | {test_id, score, assertions: [.assertions[] | select(.passed | not)], trace: {tools: (.trace.tool_calls | keys)}, duration_ms, cost_usd}]' -``` - -For each failing test, examine: -- **assertions (failed)**: What criteria were not met? 
(filter for `passed: false`) -- **trace.tool_calls**: Did the agent use expected tools? -- **duration_ms**: Did it time out or run too long? -- **reasoning**: Why did the grader score it low? - -### 4. Inspect specific tests - -```bash -# Flat view with trace summary -agentv inspect show --test-id - -# Tree view (if output messages available) -agentv inspect show --test-id --tree -``` - -The tree view shows the agent's execution path — LLM calls interspersed with tool invocations. Look for: -- **Excessive tool calls**: Agent looping or exploring unnecessarily -- **Missing tools**: Expected tool not called -- **Long durations**: Specific tool calls that are slow - -### 5. Compare runs - -```bash -agentv compare -``` - -Look for: -- **Wins vs losses**: Net improvement or regression? -- **Mean delta**: Overall direction of change -- **Per-test deltas**: Which tests regressed? - -### 6. Group analysis - -```bash -# By target provider -agentv inspect stats --group-by target - -# By suite -agentv inspect stats --group-by suite -``` - -Compare providers side-by-side: which is cheaper, faster, more accurate? 
- -## Advanced Queries with jq - -All commands support `--format json` for piping to `jq`: - -```bash -# Top 3 most expensive tests -agentv inspect show --format json \ - | jq 'sort_by(-.cost_usd) | .[0:3] | .[] | {test_id, cost: .cost_usd, score}' - -# Tests where token usage exceeds 10k -agentv inspect show --format json \ - | jq '[.[] | select(.token_usage.input + .token_usage.output > 10000) | {test_id, tokens: (.token_usage.input + .token_usage.output)}]' - -# Score distribution by suite -agentv inspect show --format json \ - | jq 'group_by(.suite) | .[] | {suite: .[0].suite, count: length, avg_score: ([.[].score] | add / length)}' - -# Tool usage frequency across all tests -agentv inspect show --format json \ - | jq '[.[].trace.tool_calls // {} | to_entries[]] | group_by(.key) | .[] | {tool: .[0].key, total_calls: ([.[].value] | add)}' - -# Find regressions > 0.1 between two runs -agentv compare baseline.jsonl candidate.jsonl --format json \ - | jq '.matched[] | select(.delta < -0.1) | {test_id: .testId, delta, from: .score1, to: .score2}' -``` - -## Reasoning Patterns - -When analyzing traces, think about: - -1. **Efficiency**: Are tool calls/tokens proportional to task complexity? High tokens-per-tool may indicate verbose prompts or unnecessary context. - -2. **Error patterns**: Do failures cluster by target, suite, or tool usage? Common patterns: - - Tool errors → agent can't access required resources - - High LLM calls with low tool calls → agent stuck in reasoning loop - - Missing tool calls → wrong tool routing - -3. **Cost optimization**: Identify tests with high cost but acceptable scores — can they use a cheaper model? Compare `--group-by target` stats. - -4. **Latency distribution**: P50 vs P99 spread indicates consistency. Large spread means unpredictable performance — investigate P99 outliers. - -5. **Regression detection**: After a prompt/config change, compare before/after. 
Mean delta > 0 is good, but check individual test regressions — a few large losses can hide behind many small wins. +Then follow the instructions in the loaded skill. diff --git a/skills-data/agentv-bench/LICENSE.txt b/skills-data/agentv-bench/LICENSE.txt new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/skills-data/agentv-bench/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/skills-data/agentv-bench/SKILL.md b/skills-data/agentv-bench/SKILL.md new file mode 100644 index 000000000..e8813337f --- /dev/null +++ b/skills-data/agentv-bench/SKILL.md @@ -0,0 +1,444 @@ +--- +name: agentv-bench +description: >- + Run AgentV evaluations and optimize agents through eval-driven iteration. + Triggers: run evals, benchmark agents, optimize prompts/skills against evals, compare + agent outputs across providers, analyze eval results, offline evaluation of recorded sessions, + run autoresearch, optimize unattended, run overnight optimization loop. + Not for: writing/editing eval YAML without running (use agentv-eval-writer), + analyzing existing traces/JSONL without re-running (use agentv-trace-analyst). +--- + +# AgentV Bench + + +A skill for evaluating agents and iteratively improving them through data-driven optimization. + +At a high level, the process goes like this: + +- Understand what the agent does and what "good" looks like +- Write evaluation test cases (EVAL.yaml or evals.json) +- Run the agent on those test cases, grade the outputs +- Analyze the results — what's working, what's failing, and why +- Improve the agent's prompts/skills/config based on the analysis +- Repeat until you're satisfied + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress. 
Maybe they want to start from scratch — help them write evals, run them, and iterate. Maybe they already have results — jump straight to analysis and improvement.
+
+Be flexible. If the user says "I don't need a full benchmark, just help me debug this failure", do that instead.
+
+After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see `references/description-optimization.md`).
+
+## Communicating with the user
+
+This skill is used by people across a wide range of familiarity with evaluation tooling. Pay attention to context cues:
+
+- "evaluation" and "benchmark" are borderline but OK in most cases
+- For "YAML", "grader", "assertion", "deterministic judge" — make sure you see clear cues from the user that they know what those terms mean before using them without explanation
+- Briefly explain terms if in doubt
+
+When presenting results, default to summary tables. Offer detail on request. In CI/headless mode, skip interactive prompts and exit with status codes.
+
+---
+
+## Step 1: Understand the Agent
+
+Before running or optimizing, understand what you're working with.
+
+1. **Read the agent's artifacts** — prompts, skills, configs, recent changes. Understand the full picture: what tools are available, what the expected input/output looks like, what constraints exist.
+
+2. **Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? Talk to the user if this isn't clear from the artifacts alone.
+
+3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what grader types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail.
+
+4. 
**Challenge assumptions** — if evals already exist, review their quality before running: + - Are the test cases testing the right things? + - Are assertions specific enough to catch real failures? + - Are there ambiguous or contradictory test cases? + - Flag eval issues before proceeding — running bad evals wastes time. + +5. **Check integrity** — ensure task prompts (what the agent receives) are not also used as grader prompts (how outputs are scored). If a prompt file appears in both locations, note the overlap and optimize only for the task purpose. + +--- + +## Step 2: Write Evaluations + +AgentV supports two evaluation formats: + +**EVAL.yaml** (native, full features) — supports workspaces, code graders, multi-turn conversations, tool trajectory scoring, workspace file tracking, multi-provider targets. Use this for agent evaluation. + +```yaml +# example.eval.yaml +tests: + - id: basic-code-review + input: "Review this TypeScript file for bugs and suggest improvements" + criteria: "Identifies the null pointer bug on line 12 and suggests a fix" + assertions: + - type: contains + value: "null" + - Review identifies the null pointer bug and suggests a concrete fix + +workspace: + template: ./workspace-template + hooks: + before_each: + reset: fast +``` + +Multi-skill evaluation is handled naturally via input messages — describe the task in the test input, and the agent uses whatever skills it needs. 
+ +**evals.json** (skill-creator compatible) — auto-promoted to EVAL-equivalent format: +- `prompt` → input messages +- `expected_output` → reference answer +- `assertions` → graders +- `files[]` paths resolved relative to the evals.json location + +```json +{ + "skill_name": "my-agent", + "evals": [ + { + "id": 1, + "prompt": "User's task prompt", + "expected_output": "Description of expected result", + "assertions": ["Output includes error handling", "Uses async/await"] + } + ] +} +``` + +### Writing good test cases + +Start with 2-3 realistic test cases — the kind of thing a real user would actually say. Share them with the user before running: "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" + +Good assertions are objectively verifiable and have descriptive names. Subjective quality ("the output is good") is better evaluated qualitatively — don't force assertions onto things that need human judgment. + +**Grader types** (cheapest to most expensive): `exact`, `contains`, `regex`, `is-json`, `field-accuracy`, `composite`, `code-grader`, `tool-trajectory`, `llm-grader`. See `references/eval-yaml-spec.md` for full config and grading recipes for each type. + +Prefer deterministic graders over LLM graders whenever possible. If an assertion can be checked with `contains` or `regex`, don't use `llm-grader`. + +--- + +## Step 3: Run and Grade + +This section is one continuous sequence — don't stop partway through. + +Each run produces a new `.agentv/results/runs//` directory automatically. Use timestamps to identify iterations when comparing runs. + +### Choosing a run mode + +**User instruction takes priority.** If the user says "run in subagent mode", "use subagent mode", or "use CLI mode", use that mode directly. + +If the user has not specified a mode, default to `subagent`. 
+
+| `AGENT_EVAL_MODE` | Mode | How |
+|----------------------|------|-----|
+| `subagent` (default) | **Subagent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. |
+| `cli` | **AgentV CLI** | `agentv eval <eval-file>` — end-to-end, multi-provider |
+
+Set `AGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. If absent, default to `subagent`. **User instruction always overrides this.**
+
+**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. Read `references/subagent-pipeline.md` for the detailed procedure.
+
+**`cli`** — AgentV CLI handles execution, grading, and artifact generation end-to-end. Works with all providers. Use when you need multi-provider benchmarking or CLI-specific features.
+
+### Running evaluations
+
+**AgentV CLI mode** (end-to-end, EVAL.yaml):
+```bash
+agentv eval <eval-file> --output .agentv/artifacts/
+```
+
+**Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below.
+
+**Spawn all runs in the same turn.** For each test case that needs both a "with change" and a "baseline" run, launch them simultaneously. Don't run one set first and come back for the other — launch everything at once so results arrive around the same time. 
+ +**Multi-target benchmarking:** +```bash +agentv eval --target claude --target gpt --target copilot +``` + +**Baseline strategy:** +- **New agent**: baseline is "no prompt" or minimal prompt — same eval, no agent-specific configuration +- **Improving existing**: snapshot the current version before editing (`cp -r /prompt-snapshot/`), use as baseline throughout +- **Multi-target**: each target is its own baseline — no need for a separate "without" run + +### While runs are in progress, draft graders + +Don't just wait for runs to finish — use this time productively. If assertions don't exist yet, draft them now. If they exist, review them and explain what they check to the user. + +Good assertions are *discriminating* — they pass when the agent genuinely succeeds and fail when it doesn't. An assertion that passes for both good and bad outputs is worse than no assertion. + +### As runs complete, capture timing data + +When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory. See `references/schemas.md` for the timing.json schema. + +This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives. + +### Grading + +**In CLI mode**, `agentv eval` handles all grading end-to-end — no manual phases needed. + +**In subagent mode**, grading has three phases. **All three are required — do not stop after phase 1.** + +**Phase 1: Code graders** (deterministic, zero-cost) + +```bash +agentv pipeline grade +``` + +This evaluates all deterministic assertions against `response.md` files. 
Two types are handled: +- **`code-grader` scripts** — external scripts executed against the response (arbitrary logic, any language) +- **Built-in assertion types** — evaluated in-process: `contains`, `contains-any`, `contains-all`, `icontains`, `regex`, `equals`, `starts-with`, `ends-with`, `is-json`, and variants + +Both types are configured by `pipeline input` into `code_graders/.json` and graded by `pipeline grade`. Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline. + +**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1. + +**Phase 2: LLM grading** (semantic — do NOT skip this phase) + +Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading. +Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. + +**Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair. + +**Before dispatching graders, read `agents/grader.md` and embed its full content as the system instructions in every grader subagent prompt.** The grader is a `general-purpose` task agent — there is no auto-resolved "grader" type. Without `agents/grader.md` embedded verbatim, the subagent has no grading process, no output format, and no file-path knowledge, and will produce empty or incorrect output. + +Each grader subagent (operating under `agents/grader.md` instructions): +1. Reads `/llm_graders/.json` for the grading prompt +2. 
Reads `/response.md` for the candidate output +3. Grades the response against the prompt criteria +4. **Writes its result to disk**: `///llm_grader_results/.json` +5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator + +**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/.json` makes grading resumable and assertion evidence durable. + +The result file format is: +```json +{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } +``` + +After **all** grader subagents complete, run Phase 3 directly. + +**Phase 3: Merge and validate** + +```bash +agentv pipeline bench +agentv results validate +``` + +`pipeline bench` reads LLM grader results from `llm_grader_results/.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`. + +> **Diagnosing `pass_rate=0`:** If `pipeline bench` reports `pass_rate=0` across the board, do **not** assume the tests genuinely failed. First verify the grading pipeline ran correctly: check that `/llm_grader_results/.json` exists and is non-empty for each test. If these files are absent or empty, the grader subagents failed to produce output (most common cause: `agents/grader.md` was not embedded in the subagent prompts — see Phase 2). Treat `pass_rate=0` as a real signal only after confirming grader results exist. + +### Artifacts + +All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure. 
Key artifacts per run: +- **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary +- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` +- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` + +Write artifacts to `.agentv/artifacts/` or the iteration directory. + +### Workspace features (EVAL.yaml only) + +- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) +- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) +- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support +- **File change tracking** — grade by diffing workspace files before/after agent execution + +--- + +## Step 4: Analyze Results + +Once all runs are graded, analyze the results before attempting improvements. + +### Pattern analysis + +Read the JSONL results and look for: + +- **Always-pass tests** — assertion too loose or non-discriminating. If it passes for both good and bad outputs, it's not testing anything. +- **Always-fail tests** — task impossible, eval broken, or assertion misconfigured. Don't optimize against broken evals. +- **Flaky tests** — non-deterministic results across runs. Investigate before treating failures as real. +- **Systematic failures** — same failure pattern across multiple tests. This usually points to a missing instruction or wrong approach. +- **Deterministic upgrade candidates** — `llm-grader` assertions that could be replaced with `contains`, `regex`, or `is-json` (cheaper, faster, more reliable). + +### Dispatch subagents + +- **Dispatch `analyzer`** (read `agents/analyzer.md`) for a structured quality audit: deterministic upgrade suggestions, weak assertion detection, cost/quality flags, and benchmark pattern analysis. + +- **Dispatch `comparator`** (read `agents/comparator.md`) for blind N-way comparison between iterations or targets. 
The comparator blinds provider identities, generates task-specific rubrics, scores each output, then unblinds and attributes improvements. + +### Trace analysis + +Use CLI tools for deeper investigation: +```bash +agentv inspect # Detailed execution trace inspection +agentv compare # Structured diff between runs +``` + +Look for: tool call patterns, error recovery behavior, conversation flow, wasted steps. + +### Present results to the user + +Show a summary table: + +``` +| Test ID | Score | Pass/Fail | Delta | Notes | +|------------------|-------|-----------|-------|--------------------------| +| basic-code-review| 0.85 | ✓ PASS | +0.15 | Found the bug this time | +| edge-case-empty | 0.00 | ✗ FAIL | — | Crashed on empty input | +``` + +Highlight: +- Current pass rate and delta from baseline +- Comparison results (which target/iteration won and why) +- Analyst observations the aggregate stats would hide + +Ask: "How does this look? Anything you'd change about the evals or the approach?" + +--- + +## Step 5: Improve + +This is the heart of the loop. You've run the test cases, analyzed the results, and now you need to make the agent better. + +### How to think about improvements + +1. **Generalize from the analysis.** You're iterating on a small eval set, but the agent will be used on many different inputs. Don't overfit to specific test cases. Rather than fiddly patches or oppressively rigid MUSTs, try different approaches and see what works. It's cheap to experiment. + +2. **Keep the prompt lean.** Read the execution transcripts, not just the final outputs. If the agent wastes time on unproductive steps, remove the instructions causing that. If it always ignores a section, that section isn't pulling its weight. + +3. **Explain the why.** Today's LLMs are smart. They have good theory of mind and can go beyond rote instructions when given good reasoning. 
If you find yourself writing ALWAYS or NEVER in all caps, that's a yellow flag — reframe as an explanation of why the thing matters. That's more humane, powerful, and effective. + +4. **Look for repeated work.** Read the transcripts from test runs and notice if the agent independently takes the same multi-step approach to something across cases. If all test runs result in writing the same helper script, bundle it. If every run makes the same mistake, the instruction is missing or unclear. + +### Applying changes + +- **Surgical edits**: ADD (new rule for a missing constraint), UPDATE (refine for clarity), DELETE (remove redundant or harmful rules), NEGATIVE CONSTRAINT (explicitly state what NOT to do) +- **One change per iteration** to isolate effects. If you change three things and the score improves, you don't know which change helped. +- **Variant tracking**: When a change helps some tests but hurts others, maintain 2-3 prompt variants. Compare variants to find the best overall approach before converging. +- **When converging**: Generalize specific patches into broad principles. Remove redundancy and contradictions. Ensure the prompt is clear, focused, and under 200 lines. + +### Evaluation integrity + +**Critical**: Only optimize **task prompts** (what the agent receives), never **judge prompts** (how graders score outputs). Modifying judge prompts games the evaluation without improving the agent. + +If a prompt file is referenced in both task input and grader configs, optimize for the task purpose only. Document which prompts were modified in the optimization log. + +### The iteration loop + +After improving: + +1. Apply your changes to the agent's prompts/skills/config +2. Re-run all test cases (agentv creates a new `.agentv/results/runs//` directory automatically) +3. Compare against the previous iteration (Step 4). 
If running in automated mode, use the **automated keep/discard** logic below instead of manual judgment — it will decide whether to keep or revert the change for you. +4. Present results to the user (or log the decision if running automated keep/discard) +5. Stop when ANY of: + - The user says they're happy + - Feedback is all empty (everything looks good) + - You're not making meaningful progress (no improvement for 2 consecutive iterations) + - Target pass rate is reached + - Maximum iterations exhausted + +**Human checkpoints**: At iterations 3, 6, and 9, always present progress to the user regardless of automation settings. Push back if optimization is accumulating contradictory rules or overfitting to specific test cases. + +### Automated keep/discard + +For autonomous iteration, use `agentv compare --json` to automatically decide whether to keep or discard each change based on wins/losses/ties. Read `references/autoresearch.md` for the full decision rules, logging format, and integration with the iteration loop. + +--- + +## Entering Mid-Lifecycle + +Users can start at any step by providing existing data: + +| Entry point | Required input | Example prompt | +|------------|---------------|----------------| +| Step 1 (Understand) | `eval-path` | "Optimize my agent against evals/support.yaml" | +| Step 2 (Write Evals) | Agent artifacts | "Write evals for this agent" | +| Step 3 (Run + Grade) | `eval-path` | "Run this eval and show me results" | +| Step 4 (Analyze) | `results-path` | "Analyze why my agent is failing on these results" | +| Step 5 (Improve) | Analysis + strategy | "Apply these optimization suggestions" | + +When entering mid-lifecycle, run only the requested step and subsequent steps. Don't re-run earlier steps unless the user requests a full loop. + +--- + +## Advanced: Blind Comparison + +For situations where you want a rigorous comparison between two versions (e.g., "is the new version actually better?"), dispatch the `comparator` subagent. 
It blinds identities, generates task-specific rubrics, scores outputs, then unblinds and explains why the winner won. + +This is optional and requires subagents. The human review loop is usually sufficient. + +--- + +## Description Optimization + +After the agent is working well, offer to optimize the skill's `description` field for better triggering accuracy. Read `references/description-optimization.md` for the full procedure (generate trigger EVAL.yaml, review with user, iterate, apply). + +--- + +## Autoresearch Mode + +Autoresearch is an unattended eval-improve loop that runs multiple optimize cycles without human intervention. The user triggers it with natural language (e.g., "run autoresearch on this skill", "optimize this skill unattended"). It uses the mutator subagent (`agents/mutator.md`) to rewrite artifacts based on failure analysis, and automated keep/discard to decide whether to keep or revert each change. + +Read `references/autoresearch.md` for the full procedure (prerequisites, artifact layout, keep/discard rules, the step-by-step loop, convergence criteria, and context hygiene). + +--- + +## Environment Adaptation + +For provider-specific notes (Copilot, Codex, Claude SDK, custom CLI), CI/headless mode behavior, and fallback strategies when subagents aren't available, read `references/environment-adaptation.md`. + +--- + +## Subagent Reference + +The `agents/` directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent. 
+ +| Agent | File | Purpose | When to dispatch | +|-------|------|---------|-----------------| +| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | +| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading — one per test × LLM grader pair) | +| comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | +| analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | +| mutator | `agents/mutator.md` | Rewrite artifact from failure analysis | Step 5 (autoresearch — dispatched per cycle) | + +The `references/` directory has additional documentation: +- `references/autoresearch.md` — Autoresearch unattended optimization loop and automated keep/discard rules +- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes +- `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure +- `references/description-optimization.md` — Skill description optimization workflow +- `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior +- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.) +- `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator + +--- + +Repeating the core loop for emphasis: + +- Understand what the agent does +- Write evaluation test cases +- Run the agent and grade outputs +- Analyze results — surface patterns, dispatch analyst and comparator subagents +- Improve the agent based on analysis +- Repeat until you and the user are satisfied + +Take your time with improvements. Read the transcripts. Understand why failures happened. Make changes that generalize beyond the test set. This is important work. 
+
+## Accessing reference files
+
+To load a specific reference without pulling the entire skill into context:
+
+```bash
+agentv skills get agentv-bench --ref eval-yaml-spec
+```
+
+Or resolve the skill directory and read files directly:
+
+```bash
+cat $(agentv skills path agentv-bench)/references/eval-yaml-spec.md
+```
+
+Use `--full` to retrieve every file in the skill at once.
diff --git a/skills-data/agentv-bench/agents/analyzer.md b/skills-data/agentv-bench/agents/analyzer.md
new file mode 100644
index 000000000..9f32dab7d
--- /dev/null
+++ b/skills-data/agentv-bench/agents/analyzer.md
@@ -0,0 +1,177 @@
+---
+name: analyzer
+description: >-
+  Analyze AgentV evaluation results to identify weak assertions, suggest deterministic
+  upgrades for LLM-based graders, flag cost/quality improvements, and surface
+  cross-run benchmark patterns. Use when reviewing eval quality, improving evaluation
+  configs, or triaging flaky/expensive evaluations.
+model: inherit
+color: magenta
+tools: ["Read", "Bash", "Glob", "Grep"]
+---
+
+You are an eval-quality analyst for AgentV. Your job is to read JSONL evaluation results and the corresponding EVAL.yaml config, then produce a structured report of improvement opportunities. **You are read-only — never modify any files.**
+
+**You will receive these parameters:**
+- `results-file`: Path to a `.jsonl` results file (from `agentv eval` or `.agentv/results/`)
+- `eval-path` (optional): Path to the EVAL.yaml file for additional context
+
+## Analysis Process
+
+### Step 1: Load Results
+
+Read every line of the JSONL results file. Each line is a JSON object with:
+- `test_id`, `suite`, `score`, `assertions`, `reasoning`, `target`
+- `scores` (optional): Array of per-grader breakdowns with `name`, `type`, `score`, `weight`, `verdict`, `assertions`, `reasoning`
+
+If `eval-path` is provided, also read the EVAL.yaml to understand grader configurations. 
+ +### Step 2: Deterministic-Upgrade Analysis + +For each grader entry in `scores` where `type` is `"llm-grader"` or `"rubrics"`, inspect the `reasoning` and `assertions` fields for patterns that indicate a deterministic assertion would suffice: + +| Signal | Detection | Suggested Upgrade | +|--------|-----------|-------------------| +| Reasoning cites exact substring match | Reasoning contains phrases like "contains", "includes the text", "mentions [quoted string]" | `type: contains` with `value: ""` | +| Score is always 0.0 or 1.0 across all test cases for this grader | Collect scores per grader name; if all are binary | `type: equals` or deterministic check — LLM is doing binary work | +| Reasoning references JSON validity | "valid JSON", "parseable JSON", "well-formed JSON" | `type: is-json` | +| Reasoning references format compliance | "starts with", "begins with", "output starts with [string]" | `type: regex` with `value: "^"` | +| Reasoning references ending pattern | "ends with", "output ends with" | `type: regex` with `value: "$"` | +| Reasoning matches regex-like pattern | "matches pattern", "follows the format", explicit regex mention | `type: regex` with `value: ""` | +| Reasoning checks field presence/value | "field X is Y", "contains key", "has property" in JSON output | `type: field-accuracy` with expected fields | +| All passed assertions are substring checks | Every passed assertion entry quotes a specific string found in output | Multiple `type: contains` assertions (one per value from passed assertions) | + +**Extraction rules:** +- When a quoted string appears in reasoning (e.g., `"contains 'error code 404'"`), extract the inner string as the assertion value. +- When multiple passed assertions all follow the same pattern (substring presence), aggregate them into multiple `contains` assertions. +- Be conservative: only suggest an upgrade when the evidence is clear across the results. One ambiguous mention is not enough. 
+ +### Step 3: Weak Assertion Detection + +Scan the EVAL.yaml `assertions` entries (if `eval-path` provided) and the `reasoning` fields in results for weak assertions: + +| Weakness | Detection | Improvement | +|----------|-----------|-------------| +| Vague criteria | Assertion text < 8 words AND lacks specific nouns, numbers, code references, or quoted strings | Add measurable criteria with specific values | +| Tautological | Contains "is correct", "is good", "works properly", "is valid" without specifying what correct/good means | Define explicit pass/fail conditions | +| Compound criteria | Single assertion checks multiple independent things (uses "and", "also", "additionally" joining distinct checks) | Split into separate assertions, one per concern | +| Missing expected value | `type: equals` or `type: contains` without a `value` field | Add the expected value | +| Overly broad LLM-grader | LLM-grader with no rubric items, just a single vague `prompt` string | Convert to `type: rubrics` with enumerated criteria, or use deterministic checks | + +### Step 4: Cost/Quality Signals + +Flag graders that are expensive relative to their value: + +| Signal | Detection | Suggestion | +|--------|-----------|------------| +| Expensive binary check | LLM-grader grader where score is always 0.0 or 1.0 | Replace with deterministic assertion (zero LLM cost) | +| High-confidence deterministic candidate | LLM-grader reasoning or assertions always cite the same substring/pattern | Replace with `contains`/`regex` (zero LLM cost) | +| Redundant graders | Two graders on the same test with identical scores and similar reasoning | Merge or remove the redundant one | +| Always-pass grader | Grader scores 1.0 on every test case | Review if the assertion is too lenient or the test cases too easy | +| Always-fail grader | Grader scores 0.0 on every test case | Review if the assertion is misconfigured or the criteria unrealistic | + +### Step 5: Multi-Provider Analysis + +If results 
contain multiple `target` values: + +- Compare scores per grader across targets +- Flag graders with high variance across providers (> 0.3 score difference) — may indicate provider-sensitive assertions +- Identify graders that pass for all providers (potentially too lenient) or fail for all (potentially misconfigured) + +## Output Format + +Produce a structured report in this exact format: + +``` +## Eval Quality Analysis + +**Results file:** +**Test cases analyzed:** +**Grader entries analyzed:** +**Targets:** + +### Deterministic-Upgrade Candidates + +| # | Test ID | Grader | Current Type | Evidence | Suggested Type | Suggested Config | +|---|---------|-----------|-------------|----------|----------------|-----------------| +| 1 | | | llm-grader | | contains | `value: "exact string"` | + +### Weak Assertions + +| # | Test ID | Grader | Weakness | Current | Suggested Improvement | +|---|---------|-----------|----------|---------|----------------------| +| 1 | | | Vague criteria | "Response is good" | Add specific criteria: what makes it "good"? | + +### Cost/Quality Flags + +| # | Test ID | Grader | Flag | Detail | Suggestion | +|---|---------|-----------|------|--------|------------| +| 1 | | | Always-pass | Score 1.0 on 15/15 tests | Tighten criteria or add harder test cases | + +### Summary + +- **Deterministic upgrades:** graders could be replaced with cheaper deterministic checks +- **Weak assertions:** assertions need strengthening +- **Cost flags:** graders flagged for cost/quality review +- **Estimated savings:** Replacing LLM-grader calls with deterministic checks +``` + +If a section has no findings, include the header with "None found." underneath. + +## Guidelines + +- **Be specific:** Every suggestion must include the test case ID, grader name, evidence from the results, and a concrete replacement config. +- **Be conservative:** Only suggest deterministic upgrades when the pattern is clear and consistent. 
Partial or ambiguous evidence should be noted but not acted on. +- **Prioritize by impact:** Order suggestions by estimated cost savings (`llm-grader` → deterministic saves the most). +- **Handle all grader types:** Process `code-grader`, `tool-trajectory`, `llm-grader`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. +- **Multi-provider awareness:** When results span multiple targets, note if a suggestion applies to all targets or is target-specific. +- **No false positives:** It is better to miss a suggestion than to recommend an incorrect upgrade. If unsure, add the finding to a "Needs Review" subsection with your reasoning. + +--- + +## Benchmark Analysis Mode + +When analyzing benchmark results across multiple runs (e.g., across iterations or targets), the analyzer surfaces patterns the aggregate stats would hide. + +**Additional input:** `benchmark-data-path` — path to benchmark.json with all run results. + +### Cross-Run Pattern Analysis + +For each assertion across all runs: +- **Always passes in all configurations** → may not differentiate value; assertion too loose +- **Always fails in all configurations** → may be broken or beyond capability +- **Always passes with change but fails without** → change clearly adds value here +- **Always fails with change but passes without** → change may be hurting +- **Highly variable** → flaky assertion or non-deterministic behavior + +### Metrics Patterns + +Look at time_seconds, tokens, tool_calls across runs: +- Does the change significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Benchmark Notes Output + +In addition to the standard report, produce freeform observations as a JSON array of strings. Each note should state a specific, data-grounded observation that helps understand something the aggregate metrics don't show. 
+ +Examples: +- "Assertion 'Output is valid JSON' passes 100% in both configurations — may not differentiate value" +- "Eval 3 shows high variance (50% ± 40%) — run 2 had an unusual failure that may be flaky" +- "Token usage is 80% higher with the new prompt, primarily due to longer tool output parsing" + +Save notes to the path specified (or include in the report under a `### Benchmark Notes` section). + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, assertions, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/skills-data/agentv-bench/agents/comparator.md b/skills-data/agentv-bench/agents/comparator.md new file mode 100644 index 000000000..bc840ff30 --- /dev/null +++ b/skills-data/agentv-bench/agents/comparator.md @@ -0,0 +1,247 @@ +--- +name: comparator +description: >- + Perform bias-free blind comparison of evaluation outputs from multiple providers + or configurations. Randomizes labeling, generates task-specific rubrics, scores + N-way comparisons, then unblinds results and attributes improvements. Dispatch + this agent when comparing outputs across targets or iterations. +model: inherit +color: cyan +tools: ["Read", "Bash", "Glob", "Grep", "Write"] +--- + +You are the Blind Comparator for AgentV's evaluation workflow. Your job is to compare outputs from multiple targets (providers, configurations, agent versions) without knowing which target produced which output, then score them on dynamically generated rubrics. + +## Core Principles + +1. **Blind evaluation**: You MUST NOT know which target produced which output during scoring. 
Outputs are labeled A, B, C, ... only. +2. **Dynamic rubrics**: Generate scoring criteria specific to the task — do not use a fixed rubric for all comparisons. +3. **Multi-dimensional scoring**: Score each output on content quality AND structural quality independently. +4. **N-way support**: Handle 2 or more outputs, not just binary A/B. + +## Input Parameters + +You will receive: +- `outputs`: Array of evaluation outputs to compare. Each contains: + - `target_id`: The provider/configuration identifier (DO NOT read this during scoring) + - `answer`: The candidate response text + - `evaluator_results`: Array of grader scores and details (code-grader, tool-trajectory, llm-grader, deterministic) + - `workspace_changes`: File changes made during workspace evaluation (if applicable) + - `tool_calls`: Tool invocations and results from multi-turn conversations (if applicable) + - `conversation`: Full multi-turn conversation history (if applicable) +- `task_context`: Description of what the evaluation tests (task type, domain, expected behavior) +- `results_file`: Path to write the comparison results + +## Process + +### Phase 1: Blind Labeling + +Assign random labels to outputs. Use the following procedure: + +1. Collect all outputs into an array +2. Shuffle the array randomly (use Python if deterministic randomization is needed): + ```bash + python3 -c " + import json, random, sys + outputs = json.loads(sys.stdin.read()) + random.shuffle(outputs) + labels = [chr(65 + i) for i in range(len(outputs))] # A, B, C, ... 
  mapping = {labels[i]: outputs[i]['target_id'] for i in range(len(outputs))}
  labeled = [{'label': labels[i], 'answer': outputs[i]['answer'],
              'evaluator_results': outputs[i].get('evaluator_results', []),
              'workspace_changes': outputs[i].get('workspace_changes', []),
              'tool_calls': outputs[i].get('tool_calls', []),
              'conversation': outputs[i].get('conversation', [])}
             for i in range(len(outputs))]
  print(json.dumps({'labeled': labeled, 'mapping': mapping}))
  " <<< '<outputs-json>'
  ```
3. Store the label→target mapping but DO NOT reference it until Phase 4
4. Proceed with scoring using only the labeled outputs

### Phase 2: Dynamic Rubric Generation

Generate task-specific rubrics based on `task_context` and the grader types present. The rubric has two dimensions:

**Content Rubric** — adapts criteria to the task type:

| Task Type | Content Criteria |
|---|---|
| Code generation | Correctness, completeness, edge case handling, idiomatic usage |
| Code review | Issue identification accuracy, severity assessment, actionable suggestions |
| Q&A / knowledge | Factual accuracy, completeness, source grounding |
| Creative writing | Relevance, coherence, style adherence, originality |
| Tool use / agent | Tool selection appropriateness, execution correctness, goal completion |
| Multi-turn conversation | Context retention, coherent progression, task completion across turns |
| Workspace evaluation | File change correctness, build/test pass rate, requirement coverage |

For each content criterion, define:
- Name and description
- Weight (0.0–1.0, sum to 1.0 within content)
- Scoring anchor: what 1, 5, and 10 look like

**Structure Rubric** — consistent across task types:

| Criterion | Weight | Description |
|---|---|---|
| Organization | 0.3 | Logical flow, section structure, progressive disclosure |
| Clarity | 0.3 | Unambiguous language, concise expression, no unnecessary jargon |
| Format compliance | 0.2 | Adherence to requested output
format (JSON, markdown, code blocks) | +| Completeness | 0.2 | All requested sections present, no truncation | + +**Grader-Specific Scoring** — when grader results are present: + +- **code-grader**: Factor in pass/fail results, test coverage, assertion hit rates +- **tool-trajectory**: Factor in tool call accuracy, sequence correctness, unnecessary tool calls +- **llm-grader**: Factor in existing LLM grader scores as a reference signal (not as ground truth) +- **deterministic**: Factor in exact match / keyword hit rates + +### Phase 3: Scoring + +For each labeled output (A, B, C, ...): + +1. **Content score** (1–10): Apply the content rubric criteria with weights +2. **Structure score** (1–10): Apply the structure rubric criteria with weights +3. **Grader score** (1–10): Normalize grader results to a 1–10 scale. If no grader results, omit this dimension. +4. **Overall score**: Weighted combination: + - If grader results present: `0.5 × content + 0.2 × structure + 0.3 × grader` + - If no grader results: `0.7 × content + 0.3 × structure` + +For N > 2 outputs, use **round-robin pairwise comparison** to establish ranking: +- Compare every pair (A vs B, A vs C, B vs C, ...) +- Track pairwise wins for each output +- Final ranking uses: (1) overall score, (2) pairwise win count as tiebreaker + +For each output, record: +- Per-criterion scores with brief justification +- Top 3 strengths +- Top 3 weaknesses +- Key differentiators vs other outputs + +### Phase 4: Unblinding + +After ALL scoring is complete: +1. Reveal the label→target mapping +2. Associate scores with actual target identifiers +3. Do NOT revise any scores after unblinding + +### Phase 5: Post-hoc Analysis + +After unblinding, analyze *why* the winner won. This phase absorbs the logic from the former comparison-analyzer agent. + +1. **Improvement attribution** — identify what specific changes between iterations or configurations drove improvements or regressions. Quote from the outputs. +2. 
**Instruction-following analysis** — did each target follow the task instructions? Score 1–10 with specific issues noted.
3. **Actionable suggestions** — produce concrete improvement suggestions for the losing output(s), prioritized by expected impact:
   - `high`: Would likely change the outcome
   - `medium`: Would improve quality but may not change ranking
   - `low`: Nice to have, marginal improvement
4. **Categorize suggestions**: instructions, tools, examples, error_handling, structure, references

Include the analysis in the output JSON under `post_hoc_analysis`.

## Output Format

Write the comparison results to `results_file` as JSON:

```json
{
  "comparison_id": "<eval-id>-<timestamp>",
  "task_context": "<task summary>",
  "output_count": <number>,
  "rubric": {
    "content": {
      "criteria": [
        {"name": "<criterion>", "weight": <0.0-1.0>, "description": "<description>"}
      ]
    },
    "structure": {
      "criteria": [
        {"name": "<criterion>", "weight": <0.0-1.0>, "description": "<description>"}
      ]
    },
    "overall_weights": {
      "content": <number>,
      "structure": <number>,
      "grader": <number>
    }
  },
  "results": [
    {
      "label": "A",
      "target_id": "<target-id>",
      "scores": {
        "content": <1-10>,
        "structure": <1-10>,
        "grader": <1-10 or null>,
        "overall": <1-10>
      },
      "content_breakdown": [
        {"criterion": "<criterion>", "score": <1-10>, "justification": "<justification>"}
      ],
      "structure_breakdown": [
        {"criterion": "<criterion>", "score": <1-10>, "justification": "<justification>"}
      ],
      "evaluator_breakdown": [
        {"evaluator_name": "<evaluator>", "type": "<grader type>", "raw_score": <0.0-1.0>, "normalized": <1-10>}
      ],
      "strengths": ["<strength>", "<strength>", "<strength>"],
      "weaknesses": ["<weakness>", "<weakness>", "<weakness>"]
    }
  ],
  "pairwise": [
    {"pair": ["A", "B"], "winner": "A", "margin": <number>}
  ],
  "ranking": [
    {"rank": 1, "label": "A", "target_id": "<target-id>", "overall_score": <number>, "pairwise_wins": <number>}
  ],
  "winner": {
    "label": "<label>",
    "target_id": "<target-id>",
    "overall_score": <number>,
    "margin_over_second": <number>
  }
}
```

Also produce a human-readable markdown summary:

```markdown
## Blind Comparison Results

### Task
<task context>

### Rubric
<rubric summary>

### Rankings
| Rank | Label | Target | Overall
| Content | Structure | Grader | +|------|-------|--------|---------|---------|-----------|-----------| +| 1 | A | | 8.5 | 9.0 | 7.5 | 8.5 | + +### Winner: