From 129cac2cfe6d411f96cd10686f4b09aff76fa420 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 7 May 2026 03:15:31 +0200 Subject: [PATCH 1/8] feat(cli): bundle agentv-dev skills and add `agentv skills` subcommand MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skills are now bundled inside the CLI npm package (`apps/cli/skills/` → `dist/skills/` at build time), version-matched to the binary. A new `agentv skills` subcommand serves the bundled content without any separate plugin install step. - `agentv skills list` — list available skill names (--json) - `agentv skills get ` — print SKILL.md content (--full, --json) - `agentv skills get --all` — print all skills - `agentv skills path []` — print resolved skills directory Resolution walks upward from the module file, validating by SKILL.md presence to avoid false matches. Prefers `dist/skills/` (production layout) over bare `skills/` (source layout). The marketplace plugin SKILL.md files are converted to discovery stubs that redirect agents to `agentv skills get `. Full skill content lives in `apps/cli/skills/` as the single source of truth. Docs: update installation.mdx so the canonical setup is `npm install -g agentv` alone; the allagents plugin step moves to an optional "Claude Code Plugin" section. Closes #1224 Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/skills/agentv-bench/LICENSE.txt | 202 + apps/cli/skills/agentv-bench/SKILL.md | 428 + .../skills/agentv-bench/agents/analyzer.md | 177 + .../skills/agentv-bench/agents/comparator.md | 247 + .../skills/agentv-bench/agents/executor.md | 30 + apps/cli/skills/agentv-bench/agents/grader.md | 238 + .../cli/skills/agentv-bench/agents/mutator.md | 172 + .../agentv-bench/assets/eval_review.html | 146 + .../agentv-bench/references/autoresearch.md | 309 + .../references/description-optimization.md | 66 + .../references/environment-adaptation.md | 82 + .../agentv-bench/references/eval-yaml-spec.md | 338 + .../migrating-from-skill-creator.md | 103 + .../skills/agentv-bench/references/schemas.md | 432 + .../references/subagent-pipeline.md | 166 + .../agentv-bench/scripts/trajectory.html | 462 + apps/cli/skills/agentv-eval-review/SKILL.md | 52 + .../agentv-eval-review/scripts/lint_eval.py | 239 + apps/cli/skills/agentv-eval-writer/SKILL.md | 691 + .../references/config-schema.json | 63 + .../references/custom-evaluators.md | 119 + .../references/eval-schema.json | 17200 ++++++++++++++++ .../references/rubric-evaluator.md | 114 + apps/cli/skills/agentv-governance/SKILL.md | 63 + .../references/eu-ai-act-risk-tiers.md | 37 + .../references/governance-yaml-shape.md | 125 + .../references/iso-42001-controls.md | 46 + .../references/lint-rules.md | 169 + .../references/mitre-atlas.md | 38 + .../references/owasp-agentic-top-10-2025.md | 28 + .../references/owasp-llm-top-10-2025.md | 25 + apps/cli/skills/agentv-onboarding/SKILL.md | 63 + .../scripts/onboard-agentv.ps1 | 58 + .../scripts/onboard-agentv.sh | 66 + apps/cli/skills/agentv-trace-analyst/SKILL.md | 145 + apps/cli/src/commands/init/index.ts | 4 +- apps/cli/src/commands/skills/index.ts | 256 + apps/cli/src/index.ts | 3 + apps/cli/test/unit/skills.test.ts | 151 + apps/cli/tsup.config.ts | 14 + .../docs/getting-started/installation.mdx | 63 +- .../agentv-dev/skills/agentv-bench/SKILL.md | 417 +- .../skills/agentv-eval-review/SKILL.md | 43 +- .../skills/agentv-eval-writer/SKILL.md | 679 +- .../skills/agentv-governance/SKILL.md | 54 +- .../skills/agentv-onboarding/SKILL.md | 58 +- 
.../skills/agentv-trace-analyst/SKILL.md | 134 +- 47 files changed, 23421 insertions(+), 1394 deletions(-) create mode 100644 apps/cli/skills/agentv-bench/LICENSE.txt create mode 100644 apps/cli/skills/agentv-bench/SKILL.md create mode 100644 apps/cli/skills/agentv-bench/agents/analyzer.md create mode 100644 apps/cli/skills/agentv-bench/agents/comparator.md create mode 100644 apps/cli/skills/agentv-bench/agents/executor.md create mode 100644 apps/cli/skills/agentv-bench/agents/grader.md create mode 100644 apps/cli/skills/agentv-bench/agents/mutator.md create mode 100644 apps/cli/skills/agentv-bench/assets/eval_review.html create mode 100644 apps/cli/skills/agentv-bench/references/autoresearch.md create mode 100644 apps/cli/skills/agentv-bench/references/description-optimization.md create mode 100644 apps/cli/skills/agentv-bench/references/environment-adaptation.md create mode 100644 apps/cli/skills/agentv-bench/references/eval-yaml-spec.md create mode 100644 apps/cli/skills/agentv-bench/references/migrating-from-skill-creator.md create mode 100644 apps/cli/skills/agentv-bench/references/schemas.md create mode 100644 apps/cli/skills/agentv-bench/references/subagent-pipeline.md create mode 100644 apps/cli/skills/agentv-bench/scripts/trajectory.html create mode 100644 apps/cli/skills/agentv-eval-review/SKILL.md create mode 100644 apps/cli/skills/agentv-eval-review/scripts/lint_eval.py create mode 100644 apps/cli/skills/agentv-eval-writer/SKILL.md create mode 100644 apps/cli/skills/agentv-eval-writer/references/config-schema.json create mode 100644 apps/cli/skills/agentv-eval-writer/references/custom-evaluators.md create mode 100644 apps/cli/skills/agentv-eval-writer/references/eval-schema.json create mode 100644 apps/cli/skills/agentv-eval-writer/references/rubric-evaluator.md create mode 100644 apps/cli/skills/agentv-governance/SKILL.md create mode 100644 apps/cli/skills/agentv-governance/references/eu-ai-act-risk-tiers.md create mode 100644 apps/cli/skills/agentv-governance/references/governance-yaml-shape.md create mode 100644 apps/cli/skills/agentv-governance/references/iso-42001-controls.md create mode 100644 apps/cli/skills/agentv-governance/references/lint-rules.md create mode 100644 apps/cli/skills/agentv-governance/references/mitre-atlas.md create mode 100644 apps/cli/skills/agentv-governance/references/owasp-agentic-top-10-2025.md create mode 100644 apps/cli/skills/agentv-governance/references/owasp-llm-top-10-2025.md create mode 100644 apps/cli/skills/agentv-onboarding/SKILL.md create mode 100644 apps/cli/skills/agentv-onboarding/scripts/onboard-agentv.ps1 create mode 100755 apps/cli/skills/agentv-onboarding/scripts/onboard-agentv.sh create mode 100644 apps/cli/skills/agentv-trace-analyst/SKILL.md create mode 100644 apps/cli/src/commands/skills/index.ts create mode 100644 apps/cli/test/unit/skills.test.ts diff --git a/apps/cli/skills/agentv-bench/LICENSE.txt b/apps/cli/skills/agentv-bench/LICENSE.txt new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/apps/cli/skills/agentv-bench/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/apps/cli/skills/agentv-bench/SKILL.md b/apps/cli/skills/agentv-bench/SKILL.md new file mode 100644 index 000000000..67df5b625 --- /dev/null +++ b/apps/cli/skills/agentv-bench/SKILL.md @@ -0,0 +1,428 @@ +--- +name: agentv-bench +description: >- + Run AgentV evaluations and optimize agents through eval-driven iteration. + Triggers: run evals, benchmark agents, optimize prompts/skills against evals, compare + agent outputs across providers, analyze eval results, offline evaluation of recorded sessions, + run autoresearch, optimize unattended, run overnight optimization loop. + Not for: writing/editing eval YAML without running (use agentv-eval-writer), + analyzing existing traces/JSONL without re-running (use agentv-trace-analyst). +--- + +# AgentV Bench + + +A skill for evaluating agents and iteratively improving them through data-driven optimization. + +At a high level, the process goes like this: + +- Understand what the agent does and what "good" looks like +- Write evaluation test cases (EVAL.yaml or evals.json) +- Run the agent on those test cases, grade the outputs +- Analyze the results — what's working, what's failing, and why +- Improve the agent's prompts/skills/config based on the analysis +- Repeat until you're satisfied + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress. Maybe they want to start from scratch — help them write evals, run them, and iterate. Maybe they already have results — jump straight to analysis and improvement. + +Be flexible. If the user says "I don't need a full benchmark, just help me debug this failure", do that instead. + +After the agent is working well, you can also run description optimization to improve skill triggering accuracy (see `references/description-optimization.md`). + +## Communicating with the user + +This skill is used by people across a wide range of familiarity with evaluation tooling. Pay attention to context cues: + +- "evaluation" and "benchmark" are borderline but OK in most cases +- For "YAML", "grader", "assertion", "deterministic judge" — see serious cues from the user that they know what those mean before using them without explanation +- Briefly explain terms if in doubt + +When presenting results, default to summary tables. Offer detail on request. In CI/headless mode, skip interactive prompts and exit with status codes. + +--- + +## Step 1: Understand the Agent + +Before running or optimizing, understand what you're working with. + +1. **Read the agent's artifacts** — prompts, skills, configs, recent changes. Understand the full picture: what tools are available, what the expected input/output looks like, what constraints exist. + +2. **Identify success criteria** — what does "good" look like for this agent? What are the edge cases? What would a failure look like? Talk to the user if this isn't clear from the artifacts alone. + +3. **Understand the target harness** — which provider runs the agent (Claude, GPT, Copilot CLI, Gemini, custom CLI)? This affects what grader types are available and how to run tests. Targets are configured in `.agentv/targets.yaml` (canonical location, searched from the eval file directory upward). Sensitive values like `api_key` must use `${{ ENV_VAR }}` syntax — literal secrets are rejected as a security guardrail. + +4. **Challenge assumptions** — if evals already exist, review their quality before running: + - Are the test cases testing the right things? 
+ - Are assertions specific enough to catch real failures? + - Are there ambiguous or contradictory test cases? + - Flag eval issues before proceeding — running bad evals wastes time. + +5. **Check integrity** — ensure task prompts (what the agent receives) are not also used as grader prompts (how outputs are scored). If a prompt file appears in both locations, note the overlap and optimize only for the task purpose. + +--- + +## Step 2: Write Evaluations + +AgentV supports two evaluation formats: + +**EVAL.yaml** (native, full features) — supports workspaces, code graders, multi-turn conversations, tool trajectory scoring, workspace file tracking, multi-provider targets. Use this for agent evaluation. + +```yaml +# example.eval.yaml +tests: + - id: basic-code-review + input: "Review this TypeScript file for bugs and suggest improvements" + criteria: "Identifies the null pointer bug on line 12 and suggests a fix" + assertions: + - type: contains + value: "null" + - Review identifies the null pointer bug and suggests a concrete fix + +workspace: + template: ./workspace-template + hooks: + before_each: + reset: fast +``` + +Multi-skill evaluation is handled naturally via input messages — describe the task in the test input, and the agent uses whatever skills it needs. + +**evals.json** (skill-creator compatible) — auto-promoted to EVAL-equivalent format: +- `prompt` → input messages +- `expected_output` → reference answer +- `assertions` → graders +- `files[]` paths resolved relative to the evals.json location + +```json +{ + "skill_name": "my-agent", + "evals": [ + { + "id": 1, + "prompt": "User's task prompt", + "expected_output": "Description of expected result", + "assertions": ["Output includes error handling", "Uses async/await"] + } + ] +} +``` + +### Writing good test cases + +Start with 2-3 realistic test cases — the kind of thing a real user would actually say. Share them with the user before running: "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" + +Good assertions are objectively verifiable and have descriptive names. Subjective quality ("the output is good") is better evaluated qualitatively — don't force assertions onto things that need human judgment. + +**Grader types** (cheapest to most expensive): `exact`, `contains`, `regex`, `is-json`, `field-accuracy`, `composite`, `code-grader`, `tool-trajectory`, `llm-grader`. See `references/eval-yaml-spec.md` for full config and grading recipes for each type. + +Prefer deterministic graders over LLM graders whenever possible. If an assertion can be checked with `contains` or `regex`, don't use `llm-grader`. + +--- + +## Step 3: Run and Grade + +This section is one continuous sequence — don't stop partway through. + +Each run produces a new `.agentv/results/runs//` directory automatically. Use timestamps to identify iterations when comparing runs. + +### Choosing a run mode + +**User instruction takes priority.** If the user says "run in subagent mode", "use subagent mode", or "use CLI mode", use that mode directly. + +If the user has not specified a mode, default to `subagent`. + +| `AGENT_EVAL_MODE` | Mode | How | +|----------------------|------|-----| +| `subagent` (default) | **Subagent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. | +| `cli` | **AgentV CLI** | `agentv eval ` — end-to-end, multi-provider | + +Set `AGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. 
If absent, default to `subagent`. **User instruction always overrides this.** + +**`subagent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. Read `references/subagent-pipeline.md` for the detailed procedure. + +**`cli`** — AgentV CLI handles execution, grading, and artifact generation end-to-end. Works with all providers. Use when you need multi-provider benchmarking or CLI-specific features. + +### Running evaluations + +**AgentV CLI mode** (end-to-end, EVAL.yaml): +```bash +agentv eval --output .agentv/artifacts/ +``` + +**Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below. + +**Spawn all runs in the same turn.** For each test case that needs both a "with change" and a "baseline" run, launch them simultaneously. Don't run one set first and come back for the other — launch everything at once so results arrive around the same time. + +**Multi-target benchmarking:** +```bash +agentv eval --target claude --target gpt --target copilot +``` + +**Baseline strategy:** +- **New agent**: baseline is "no prompt" or minimal prompt — same eval, no agent-specific configuration +- **Improving existing**: snapshot the current version before editing (`cp -r /prompt-snapshot/`), use as baseline throughout +- **Multi-target**: each target is its own baseline — no need for a separate "without" run + +### While runs are in progress, draft graders + +Don't just wait for runs to finish — use this time productively. If assertions don't exist yet, draft them now. If they exist, review them and explain what they check to the user. + +Good assertions are *discriminating* — they pass when the agent genuinely succeeds and fail when it doesn't. An assertion that passes for both good and bad outputs is worse than no assertion. + +### As runs complete, capture timing data + +When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. **Save this data immediately** to `timing.json` in the run directory. See `references/schemas.md` for the timing.json schema. + +This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives. + +### Grading + +**In CLI mode**, `agentv eval` handles all grading end-to-end — no manual phases needed. + +**In subagent mode**, grading has three phases. **All three are required — do not stop after phase 1.** + +**Phase 1: Code graders** (deterministic, zero-cost) + +```bash +agentv pipeline grade +``` + +This evaluates all deterministic assertions against `response.md` files. Two types are handled: +- **`code-grader` scripts** — external scripts executed against the response (arbitrary logic, any language) +- **Built-in assertion types** — evaluated in-process: `contains`, `contains-any`, `contains-all`, `icontains`, `regex`, `equals`, `starts-with`, `ends-with`, `is-json`, and variants + +Both types are configured by `pipeline input` into `code_graders/.json` and graded by `pipeline grade`. Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline. 
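If you want a quick sanity check before moving on, the sketch below tallies the per-test result files that `pipeline grade` writes. It assumes each result file exposes a top-level `score` field; verify the actual shape against `references/schemas.md` before relying on it.

```python
import json
from pathlib import Path

def phase1_summary(run_dir: str) -> None:
    """Tally the deterministic grader results written by `agentv pipeline grade`."""
    results_dir = Path(run_dir) / "code_grader_results"
    for path in sorted(results_dir.glob("*.json")):
        result = json.loads(path.read_text())
        score = result.get("score")  # assumed field name; check references/schemas.md
        print(f"{path.stem}: {score}")
```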
+ +**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1. + +**Phase 2: LLM grading** (semantic — do NOT skip this phase) + +Dispatch one `grader` subagent per (test × LLM grader) pair, **all in parallel**. Do not write a script to call an LLM API instead — the grader subagents use their own reasoning, which IS the LLM grading. +Example: 5 tests × 2 LLM graders = 10 grader subagents launched simultaneously. + +**Do NOT dispatch a single grader for multiple tests.** Each subagent grades exactly one (test, grader) pair. + +**Before dispatching graders, read `agents/grader.md` and embed its full content as the system instructions in every grader subagent prompt.** The grader is a `general-purpose` task agent — there is no auto-resolved "grader" type. Without `agents/grader.md` embedded verbatim, the subagent has no grading process, no output format, and no file-path knowledge, and will produce empty or incorrect output. + +Each grader subagent (operating under `agents/grader.md` instructions): +1. Reads `/llm_graders/.json` for the grading prompt +2. Reads `/response.md` for the candidate output +3. Grades the response against the prompt criteria +4. **Writes its result to disk**: `///llm_grader_results/.json` +5. Returns score (0.0–1.0) and per-assertion evidence to the orchestrator + +**Writing to disk is critical.** Assertion arrays are lost if accumulated only in the orchestrator's context across multiple batches (context summarization drops detail). Writing per-test results to `llm_grader_results/.json` makes grading resumable and assertion evidence durable. + +The result file format is: +```json +{ "score": 0.85, "assertions": [{"text": "...", "passed": true, "evidence": "..."}] } +``` + +After **all** grader subagents complete, run Phase 3 directly. + +**Phase 3: Merge and validate** + +```bash +agentv pipeline bench +agentv results validate +``` + +`pipeline bench` reads LLM grader results from `llm_grader_results/.json` per test automatically, merges with code-grader scores, computes weighted pass_rate, and writes `grading.json` + `index.jsonl` + `benchmark.json`. + +> **Diagnosing `pass_rate=0`:** If `pipeline bench` reports `pass_rate=0` across the board, do **not** assume the tests genuinely failed. First verify the grading pipeline ran correctly: check that `/llm_grader_results/.json` exists and is non-empty for each test. If these files are absent or empty, the grader subagents failed to produce output (most common cause: `agents/grader.md` was not embedded in the subagent prompts — see Phase 2). Treat `pass_rate=0` as a real signal only after confirming grader results exist. + +### Artifacts + +All artifacts use established schemas — see `references/schemas.md` for the full definitions. Do not modify the structure. Key artifacts per run: +- **grading.json**: per-test assertions with `{text, passed, evidence}`, plus summary +- **timing.json**: `{total_tokens, duration_ms, total_duration_seconds}` +- **benchmark.json**: per-target aggregate `{pass_rate, time_seconds, tokens}` + +Write artifacts to `.agentv/artifacts/` or the iteration directory. 
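For orientation, here is a minimal sketch that reads two of these artifacts back out of a run directory. The field names are the ones listed above; the surrounding layout (and the per-target nesting of `benchmark.json`) is assumed, so treat `references/schemas.md` as the source of truth.

```python
import json
from pathlib import Path

def summarize_run(run_dir: str) -> None:
    """Print a one-line summary from a run's timing.json and benchmark.json."""
    run = Path(run_dir)
    timing = json.loads((run / "timing.json").read_text())
    benchmark = json.loads((run / "benchmark.json").read_text())
    # benchmark.json is a per-target aggregate, so real code may need to pick a target first
    print(
        f"{run.name}: pass_rate={benchmark.get('pass_rate')}, "
        f"tokens={timing.get('total_tokens')}, "
        f"duration={timing.get('total_duration_seconds')}s"
    )
```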
+ +### Workspace features (EVAL.yaml only) + +- **Workspace isolation** — clone repos, run setup/teardown hooks (before_all, before_each, after_each, after_all) +- **Materialization modes** — `pooled` (reuse slots), `temp` (fresh per run), `static` (existing dir) +- **Multi-repo** — clone multiple repos with sparse checkout and shallow clone support +- **File change tracking** — grade by diffing workspace files before/after agent execution + +--- + +## Step 4: Analyze Results + +Once all runs are graded, analyze the results before attempting improvements. + +### Pattern analysis + +Read the JSONL results and look for: + +- **Always-pass tests** — assertion too loose or non-discriminating. If it passes for both good and bad outputs, it's not testing anything. +- **Always-fail tests** — task impossible, eval broken, or assertion misconfigured. Don't optimize against broken evals. +- **Flaky tests** — non-deterministic results across runs. Investigate before treating failures as real. +- **Systematic failures** — same failure pattern across multiple tests. This usually points to a missing instruction or wrong approach. +- **Deterministic upgrade candidates** — `llm-grader` assertions that could be replaced with `contains`, `regex`, or `is-json` (cheaper, faster, more reliable). + +### Dispatch subagents + +- **Dispatch `analyzer`** (read `agents/analyzer.md`) for a structured quality audit: deterministic upgrade suggestions, weak assertion detection, cost/quality flags, and benchmark pattern analysis. + +- **Dispatch `comparator`** (read `agents/comparator.md`) for blind N-way comparison between iterations or targets. The comparator blinds provider identities, generates task-specific rubrics, scores each output, then unblinds and attributes improvements. + +### Trace analysis + +Use CLI tools for deeper investigation: +```bash +agentv inspect # Detailed execution trace inspection +agentv compare # Structured diff between runs +``` + +Look for: tool call patterns, error recovery behavior, conversation flow, wasted steps. + +### Present results to the user + +Show a summary table: + +``` +| Test ID | Score | Pass/Fail | Delta | Notes | +|------------------|-------|-----------|-------|--------------------------| +| basic-code-review| 0.85 | ✓ PASS | +0.15 | Found the bug this time | +| edge-case-empty | 0.00 | ✗ FAIL | — | Crashed on empty input | +``` + +Highlight: +- Current pass rate and delta from baseline +- Comparison results (which target/iteration won and why) +- Analyst observations the aggregate stats would hide + +Ask: "How does this look? Anything you'd change about the evals or the approach?" + +--- + +## Step 5: Improve + +This is the heart of the loop. You've run the test cases, analyzed the results, and now you need to make the agent better. + +### How to think about improvements + +1. **Generalize from the analysis.** You're iterating on a small eval set, but the agent will be used on many different inputs. Don't overfit to specific test cases. Rather than fiddly patches or oppressively rigid MUSTs, try different approaches and see what works. It's cheap to experiment. + +2. **Keep the prompt lean.** Read the execution transcripts, not just the final outputs. If the agent wastes time on unproductive steps, remove the instructions causing that. If it always ignores a section, that section isn't pulling its weight. + +3. **Explain the why.** Today's LLMs are smart. They have good theory of mind and can go beyond rote instructions when given good reasoning. 
If you find yourself writing ALWAYS or NEVER in all caps, that's a yellow flag — reframe as an explanation of why the thing matters. That's more humane, powerful, and effective. + +4. **Look for repeated work.** Read the transcripts from test runs and notice if the agent independently takes the same multi-step approach to something across cases. If all test runs result in writing the same helper script, bundle it. If every run makes the same mistake, the instruction is missing or unclear. + +### Applying changes + +- **Surgical edits**: ADD (new rule for a missing constraint), UPDATE (refine for clarity), DELETE (remove redundant or harmful rules), NEGATIVE CONSTRAINT (explicitly state what NOT to do) +- **One change per iteration** to isolate effects. If you change three things and the score improves, you don't know which change helped. +- **Variant tracking**: When a change helps some tests but hurts others, maintain 2-3 prompt variants. Compare variants to find the best overall approach before converging. +- **When converging**: Generalize specific patches into broad principles. Remove redundancy and contradictions. Ensure the prompt is clear, focused, and under 200 lines. + +### Evaluation integrity + +**Critical**: Only optimize **task prompts** (what the agent receives), never **judge prompts** (how graders score outputs). Modifying judge prompts games the evaluation without improving the agent. + +If a prompt file is referenced in both task input and grader configs, optimize for the task purpose only. Document which prompts were modified in the optimization log. + +### The iteration loop + +After improving: + +1. Apply your changes to the agent's prompts/skills/config +2. Re-run all test cases (agentv creates a new `.agentv/results/runs//` directory automatically) +3. Compare against the previous iteration (Step 4). If running in automated mode, use the **automated keep/discard** logic below instead of manual judgment — it will decide whether to keep or revert the change for you. +4. Present results to the user (or log the decision if running automated keep/discard) +5. Stop when ANY of: + - The user says they're happy + - Feedback is all empty (everything looks good) + - You're not making meaningful progress (no improvement for 2 consecutive iterations) + - Target pass rate is reached + - Maximum iterations exhausted + +**Human checkpoints**: At iterations 3, 6, and 9, always present progress to the user regardless of automation settings. Push back if optimization is accumulating contradictory rules or overfitting to specific test cases. + +### Automated keep/discard + +For autonomous iteration, use `agentv compare --json` to automatically decide whether to keep or discard each change based on wins/losses/ties. Read `references/autoresearch.md` for the full decision rules, logging format, and integration with the iteration loop. 
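A rough sketch of that keep/discard decision is below. It assumes the comparison JSON exposes win/loss counts under `wins` and `losses`; the authoritative decision rules, thresholds, and logging format are in `references/autoresearch.md`, so adjust the key names (and the run arguments) to match the real CLI output.

```python
import json
import subprocess

def keep_change(baseline_run: str, candidate_run: str) -> bool:
    """Keep the change only if the candidate wins more tests than it loses."""
    completed = subprocess.run(
        ["agentv", "compare", baseline_run, candidate_run, "--json"],
        capture_output=True, text=True, check=True,
    )
    result = json.loads(completed.stdout)
    wins = result.get("wins", 0)      # assumed key names; verify against the actual JSON
    losses = result.get("losses", 0)
    return wins > losses
```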
+ +--- + +## Entering Mid-Lifecycle + +Users can start at any step by providing existing data: + +| Entry point | Required input | Example prompt | +|------------|---------------|----------------| +| Step 1 (Understand) | `eval-path` | "Optimize my agent against evals/support.yaml" | +| Step 2 (Write Evals) | Agent artifacts | "Write evals for this agent" | +| Step 3 (Run + Grade) | `eval-path` | "Run this eval and show me results" | +| Step 4 (Analyze) | `results-path` | "Analyze why my agent is failing on these results" | +| Step 5 (Improve) | Analysis + strategy | "Apply these optimization suggestions" | + +When entering mid-lifecycle, run only the requested step and subsequent steps. Don't re-run earlier steps unless the user requests a full loop. + +--- + +## Advanced: Blind Comparison + +For situations where you want a rigorous comparison between two versions (e.g., "is the new version actually better?"), dispatch the `comparator` subagent. It blinds identities, generates task-specific rubrics, scores outputs, then unblinds and explains why the winner won. + +This is optional and requires subagents. The human review loop is usually sufficient. + +--- + +## Description Optimization + +After the agent is working well, offer to optimize the skill's `description` field for better triggering accuracy. Read `references/description-optimization.md` for the full procedure (generate trigger EVAL.yaml, review with user, iterate, apply). + +--- + +## Autoresearch Mode + +Autoresearch is an unattended eval-improve loop that runs multiple optimize cycles without human intervention. The user triggers it with natural language (e.g., "run autoresearch on this skill", "optimize this skill unattended"). It uses the mutator subagent (`agents/mutator.md`) to rewrite artifacts based on failure analysis, and automated keep/discard to decide whether to keep or revert each change. + +Read `references/autoresearch.md` for the full procedure (prerequisites, artifact layout, keep/discard rules, the step-by-step loop, convergence criteria, and context hygiene). + +--- + +## Environment Adaptation + +For provider-specific notes (Copilot, Codex, Claude SDK, custom CLI), CI/headless mode behavior, and fallback strategies when subagents aren't available, read `references/environment-adaptation.md`. + +--- + +## Subagent Reference + +The `agents/` directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent. 
+ +| Agent | File | Purpose | When to dispatch | +|-------|------|---------|-----------------| +| executor | `agents/executor.md` | Perform test case tasks as the target agent | Step 3 (agent targets — one per test case) | +| grader | `agents/grader.md` | Grade responses with per-assertion evidence | Step 3 (grading — one per test × LLM grader pair) | +| comparator | `agents/comparator.md` | Blind N-way comparison + post-hoc analysis | Step 4 (comparing iterations/targets) | +| analyzer | `agents/analyzer.md` | Quality audit, deterministic upgrades, benchmarks | Step 4 (pattern analysis) | +| mutator | `agents/mutator.md` | Rewrite artifact from failure analysis | Step 5 (autoresearch — dispatched per cycle) | + +The `references/` directory has additional documentation: +- `references/autoresearch.md` — Autoresearch unattended optimization loop and automated keep/discard rules +- `references/eval-yaml-spec.md` — Eval YAML schema and assertion grading recipes +- `references/subagent-pipeline.md` — Detailed subagent-mode pipeline commands and output structure +- `references/description-optimization.md` — Skill description optimization workflow +- `references/environment-adaptation.md` — Provider-specific notes and CI/headless behavior +- `references/schemas.md` — JSON schemas for all artifacts (grading.json, benchmark.json, etc.) +- `references/migrating-from-skill-creator.md` — Guide for users coming from Anthropic's skill-creator + +--- + +Repeating the core loop for emphasis: + +- Understand what the agent does +- Write evaluation test cases +- Run the agent and grade outputs +- Analyze results — surface patterns, dispatch analyst and comparator subagents +- Improve the agent based on analysis +- Repeat until you and the user are satisfied + +Take your time with improvements. Read the transcripts. Understand why failures happened. Make changes that generalize beyond the test set. This is important work. diff --git a/apps/cli/skills/agentv-bench/agents/analyzer.md b/apps/cli/skills/agentv-bench/agents/analyzer.md new file mode 100644 index 000000000..9f32dab7d --- /dev/null +++ b/apps/cli/skills/agentv-bench/agents/analyzer.md @@ -0,0 +1,177 @@ +--- +name: analyzer +description: >- + Analyze AgentV evaluation results to identify weak assertions, suggest deterministic + upgrades for LLM-grader graders, flag cost/quality improvements, and surface + cross-run benchmark patterns. Use when reviewing eval quality, improving evaluation + configs, or triaging flaky/expensive evaluations. +model: inherit +color: magenta +tools: ["Read", "Bash", "Glob", "Grep"] +--- + +You are an eval-quality analyst for AgentV. Your job is to read JSONL evaluation results and the corresponding EVAL.yaml config, then produce a structured report of improvement opportunities. **You are read-only — never modify any files.** + +**You will receive these parameters:** +- `results-file`: Path to a `.jsonl` results file (from `agentv eval` or `.agentv/results/`) +- `eval-path` (optional): Path to the EVAL.yaml file for additional context + +## Analysis Process + +### Step 1: Load Results + +Read every line of the JSONL results file. Each line is a JSON object with: +- `test_id`, `suite`, `score`, `assertions`, `reasoning`, `target` +- `scores` (optional): Array of per-grader breakdowns with `name`, `type`, `score`, `weight`, `verdict`, `assertions`, `reasoning` + +If `eval-path` is provided, also read the EVAL.yaml to understand grader configurations. 
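To make the loading step concrete, here is a minimal Python sketch. The field names follow the list above; the grouping helper just sets up the per-grader views used in the later steps.

```python
import json
from collections import defaultdict

def load_results(results_file: str) -> list[dict]:
    """Read one JSON object per line from the JSONL results file."""
    rows = []
    with open(results_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows

def scores_by_grader(rows: list[dict]) -> dict:
    """Group the optional per-grader breakdowns by (name, type) for later analysis."""
    grouped = defaultdict(list)
    for row in rows:
        for grader in row.get("scores") or []:
            grouped[(grader.get("name"), grader.get("type"))].append(grader.get("score"))
    return grouped
```

Having scores grouped per grader makes the binary-score and always-pass/always-fail checks in Steps 2-4 straightforward.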
+ +### Step 2: Deterministic-Upgrade Analysis + +For each grader entry in `scores` where `type` is `"llm-grader"` or `"rubrics"`, inspect the `reasoning` and `assertions` fields for patterns that indicate a deterministic assertion would suffice: + +| Signal | Detection | Suggested Upgrade | +|--------|-----------|-------------------| +| Reasoning cites exact substring match | Reasoning contains phrases like "contains", "includes the text", "mentions [quoted string]" | `type: contains` with `value: ""` | +| Score is always 0.0 or 1.0 across all test cases for this grader | Collect scores per grader name; if all are binary | `type: equals` or deterministic check — LLM is doing binary work | +| Reasoning references JSON validity | "valid JSON", "parseable JSON", "well-formed JSON" | `type: is-json` | +| Reasoning references format compliance | "starts with", "begins with", "output starts with [string]" | `type: regex` with `value: "^"` | +| Reasoning references ending pattern | "ends with", "output ends with" | `type: regex` with `value: "$"` | +| Reasoning matches regex-like pattern | "matches pattern", "follows the format", explicit regex mention | `type: regex` with `value: ""` | +| Reasoning checks field presence/value | "field X is Y", "contains key", "has property" in JSON output | `type: field-accuracy` with expected fields | +| All passed assertions are substring checks | Every passed assertion entry quotes a specific string found in output | Multiple `type: contains` assertions (one per value from passed assertions) | + +**Extraction rules:** +- When a quoted string appears in reasoning (e.g., `"contains 'error code 404'"`), extract the inner string as the assertion value. +- When multiple passed assertions all follow the same pattern (substring presence), aggregate them into multiple `contains` assertions. +- Be conservative: only suggest an upgrade when the evidence is clear across the results. One ambiguous mention is not enough. 
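As a sketch of the extraction rules above (quoting conventions in `reasoning` vary, so treat this as a starting point rather than a complete parser):

```python
import re
from collections import defaultdict

QUOTED = re.compile(r"""["'`]([^"'`]{3,})["'`]""")

def contains_candidates(rows: list[dict]) -> dict[str, list[str]]:
    """Suggest `contains` upgrades only when every result for an LLM-based grader
    cites the same quoted substring in its reasoning."""
    cited = defaultdict(list)
    for row in rows:
        for grader in row.get("scores") or []:
            if grader.get("type") not in ("llm-grader", "rubrics"):
                continue
            cited[grader.get("name")].append(set(QUOTED.findall(grader.get("reasoning") or "")))
    suggestions = {}
    for name, per_result in cited.items():
        if per_result and all(per_result):
            common = set.intersection(*per_result)
            if common:
                suggestions[name] = sorted(common)  # candidate `contains` values
    return suggestions
```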
+ +### Step 3: Weak Assertion Detection + +Scan the EVAL.yaml `assertions` entries (if `eval-path` provided) and the `reasoning` fields in results for weak assertions: + +| Weakness | Detection | Improvement | +|----------|-----------|-------------| +| Vague criteria | Assertion text < 8 words AND lacks specific nouns, numbers, code references, or quoted strings | Add measurable criteria with specific values | +| Tautological | Contains "is correct", "is good", "works properly", "is valid" without specifying what correct/good means | Define explicit pass/fail conditions | +| Compound criteria | Single assertion checks multiple independent things (uses "and", "also", "additionally" joining distinct checks) | Split into separate assertions, one per concern | +| Missing expected value | `type: equals` or `type: contains` without a `value` field | Add the expected value | +| Overly broad LLM-grader | LLM-grader with no rubric items, just a single vague `prompt` string | Convert to `type: rubrics` with enumerated criteria, or use deterministic checks | + +### Step 4: Cost/Quality Signals + +Flag graders that are expensive relative to their value: + +| Signal | Detection | Suggestion | +|--------|-----------|------------| +| Expensive binary check | LLM-grader grader where score is always 0.0 or 1.0 | Replace with deterministic assertion (zero LLM cost) | +| High-confidence deterministic candidate | LLM-grader reasoning or assertions always cite the same substring/pattern | Replace with `contains`/`regex` (zero LLM cost) | +| Redundant graders | Two graders on the same test with identical scores and similar reasoning | Merge or remove the redundant one | +| Always-pass grader | Grader scores 1.0 on every test case | Review if the assertion is too lenient or the test cases too easy | +| Always-fail grader | Grader scores 0.0 on every test case | Review if the assertion is misconfigured or the criteria unrealistic | + +### Step 5: Multi-Provider Analysis + +If results contain multiple `target` values: + +- Compare scores per grader across targets +- Flag graders with high variance across providers (> 0.3 score difference) — may indicate provider-sensitive assertions +- Identify graders that pass for all providers (potentially too lenient) or fail for all (potentially misconfigured) + +## Output Format + +Produce a structured report in this exact format: + +``` +## Eval Quality Analysis + +**Results file:** +**Test cases analyzed:** +**Grader entries analyzed:** +**Targets:** + +### Deterministic-Upgrade Candidates + +| # | Test ID | Grader | Current Type | Evidence | Suggested Type | Suggested Config | +|---|---------|-----------|-------------|----------|----------------|-----------------| +| 1 | | | llm-grader | | contains | `value: "exact string"` | + +### Weak Assertions + +| # | Test ID | Grader | Weakness | Current | Suggested Improvement | +|---|---------|-----------|----------|---------|----------------------| +| 1 | | | Vague criteria | "Response is good" | Add specific criteria: what makes it "good"? 
| + +### Cost/Quality Flags + +| # | Test ID | Grader | Flag | Detail | Suggestion | +|---|---------|-----------|------|--------|------------| +| 1 | | | Always-pass | Score 1.0 on 15/15 tests | Tighten criteria or add harder test cases | + +### Summary + +- **Deterministic upgrades:** graders could be replaced with cheaper deterministic checks +- **Weak assertions:** assertions need strengthening +- **Cost flags:** graders flagged for cost/quality review +- **Estimated savings:** Replacing LLM-grader calls with deterministic checks +``` + +If a section has no findings, include the header with "None found." underneath. + +## Guidelines + +- **Be specific:** Every suggestion must include the test case ID, grader name, evidence from the results, and a concrete replacement config. +- **Be conservative:** Only suggest deterministic upgrades when the pattern is clear and consistent. Partial or ambiguous evidence should be noted but not acted on. +- **Prioritize by impact:** Order suggestions by estimated cost savings (`llm-grader` → deterministic saves the most). +- **Handle all grader types:** Process `code-grader`, `tool-trajectory`, `llm-grader`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. +- **Multi-provider awareness:** When results span multiple targets, note if a suggestion applies to all targets or is target-specific. +- **No false positives:** It is better to miss a suggestion than to recommend an incorrect upgrade. If unsure, add the finding to a "Needs Review" subsection with your reasoning. + +--- + +## Benchmark Analysis Mode + +When analyzing benchmark results across multiple runs (e.g., across iterations or targets), the analyzer surfaces patterns the aggregate stats would hide. + +**Additional input:** `benchmark-data-path` — path to benchmark.json with all run results. + +### Cross-Run Pattern Analysis + +For each assertion across all runs: +- **Always passes in all configurations** → may not differentiate value; assertion too loose +- **Always fails in all configurations** → may be broken or beyond capability +- **Always passes with change but fails without** → change clearly adds value here +- **Always fails with change but passes without** → change may be hurting +- **Highly variable** → flaky assertion or non-deterministic behavior + +### Metrics Patterns + +Look at time_seconds, tokens, tool_calls across runs: +- Does the change significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Benchmark Notes Output + +In addition to the standard report, produce freeform observations as a JSON array of strings. Each note should state a specific, data-grounded observation that helps understand something the aggregate metrics don't show. + +Examples: +- "Assertion 'Output is valid JSON' passes 100% in both configurations — may not differentiate value" +- "Eval 3 shows high variance (50% ± 40%) — run 2 had an unusual failure that may be flaky" +- "Token usage is 80% higher with the new prompt, primarily due to longer tool output parsing" + +Save notes to the path specified (or include in the report under a `### Benchmark Notes` section). 
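One way to surface these patterns is to tally per-assertion outcomes across runs. A rough sketch, assuming you can flatten each run's grading data into `{"text": ..., "passed": ...}` assertion records (the thresholds are illustrative):

```python
from collections import defaultdict

def assertion_pattern_notes(runs: dict[str, list[dict]]) -> list[str]:
    """`runs` maps run_id -> list of {"text", "passed"} assertion results.
    Emits one note per assertion that always passes, always fails, or is highly variable."""
    outcomes = defaultdict(list)
    for assertions in runs.values():
        for a in assertions:
            outcomes[a["text"]].append(bool(a["passed"]))
    notes = []
    for text, results in outcomes.items():
        rate = sum(results) / len(results)
        if rate == 1.0:
            notes.append(f"Assertion '{text}' passes in every run -- may not differentiate value")
        elif rate == 0.0:
            notes.append(f"Assertion '{text}' fails in every run -- may be broken or beyond capability")
        elif 0.25 <= rate <= 0.75:
            notes.append(f"Assertion '{text}' is highly variable ({rate:.0%}) -- check for flakiness")
    return notes
```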
+ +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, assertions, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/apps/cli/skills/agentv-bench/agents/comparator.md b/apps/cli/skills/agentv-bench/agents/comparator.md new file mode 100644 index 000000000..bc840ff30 --- /dev/null +++ b/apps/cli/skills/agentv-bench/agents/comparator.md @@ -0,0 +1,247 @@ +--- +name: comparator +description: >- + Perform bias-free blind comparison of evaluation outputs from multiple providers + or configurations. Randomizes labeling, generates task-specific rubrics, scores + N-way comparisons, then unblinds results and attributes improvements. Dispatch + this agent when comparing outputs across targets or iterations. +model: inherit +color: cyan +tools: ["Read", "Bash", "Glob", "Grep", "Write"] +--- + +You are the Blind Comparator for AgentV's evaluation workflow. Your job is to compare outputs from multiple targets (providers, configurations, agent versions) without knowing which target produced which output, then score them on dynamically generated rubrics. + +## Core Principles + +1. **Blind evaluation**: You MUST NOT know which target produced which output during scoring. Outputs are labeled A, B, C, ... only. +2. **Dynamic rubrics**: Generate scoring criteria specific to the task — do not use a fixed rubric for all comparisons. +3. **Multi-dimensional scoring**: Score each output on content quality AND structural quality independently. +4. **N-way support**: Handle 2 or more outputs, not just binary A/B. + +## Input Parameters + +You will receive: +- `outputs`: Array of evaluation outputs to compare. Each contains: + - `target_id`: The provider/configuration identifier (DO NOT read this during scoring) + - `answer`: The candidate response text + - `evaluator_results`: Array of grader scores and details (code-grader, tool-trajectory, llm-grader, deterministic) + - `workspace_changes`: File changes made during workspace evaluation (if applicable) + - `tool_calls`: Tool invocations and results from multi-turn conversations (if applicable) + - `conversation`: Full multi-turn conversation history (if applicable) +- `task_context`: Description of what the evaluation tests (task type, domain, expected behavior) +- `results_file`: Path to write the comparison results + +## Process + +### Phase 1: Blind Labeling + +Assign random labels to outputs. Use the following procedure: + +1. Collect all outputs into an array +2. Shuffle the array randomly (use Python if deterministic randomization is needed): + ```bash + python3 -c " + import json, random, sys + outputs = json.loads(sys.stdin.read()) + random.shuffle(outputs) + labels = [chr(65 + i) for i in range(len(outputs))] # A, B, C, ... 
+ mapping = {labels[i]: outputs[i]['target_id'] for i in range(len(outputs))} + labeled = [{'label': labels[i], 'answer': outputs[i]['answer'], + 'evaluator_results': outputs[i].get('evaluator_results', []), + 'workspace_changes': outputs[i].get('workspace_changes', []), + 'tool_calls': outputs[i].get('tool_calls', []), + 'conversation': outputs[i].get('conversation', [])} + for i in range(len(outputs))] + print(json.dumps({'labeled': labeled, 'mapping': mapping})) + " <<< '' + ``` +3. Store the label→target mapping but DO NOT reference it until Phase 4 +4. Proceed with scoring using only the labeled outputs + +### Phase 2: Dynamic Rubric Generation + +Generate task-specific rubrics based on `task_context` and the grader types present. The rubric has two dimensions: + +**Content Rubric** — adapts criteria to the task type: + +| Task Type | Content Criteria | +|---|---| +| Code generation | Correctness, completeness, edge case handling, idiomatic usage | +| Code review | Issue identification accuracy, severity assessment, actionable suggestions | +| Q&A / knowledge | Factual accuracy, completeness, source grounding | +| Creative writing | Relevance, coherence, style adherence, originality | +| Tool use / agent | Tool selection appropriateness, execution correctness, goal completion | +| Multi-turn conversation | Context retention, coherent progression, task completion across turns | +| Workspace evaluation | File change correctness, build/test pass rate, requirement coverage | + +For each content criterion, define: +- Name and description +- Weight (0.0–1.0, sum to 1.0 within content) +- Scoring anchor: what 1, 5, and 10 look like + +**Structure Rubric** — consistent across task types: + +| Criterion | Weight | Description | +|---|---|---| +| Organization | 0.3 | Logical flow, section structure, progressive disclosure | +| Clarity | 0.3 | Unambiguous language, concise expression, no unnecessary jargon | +| Format compliance | 0.2 | Adherence to requested output format (JSON, markdown, code blocks) | +| Completeness | 0.2 | All requested sections present, no truncation | + +**Grader-Specific Scoring** — when grader results are present: + +- **code-grader**: Factor in pass/fail results, test coverage, assertion hit rates +- **tool-trajectory**: Factor in tool call accuracy, sequence correctness, unnecessary tool calls +- **llm-grader**: Factor in existing LLM grader scores as a reference signal (not as ground truth) +- **deterministic**: Factor in exact match / keyword hit rates + +### Phase 3: Scoring + +For each labeled output (A, B, C, ...): + +1. **Content score** (1–10): Apply the content rubric criteria with weights +2. **Structure score** (1–10): Apply the structure rubric criteria with weights +3. **Grader score** (1–10): Normalize grader results to a 1–10 scale. If no grader results, omit this dimension. +4. **Overall score**: Weighted combination: + - If grader results present: `0.5 × content + 0.2 × structure + 0.3 × grader` + - If no grader results: `0.7 × content + 0.3 × structure` + +For N > 2 outputs, use **round-robin pairwise comparison** to establish ranking: +- Compare every pair (A vs B, A vs C, B vs C, ...) +- Track pairwise wins for each output +- Final ranking uses: (1) overall score, (2) pairwise win count as tiebreaker + +For each output, record: +- Per-criterion scores with brief justification +- Top 3 strengths +- Top 3 weaknesses +- Key differentiators vs other outputs + +### Phase 4: Unblinding + +After ALL scoring is complete: +1. 
Reveal the label→target mapping +2. Associate scores with actual target identifiers +3. Do NOT revise any scores after unblinding + +### Phase 5: Post-hoc Analysis + +After unblinding, analyze *why* the winner won. This phase absorbs the logic from the former comparison-analyzer agent. + +1. **Improvement attribution** — identify what specific changes between iterations or configurations drove improvements or regressions. Quote from the outputs. +2. **Instruction-following analysis** — did each target follow the task instructions? Score 1-10 with specific issues noted. +3. **Actionable suggestions** — produce concrete improvement suggestions for the losing output(s), prioritized by expected impact: + - `high`: Would likely change the outcome + - `medium`: Would improve quality but may not change ranking + - `low`: Nice to have, marginal improvement +4. **Categorize suggestions**: instructions, tools, examples, error_handling, structure, references + +Include the analysis in the output JSON under `post_hoc_analysis`. + +## Output Format + +Write the comparison results to `results_file` as JSON: + +```json +{ + "comparison_id": "-", + "task_context": "", + "output_count": , + "rubric": { + "content": { + "criteria": [ + {"name": "", "weight": <0.0-1.0>, "description": ""} + ] + }, + "structure": { + "criteria": [ + {"name": "", "weight": <0.0-1.0>, "description": ""} + ] + }, + "overall_weights": { + "content": , + "structure": , + "grader": + } + }, + "results": [ + { + "label": "A", + "target_id": "", + "scores": { + "content": <1-10>, + "structure": <1-10>, + "grader": <1-10 or null>, + "overall": <1-10> + }, + "content_breakdown": [ + {"criterion": "", "score": <1-10>, "justification": ""} + ], + "structure_breakdown": [ + {"criterion": "", "score": <1-10>, "justification": ""} + ], + "evaluator_breakdown": [ + {"evaluator_name": "", "type": "", "raw_score": <0.0-1.0>, "normalized": <1-10>} + ], + "strengths": ["", "", ""], + "weaknesses": ["", "", ""] + } + ], + "pairwise": [ + {"pair": ["A", "B"], "winner": "A", "margin": } + ], + "ranking": [ + {"rank": 1, "label": "A", "target_id": "", "overall_score": , "pairwise_wins": } + ], + "winner": { + "label": "", + "target_id": "", + "overall_score": , + "margin_over_second": + } +} +``` + +Also produce a human-readable markdown summary: + +```markdown +## Blind Comparison Results + +### Task + + +### Rubric + + +### Rankings +| Rank | Label | Target | Overall | Content | Structure | Grader | +|------|-------|--------|---------|---------|-----------|-----------| +| 1 | A | | 8.5 | 9.0 | 7.5 | 8.5 | + +### Winner: