From c84273562746187ebb76bef35ad56358cb220b06 Mon Sep 17 00:00:00 2001 From: oshrizak <63424207+oshrizak@users.noreply.github.com> Date: Tue, 26 May 2026 09:49:13 -0700 Subject: [PATCH 1/2] feat(feedback): file proposals as issues, fix verify prompt, content-aware regression gate --- agents/feedback.md | 53 ++++++++++++++------------ src/github/issue.ts | 82 +++++++++++++++++++++++++++++++++++----- src/pipeline/feedback.ts | 71 ++++++++++++++++++++++++++++++++-- 3 files changed, 168 insertions(+), 38 deletions(-) diff --git a/agents/feedback.md b/agents/feedback.md index 6ea4e68..57a59f9 100644 --- a/agents/feedback.md +++ b/agents/feedback.md @@ -1,20 +1,22 @@ # Feedback Agent ## Purpose -The Feedback Agent improves the content-agent library from real signals. It does -two jobs: - -- **VERIFY** — judge whether a freshly built agent's HTML output faithfully and - accessibly captures the content of its source image. This is the build-time - source-fidelity check on new (session-built) agents (PRD §7.5 / §7.12). -- **TRAIN** — propose an improved version of an agent's prompt so it stops - repeating a mistake. Driven either by a user-feedback correction (a block before - and after an edit) or by the problems found during VERIFY (PRD §7.12 / §7.13). - -The agent that produced a block may be an existing library agent (a TRAIN result -is proposed as an update PR on close) or a session-built agent (the improvement is -trained into it in place, so its new-agent PR carries the fix). The goal is that -agents learn from real signals rather than repeating the same mistake. +The Feedback Agent helps Iris's agents learn from real signals instead of +repeating the same mistake. It does two jobs: + +- **VERIFY** — judge whether an agent's HTML output faithfully and accessibly + captures its source image. Used at build time to check each page the page agent + produces, and reused by the regression gate before any agent change ships + (PRD §7.5 / §7.12). +- **TRAIN** — propose an improved version of an agent's prompt so it avoids a + recurring issue, driven either by a user-feedback correction or by the problems + found during VERIFY (PRD §7.12 / §7.13). + +A proposed improvement to a library agent (e.g. the page agent) is gated on that +agent's regression fixtures and filed as a GitHub issue for a maintainer to +review; a session-built agent is trained in place so its contribution carries the +fix. The goal is that agents improve from real signals rather than repeating the +same mistake. ## Required capability vision, text @@ -27,19 +29,20 @@ You are the Feedback Agent. The user message begins with `TASK: verify` or `TASK: train`. Do ONLY that task and return ONLY its JSON (no code fences). TASK: verify -You are given a content agent's purpose/contract, the HTML it produced for one -source image, and the source image itself. Decide whether that HTML faithfully -captures the content this agent is responsible for — right text, right structure, -nothing missed, nothing invented — AND is accessible. Judge ONLY this agent's -content type; ignore content that other agents handle. List concrete, actionable -problems (empty when there are none). Respond with ONLY: +You are given an agent's purpose/contract, the HTML it produced for one source +image, and the source image itself. Decide whether that HTML faithfully captures +everything this agent is responsible for in the image — right text, right +structure, nothing missed, nothing invented — AND is accessible (WCAG 2.2 AA). +Respect the agent's declared scope: a whole-page agent is responsible for the +ENTIRE page; a specialist agent is responsible for its content type. Check every +part the agent is responsible for. List concrete, actionable problems (empty when +there are none). Respond with ONLY: { "faithful": true|false, "accessible": true|false, "problems": ["..."] } TASK: train -You are given an agent's full markdown and either a user-feedback correction (a -block before and after an edit) or a list of verification problems. Propose an -improved version of the agent's markdown so it would avoid the issue on similar -inputs. You MUST: +You are given an agent's full markdown and either a user-feedback correction or a +list of verification problems. Propose an improved version of the agent's markdown +so it would avoid the issue on similar inputs. You MUST: - Generalize the lesson into an instruction; do NOT hard-code this document's specific text, values, or wording. - Be ADDITIVE and backward-compatible: keep every existing instruction and @@ -48,7 +51,7 @@ inputs. You MUST: - Keep the section structure (`# Agent`, `## Purpose`, `## Required capability`, `## System prompt`, `## Output contract`), the agent's name, and its declared capabilities unchanged. Forbid CSS/styling; preserve the - fragment-log / provenance requirements. + agent's output contract, including any log/provenance fields it already emits. - If there is no sound, generalizable change to THIS agent, make none. Respond with ONLY: { "changed": true|false, diff --git a/src/github/issue.ts b/src/github/issue.ts index bde5e7f..4e10107 100644 --- a/src/github/issue.ts +++ b/src/github/issue.ts @@ -14,18 +14,18 @@ export function parseRepo(url: string): RepoRef { // Label maintainers can sort/filter agent suggestions by. export const AGENT_LABEL = "iris-agent-suggestion"; -async function ensureLabel(octokit: Octokit, repo: RepoRef): Promise { +async function ensureLabel( + octokit: Octokit, + repo: RepoRef, + name: string = AGENT_LABEL, + color = "5319e7", + description = "Agent suggested automatically by Equalify Iris", +): Promise { try { - await octokit.issues.getLabel({ owner: repo.owner, repo: repo.repo, name: AGENT_LABEL }); + await octokit.issues.getLabel({ owner: repo.owner, repo: repo.repo, name }); } catch { try { - await octokit.issues.createLabel({ - owner: repo.owner, - repo: repo.repo, - name: AGENT_LABEL, - color: "5319e7", - description: "Agent suggested automatically by Equalify Iris", - }); + await octokit.issues.createLabel({ owner: repo.owner, repo: repo.repo, name, color, description }); } catch { // label may have been created concurrently, or insufficient perms — the // issue create below still works if the label already exists. @@ -80,3 +80,67 @@ export async function createAgentIssue( }); return res.data.html_url; } + +// Label for feedback-driven improvements to an EXISTING agent (distinct from new +// agent suggestions, so maintainers can triage them separately). +export const AGENT_UPDATE_LABEL = "iris-agent-update"; + +export interface AgentUpdateIssue { + agentName: string; // e.g. "page.md" + agentMarkdown: string; // full proposed updated agent file + summary: string; // one-line description of the change + diffPreview: string; // human-readable diff of the proposed change + sessionId: string; +} + +// File a labeled issue proposing an improvement to an existing agent, produced by +// the feedback loop (and already gated by the agent's regression fixtures). +// Returns the issue URL, or null if an open update issue for this agent already +// exists (dedupe by title). +export async function createAgentUpdateIssue( + token: string, + upstreamUrl: string, + apiBase: string, + args: AgentUpdateIssue, +): Promise { + const octokit = new Octokit({ auth: token, baseUrl: apiBase }); + const repo = parseRepo(upstreamUrl); + const name = args.agentName.replace(/\.md$/, ""); + const title = `Agent update proposal: ${name}`; + + // Dedupe: skip if an open update issue with this title already exists. + try { + const found = await octokit.search.issuesAndPullRequests({ + q: `repo:${repo.owner}/${repo.repo} is:issue is:open label:"${AGENT_UPDATE_LABEL}" "${name}" in:title`, + }); + if (found.data.items.some((i) => i.title === title)) return null; + } catch { + // search unavailable — proceed (a duplicate is acceptable; not worth failing). + } + + await ensureLabel( + octokit, + repo, + AGENT_UPDATE_LABEL, + "0e8a16", + "Agent improvement proposed by Equalify Iris from user feedback", + ); + const body = + `**Agent:** \`agents/${name}.md\`\n` + + `**Proposed change:** ${args.summary}\n` + + `**Session:** ${args.sessionId}\n\n` + + `_Auto-filed by Equalify Iris when user feedback produced a generalizable improvement to this agent. ` + + `Already gated by the agent's regression fixtures._\n\n` + + `## Diff (preview)\n\n` + + "```diff\n" + args.diffPreview + "\n```\n\n" + + `## Proposed full \`agents/${name}.md\`\n\n` + + "```markdown\n" + args.agentMarkdown + "\n```\n"; + const res = await octokit.issues.create({ + owner: repo.owner, + repo: repo.repo, + title, + body, + labels: [AGENT_UPDATE_LABEL], + }); + return res.data.html_url; +} diff --git a/src/pipeline/feedback.ts b/src/pipeline/feedback.ts index 9a7bc86..ae5b903 100644 --- a/src/pipeline/feedback.ts +++ b/src/pipeline/feedback.ts @@ -4,6 +4,8 @@ import { extractJson } from "../util/json.ts"; import { loadAgent, type AgentSpec } from "../agents/loader.ts"; import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts"; import { loadImage, type InputImage, type PipelineContext } from "./context.ts"; +import { flatten } from "./flatten.ts"; +import { createAgentUpdateIssue } from "../github/issue.ts"; import type { FixtureCase } from "./regression.ts"; // Previously imported from github/contributions.ts, which was removed when the @@ -112,12 +114,39 @@ export async function verifyAgentOutput( // --------------------------------------------------------------------------- const MAX_GATE_FIXTURES = 3; +// An updated agent must still reproduce at least this fraction of the words in a +// fixture's accepted output (by screen-reader-flattened text); below it, the +// change is treated as a content regression. +const MIN_CONTENT_COVERAGE = 0.85; +// Skip the coverage check for very short outputs, where one dropped word swings +// the ratio — rely on the model verdict alone there. +const MIN_COVERAGE_WORDS = 8; export interface RegressionResult { passed: boolean; failures: string[]; } +// Fraction of the accepted output's distinct words that still appear in the +// candidate output (screen-reader-flattened, punctuation-insensitive). Returns +// null when the accepted text is too short to judge reliably. +function contentCoverage(acceptedHtml: string, candidateHtml: string): number | null { + const words = (html: string): Set => + new Set( + flatten(html) + .toLowerCase() + .replace(/[^a-z0-9\s]/g, " ") + .split(/\s+/) + .filter((w) => w.length > 1), + ); + const accepted = words(acceptedHtml); + if (accepted.size < MIN_COVERAGE_WORDS) return null; + const candidate = words(candidateHtml); + let hit = 0; + for (const w of accepted) if (candidate.has(w)) hit++; + return hit / accepted.size; +} + // Re-run an agent (given its current/updated content) on a fixture image, used by // the regression gate. Accepts either output shape: { html } (whole-page agents // like page.md) or { no_content, fragments[] } (content agents). @@ -194,6 +223,16 @@ export async function regressionGate( failures.push(`${c.image_file}: updated agent produced no output`); continue; } + // Content-preservation check: the updated agent must still reproduce the + // content it produced when this fixture was accepted (PRD §7.12). Compare the + // screen-reader-flattened text of the new output against the accepted output; + // a large drop means the change regressed a use we already shipped. + const candidateHtml = blocks.map((b) => b.html).join("\n\n"); + const coverage = contentCoverage(c.accepted_html, candidateHtml); + if (coverage !== null && coverage < MIN_CONTENT_COVERAGE) { + failures.push(`${c.image_file}: only ${(coverage * 100).toFixed(0)}% of the accepted content remained`); + continue; + } const verdict = await verifyAgentOutput(ctx, updatedAgent, img, blocks); if (!verdict.ok) failures.push(`${c.image_file}: ${verdict.problems.join("; ") || "failed verification"}`); } @@ -209,10 +248,11 @@ export async function regressionGate( // On a feedback re-run, turn the document-level correction (the prior reviewed // body vs. this run's reviewed body) into an improved version of the agent that -// produced the document — the page agent in the single-pass pipeline. A library -// agent becomes an update PR (via agent-updates.md, opened on close), gated on its -// regression fixtures so the change can't break a use it already handled; a -// session-built agent is trained in place so its new-agent PR carries the fix. +// produced the document — the page agent in the single-pass pipeline. For a +// library agent the proposal is gated on its regression fixtures and then filed +// as a GitHub issue (the contribution model uses issues, not close-time PRs), +// while also being recorded in agent-updates.md; a session-built agent is trained +// in place so its new-agent contribution carries the fix. export async function proposeAgentUpdatesFromFeedback( ctx: PipelineContext, args: { agentFile: string; before: string; after: string; feedback: string }, @@ -291,5 +331,28 @@ export async function proposeAgentUpdatesFromFeedback( writeFileSync(path, JSON.stringify([...merged.values()], null, 2)); ctx.log.event("agent_updates_proposed", { agents: [proposal.agent_name], count: 1 }); + + // Surface the proposal where maintainers act on it: file a GitHub issue (the + // contribution model uses issues, not close-time PRs). Attributed to the + // logged-in user unless a service token override is configured. No-op without a + // token, so local runs still keep the proposal in agent-updates.md. + const token = ctx.cfg.github.issue_token || ctx.githubToken; + if (token) { + try { + const url = await createAgentUpdateIssue(token, ctx.cfg.github.upstream_repo, ctx.cfg.github.api_base_url, { + agentName: proposal.agent_name, + agentMarkdown: proposal.content, + summary: proposal.summary, + diffPreview: proposal.diff_preview, + sessionId: ctx.sessionId, + }); + ctx.log.event("agent_update_issue", { agent: proposal.agent_name, url: url ?? "(duplicate — skipped)" }); + } catch (e) { + ctx.log.event("agent_update_issue_failed", { agent: proposal.agent_name, error: (e as Error).message }); + } + } else { + ctx.log.event("agent_update_issue_skipped", { agent: proposal.agent_name, reason: "no github token" }); + } + return [proposal]; } From 370cc6dd31a8fd668d7b2ef9f12dc776f463524a Mon Sep 17 00:00:00 2001 From: oshrizak <63424207+oshrizak@users.noreply.github.com> Date: Tue, 26 May 2026 09:58:39 -0700 Subject: [PATCH 2/2] test(feedback): unit tests for contentCoverage; export it and ignore flatten role markers --- package.json | 3 ++- src/pipeline/feedback.ts | 9 ++++++--- test/feedback.test.ts | 41 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 test/feedback.test.ts diff --git a/package.json b/package.json index 4f97fa8..9a6018d 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "scripts": { "start": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite src/index.ts", "dev": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite --watch src/index.ts", - "typecheck": "tsc --noEmit" + "typecheck": "tsc --noEmit", + "test": "node --test test/feedback.test.ts" }, "dependencies": { "@aws-sdk/client-bedrock-runtime": "^3.682.0", diff --git a/src/pipeline/feedback.ts b/src/pipeline/feedback.ts index ae5b903..a2c299f 100644 --- a/src/pipeline/feedback.ts +++ b/src/pipeline/feedback.ts @@ -117,7 +117,7 @@ const MAX_GATE_FIXTURES = 3; // An updated agent must still reproduce at least this fraction of the words in a // fixture's accepted output (by screen-reader-flattened text); below it, the // change is treated as a content regression. -const MIN_CONTENT_COVERAGE = 0.85; +export const MIN_CONTENT_COVERAGE = 0.85; // Skip the coverage check for very short outputs, where one dropped word swings // the ratio — rely on the model verdict alone there. const MIN_COVERAGE_WORDS = 8; @@ -129,11 +129,14 @@ export interface RegressionResult { // Fraction of the accepted output's distinct words that still appear in the // candidate output (screen-reader-flattened, punctuation-insensitive). Returns -// null when the accepted text is too short to judge reliably. -function contentCoverage(acceptedHtml: string, candidateHtml: string): number | null { +// null when the accepted text is too short to judge reliably. Structural role +// markers that flatten() injects ([Heading 2], [List item], …) are stripped so a +// structure-only change is not mistaken for dropped content. +export function contentCoverage(acceptedHtml: string, candidateHtml: string): number | null { const words = (html: string): Set => new Set( flatten(html) + .replace(/\[[^\]]*\]/g, " ") .toLowerCase() .replace(/[^a-z0-9\s]/g, " ") .split(/\s+/) diff --git a/test/feedback.test.ts b/test/feedback.test.ts new file mode 100644 index 0000000..70d359b --- /dev/null +++ b/test/feedback.test.ts @@ -0,0 +1,41 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { contentCoverage, MIN_CONTENT_COVERAGE } from "../src/pipeline/feedback.ts"; + +// contentCoverage backs the regression gate's content-preservation check: it +// measures how much of a fixture's accepted output a candidate (re-run with an +// updated agent) still reproduces, by screen-reader-flattened text. + +test("identical content scores full coverage", () => { + const html = "

one two three four five six seven eight nine ten

"; + assert.equal(contentCoverage(html, html), 1); +}); + +test("a superset candidate still scores full coverage", () => { + const accepted = "

one two three four five six seven eight nine ten

"; + const candidate = "

one two three four five six seven eight nine ten eleven twelve

"; + assert.equal(contentCoverage(accepted, candidate), 1); +}); + +test("structure-only change (heading vs paragraph) is not a content drop", () => { + const accepted = "

alpha bravo charlie delta echo foxtrot golf hotel

"; + const candidate = "

alpha, bravo. charlie! delta? echo: foxtrot; golf hotel

"; + const cov = contentCoverage(accepted, candidate); + assert.ok(cov !== null && cov >= MIN_CONTENT_COVERAGE, `expected >= ${MIN_CONTENT_COVERAGE}, got ${cov}`); +}); + +test("dropping half the content falls below the threshold", () => { + const accepted = "

one two three four five six seven eight nine ten

"; + const candidate = "

one two three four five

"; + const cov = contentCoverage(accepted, candidate); + assert.ok(cov !== null && cov < MIN_CONTENT_COVERAGE, `expected < ${MIN_CONTENT_COVERAGE}, got ${cov}`); +}); + +test("an empty candidate scores zero coverage against substantial accepted text", () => { + const accepted = "

one two three four five six seven eight nine ten

"; + assert.equal(contentCoverage(accepted, ""), 0); +}); + +test("returns null when the accepted text is too short to judge", () => { + assert.equal(contentCoverage("

tiny bit here

", "

totally different words instead

"), null); +});