From c84273562746187ebb76bef35ad56358cb220b06 Mon Sep 17 00:00:00 2001
From: oshrizak <63424207+oshrizak@users.noreply.github.com>
Date: Tue, 26 May 2026 09:49:13 -0700
Subject: [PATCH 1/2] feat(feedback): file proposals as issues, fix verify
 prompt, content-aware regression gate

---
 agents/feedback.md       | 53 ++++++++++++++------------
 src/github/issue.ts      | 82 +++++++++++++++++++++++++++++++++++-----
 src/pipeline/feedback.ts | 71 ++++++++++++++++++++++++++++++++--
 3 files changed, 168 insertions(+), 38 deletions(-)

diff --git a/agents/feedback.md b/agents/feedback.md
index 6ea4e68..57a59f9 100644
--- a/agents/feedback.md
+++ b/agents/feedback.md
@@ -1,20 +1,22 @@
 # Feedback Agent
 
 ## Purpose
-The Feedback Agent improves the content-agent library from real signals. It does
-two jobs:
-
-- **VERIFY** — judge whether a freshly built agent's HTML output faithfully and
-  accessibly captures the content of its source image. This is the build-time
-  source-fidelity check on new (session-built) agents (PRD §7.5 / §7.12).
-- **TRAIN** — propose an improved version of an agent's prompt so it stops
-  repeating a mistake. Driven either by a user-feedback correction (a block before
-  and after an edit) or by the problems found during VERIFY (PRD §7.12 / §7.13).
-
-The agent that produced a block may be an existing library agent (a TRAIN result
-is proposed as an update PR on close) or a session-built agent (the improvement is
-trained into it in place, so its new-agent PR carries the fix). The goal is that
-agents learn from real signals rather than repeating the same mistake.
+The Feedback Agent helps Iris's agents learn from real signals instead of
+repeating the same mistake. It does two jobs:
+
+- **VERIFY** — judge whether an agent's HTML output faithfully and accessibly
+  captures its source image. Used at build time to check each page the page agent
+  produces, and reused by the regression gate before any agent change ships
+  (PRD §7.5 / §7.12).
+- **TRAIN** — propose an improved version of an agent's prompt so it avoids a
+  recurring issue, driven either by a user-feedback correction or by the problems
+  found during VERIFY (PRD §7.12 / §7.13).
+
+A proposed improvement to a library agent (e.g. the page agent) is gated on that
+agent's regression fixtures and filed as a GitHub issue for a maintainer to
+review; a session-built agent is trained in place so its contribution carries the
+fix. The goal is that agents improve from real signals rather than repeating the
+same mistake.
 
 ## Required capability
 vision, text
@@ -27,19 +29,20 @@ You are the Feedback Agent. The user message begins with `TASK: verify` or
 `TASK: train`. Do ONLY that task and return ONLY its JSON (no code fences).
 
 TASK: verify
-You are given a content agent's purpose/contract, the HTML it produced for one
-source image, and the source image itself. Decide whether that HTML faithfully
-captures the content this agent is responsible for — right text, right structure,
-nothing missed, nothing invented — AND is accessible. Judge ONLY this agent's
-content type; ignore content that other agents handle. List concrete, actionable
-problems (empty when there are none). Respond with ONLY:
+You are given an agent's purpose/contract, the HTML it produced for one source
+image, and the source image itself. Decide whether that HTML faithfully captures
+everything this agent is responsible for in the image — right text, right
+structure, nothing missed, nothing invented — AND is accessible (WCAG 2.2 AA).
+Respect the agent's declared scope: a whole-page agent is responsible for the
+ENTIRE page; a specialist agent is responsible for its content type. Check every
+part the agent is responsible for. List concrete, actionable problems (empty when
+there are none). Respond with ONLY:
 { "faithful": true|false, "accessible": true|false, "problems": ["..."] }
 
 TASK: train
-You are given an agent's full markdown and either a user-feedback correction (a
-block before and after an edit) or a list of verification problems. Propose an
-improved version of the agent's markdown so it would avoid the issue on similar
-inputs. You MUST:
+You are given an agent's full markdown and either a user-feedback correction or a
+list of verification problems. Propose an improved version of the agent's markdown
+so it would avoid the issue on similar inputs. You MUST:
 - Generalize the lesson into an instruction; do NOT hard-code this document's
   specific text, values, or wording.
 - Be ADDITIVE and backward-compatible: keep every existing instruction and
@@ -48,7 +51,7 @@ inputs. You MUST:
 - Keep the section structure (`# <Type> Agent`, `## Purpose`,
   `## Required capability`, `## System prompt`, `## Output contract`), the agent's
   name, and its declared capabilities unchanged. Forbid CSS/styling; preserve the
-  fragment-log / provenance requirements.
+  agent's output contract, including any log/provenance fields it already emits.
 - If there is no sound, generalizable change to THIS agent, make none.
 Respond with ONLY:
 { "changed": true|false,
diff --git a/src/github/issue.ts b/src/github/issue.ts
index bde5e7f..4e10107 100644
--- a/src/github/issue.ts
+++ b/src/github/issue.ts
@@ -14,18 +14,18 @@ export function parseRepo(url: string): RepoRef {
 // Label maintainers can sort/filter agent suggestions by.
 export const AGENT_LABEL = "iris-agent-suggestion";
 
-async function ensureLabel(octokit: Octokit, repo: RepoRef): Promise<void> {
+async function ensureLabel(
+  octokit: Octokit,
+  repo: RepoRef,
+  name: string = AGENT_LABEL,
+  color = "5319e7",
+  description = "Agent suggested automatically by Equalify Iris",
+): Promise<void> {
   try {
-    await octokit.issues.getLabel({ owner: repo.owner, repo: repo.repo, name: AGENT_LABEL });
+    await octokit.issues.getLabel({ owner: repo.owner, repo: repo.repo, name });
   } catch {
     try {
-      await octokit.issues.createLabel({
-        owner: repo.owner,
-        repo: repo.repo,
-        name: AGENT_LABEL,
-        color: "5319e7",
-        description: "Agent suggested automatically by Equalify Iris",
-      });
+      await octokit.issues.createLabel({ owner: repo.owner, repo: repo.repo, name, color, description });
     } catch {
       // label may have been created concurrently, or insufficient perms — the
       // issue create below still works if the label already exists.
@@ -80,3 +80,67 @@ export async function createAgentIssue(
   });
   return res.data.html_url;
 }
+
+// Label for feedback-driven improvements to an EXISTING agent (distinct from new
+// agent suggestions, so maintainers can triage them separately).
+export const AGENT_UPDATE_LABEL = "iris-agent-update";
+
+export interface AgentUpdateIssue {
+  agentName: string; // e.g. "page.md"
+  agentMarkdown: string; // full proposed updated agent file
+  summary: string; // one-line description of the change
+  diffPreview: string; // human-readable diff of the proposed change
+  sessionId: string;
+}
+
+// File a labeled issue proposing an improvement to an existing agent, produced by
+// the feedback loop (and already gated by the agent's regression fixtures).
+// Returns the issue URL, or null if an open update issue for this agent already
+// exists (dedupe by title).
+export async function createAgentUpdateIssue(
+  token: string,
+  upstreamUrl: string,
+  apiBase: string,
+  args: AgentUpdateIssue,
+): Promise<string | null> {
+  const octokit = new Octokit({ auth: token, baseUrl: apiBase });
+  const repo = parseRepo(upstreamUrl);
+  const name = args.agentName.replace(/\.md$/, "");
+  const title = `Agent update proposal: ${name}`;
+
+  // Dedupe: skip if an open update issue with this title already exists.
+  try {
+    const found = await octokit.search.issuesAndPullRequests({
+      q: `repo:${repo.owner}/${repo.repo} is:issue is:open label:"${AGENT_UPDATE_LABEL}" "${name}" in:title`,
+    });
+    if (found.data.items.some((i) => i.title === title)) return null;
+  } catch {
+    // search unavailable — proceed (a duplicate is acceptable; not worth failing).
+  }
+
+  await ensureLabel(
+    octokit,
+    repo,
+    AGENT_UPDATE_LABEL,
+    "0e8a16",
+    "Agent improvement proposed by Equalify Iris from user feedback",
+  );
+  const body =
+    `**Agent:** \`agents/${name}.md\`\n` +
+    `**Proposed change:** ${args.summary}\n` +
+    `**Session:** ${args.sessionId}\n\n` +
+    `_Auto-filed by Equalify Iris when user feedback produced a generalizable improvement to this agent. ` +
+    `Already gated by the agent's regression fixtures._\n\n` +
+    `## Diff (preview)\n\n` +
+    "```diff\n" + args.diffPreview + "\n```\n\n" +
+    `## Proposed full \`agents/${name}.md\`\n\n` +
+    "```markdown\n" + args.agentMarkdown + "\n```\n";
+  const res = await octokit.issues.create({
+    owner: repo.owner,
+    repo: repo.repo,
+    title,
+    body,
+    labels: [AGENT_UPDATE_LABEL],
+  });
+  return res.data.html_url;
+}
diff --git a/src/pipeline/feedback.ts b/src/pipeline/feedback.ts
index 9a7bc86..ae5b903 100644
--- a/src/pipeline/feedback.ts
+++ b/src/pipeline/feedback.ts
@@ -4,6 +4,8 @@ import { extractJson } from "../util/json.ts";
 import { loadAgent, type AgentSpec } from "../agents/loader.ts";
 import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts";
 import { loadImage, type InputImage, type PipelineContext } from "./context.ts";
+import { flatten } from "./flatten.ts";
+import { createAgentUpdateIssue } from "../github/issue.ts";
 import type { FixtureCase } from "./regression.ts";
 
 // Previously imported from github/contributions.ts, which was removed when the
@@ -112,12 +114,39 @@ export async function verifyAgentOutput(
 // ---------------------------------------------------------------------------
 
 const MAX_GATE_FIXTURES = 3;
+// An updated agent must still reproduce at least this fraction of the words in a
+// fixture's accepted output (by screen-reader-flattened text); below it, the
+// change is treated as a content regression.
+const MIN_CONTENT_COVERAGE = 0.85;
+// Skip the coverage check for very short outputs, where one dropped word swings
+// the ratio — rely on the model verdict alone there.
+const MIN_COVERAGE_WORDS = 8;
 
 export interface RegressionResult {
   passed: boolean;
   failures: string[];
 }
 
+// Fraction of the accepted output's distinct words that still appear in the
+// candidate output (screen-reader-flattened, punctuation-insensitive). Returns
+// null when the accepted text is too short to judge reliably.
+function contentCoverage(acceptedHtml: string, candidateHtml: string): number | null {
+  const words = (html: string): Set<string> =>
+    new Set(
+      flatten(html)
+        .toLowerCase()
+        .replace(/[^a-z0-9\s]/g, " ")
+        .split(/\s+/)
+        .filter((w) => w.length > 1),
+    );
+  const accepted = words(acceptedHtml);
+  if (accepted.size < MIN_COVERAGE_WORDS) return null;
+  const candidate = words(candidateHtml);
+  let hit = 0;
+  for (const w of accepted) if (candidate.has(w)) hit++;
+  return hit / accepted.size;
+}
+
 // Re-run an agent (given its current/updated content) on a fixture image, used by
 // the regression gate. Accepts either output shape: { html } (whole-page agents
 // like page.md) or { no_content, fragments[] } (content agents).
@@ -194,6 +223,16 @@ export async function regressionGate(
       failures.push(`${c.image_file}: updated agent produced no output`);
       continue;
     }
+    // Content-preservation check: the updated agent must still reproduce the
+    // content it produced when this fixture was accepted (PRD §7.12). Compare the
+    // screen-reader-flattened text of the new output against the accepted output;
+    // a large drop means the change regressed a use we already shipped.
+    const candidateHtml = blocks.map((b) => b.html).join("\n\n");
+    const coverage = contentCoverage(c.accepted_html, candidateHtml);
+    if (coverage !== null && coverage < MIN_CONTENT_COVERAGE) {
+      failures.push(`${c.image_file}: only ${(coverage * 100).toFixed(0)}% of the accepted content remained`);
+      continue;
+    }
     const verdict = await verifyAgentOutput(ctx, updatedAgent, img, blocks);
     if (!verdict.ok) failures.push(`${c.image_file}: ${verdict.problems.join("; ") || "failed verification"}`);
   }
@@ -209,10 +248,11 @@ export async function regressionGate(
 
 // On a feedback re-run, turn the document-level correction (the prior reviewed
 // body vs. this run's reviewed body) into an improved version of the agent that
-// produced the document — the page agent in the single-pass pipeline. A library
-// agent becomes an update PR (via agent-updates.md, opened on close), gated on its
-// regression fixtures so the change can't break a use it already handled; a
-// session-built agent is trained in place so its new-agent PR carries the fix.
+// produced the document — the page agent in the single-pass pipeline. For a
+// library agent the proposal is gated on its regression fixtures and then filed
+// as a GitHub issue (the contribution model uses issues, not close-time PRs),
+// while also being recorded in agent-updates.md; a session-built agent is trained
+// in place so its new-agent contribution carries the fix.
 export async function proposeAgentUpdatesFromFeedback(
   ctx: PipelineContext,
   args: { agentFile: string; before: string; after: string; feedback: string },
@@ -291,5 +331,28 @@ export async function proposeAgentUpdatesFromFeedback(
   writeFileSync(path, JSON.stringify([...merged.values()], null, 2));
 
   ctx.log.event("agent_updates_proposed", { agents: [proposal.agent_name], count: 1 });
+
+  // Surface the proposal where maintainers act on it: file a GitHub issue (the
+  // contribution model uses issues, not close-time PRs). Attributed to the
+  // logged-in user unless a service token override is configured. No-op without a
+  // token, so local runs still keep the proposal in agent-updates.md.
+  const token = ctx.cfg.github.issue_token || ctx.githubToken;
+  if (token) {
+    try {
+      const url = await createAgentUpdateIssue(token, ctx.cfg.github.upstream_repo, ctx.cfg.github.api_base_url, {
+        agentName: proposal.agent_name,
+        agentMarkdown: proposal.content,
+        summary: proposal.summary,
+        diffPreview: proposal.diff_preview,
+        sessionId: ctx.sessionId,
+      });
+      ctx.log.event("agent_update_issue", { agent: proposal.agent_name, url: url ?? "(duplicate — skipped)" });
+    } catch (e) {
+      ctx.log.event("agent_update_issue_failed", { agent: proposal.agent_name, error: (e as Error).message });
+    }
+  } else {
+    ctx.log.event("agent_update_issue_skipped", { agent: proposal.agent_name, reason: "no github token" });
+  }
+
   return [proposal];
 }

From 370cc6dd31a8fd668d7b2ef9f12dc776f463524a Mon Sep 17 00:00:00 2001
From: oshrizak <63424207+oshrizak@users.noreply.github.com>
Date: Tue, 26 May 2026 09:58:39 -0700
Subject: [PATCH 2/2] test(feedback): unit tests for contentCoverage; export it
 and ignore flatten role markers

---
 package.json             |  3 ++-
 src/pipeline/feedback.ts |  9 ++++++---
 test/feedback.test.ts    | 41 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 4 deletions(-)
 create mode 100644 test/feedback.test.ts

diff --git a/package.json b/package.json
index 4f97fa8..9a6018d 100644
--- a/package.json
+++ b/package.json
@@ -10,7 +10,8 @@
   "scripts": {
     "start": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite src/index.ts",
     "dev": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite --watch src/index.ts",
-    "typecheck": "tsc --noEmit"
+    "typecheck": "tsc --noEmit",
+    "test": "node --test test/feedback.test.ts"
   },
   "dependencies": {
     "@aws-sdk/client-bedrock-runtime": "^3.682.0",
diff --git a/src/pipeline/feedback.ts b/src/pipeline/feedback.ts
index ae5b903..a2c299f 100644
--- a/src/pipeline/feedback.ts
+++ b/src/pipeline/feedback.ts
@@ -117,7 +117,7 @@ const MAX_GATE_FIXTURES = 3;
 // An updated agent must still reproduce at least this fraction of the words in a
 // fixture's accepted output (by screen-reader-flattened text); below it, the
 // change is treated as a content regression.
-const MIN_CONTENT_COVERAGE = 0.85;
+export const MIN_CONTENT_COVERAGE = 0.85;
 // Skip the coverage check for very short outputs, where one dropped word swings
 // the ratio — rely on the model verdict alone there.
 const MIN_COVERAGE_WORDS = 8;
@@ -129,11 +129,14 @@ export interface RegressionResult {
 
 // Fraction of the accepted output's distinct words that still appear in the
 // candidate output (screen-reader-flattened, punctuation-insensitive). Returns
-// null when the accepted text is too short to judge reliably.
-function contentCoverage(acceptedHtml: string, candidateHtml: string): number | null {
+// null when the accepted text is too short to judge reliably. Structural role
+// markers that flatten() injects ([Heading 2], [List item], …) are stripped so a
+// structure-only change is not mistaken for dropped content.
+export function contentCoverage(acceptedHtml: string, candidateHtml: string): number | null {
   const words = (html: string): Set<string> =>
     new Set(
       flatten(html)
+        .replace(/\[[^\]]*\]/g, " ")
         .toLowerCase()
         .replace(/[^a-z0-9\s]/g, " ")
         .split(/\s+/)
diff --git a/test/feedback.test.ts b/test/feedback.test.ts
new file mode 100644
index 0000000..70d359b
--- /dev/null
+++ b/test/feedback.test.ts
@@ -0,0 +1,41 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { contentCoverage, MIN_CONTENT_COVERAGE } from "../src/pipeline/feedback.ts";
+
+// contentCoverage backs the regression gate's content-preservation check: it
+// measures how much of a fixture's accepted output a candidate (re-run with an
+// updated agent) still reproduces, by screen-reader-flattened text.
+
+test("identical content scores full coverage", () => {
+  const html = "<p>one two three four five six seven eight nine ten</p>";
+  assert.equal(contentCoverage(html, html), 1);
+});
+
+test("a superset candidate still scores full coverage", () => {
+  const accepted = "<p>one two three four five six seven eight nine ten</p>";
+  const candidate = "<h2>one two three four five six seven eight nine ten eleven twelve</h2>";
+  assert.equal(contentCoverage(accepted, candidate), 1);
+});
+
+test("structure-only change (heading vs paragraph) is not a content drop", () => {
+  const accepted = "<h2>alpha bravo charlie delta echo foxtrot golf hotel</h2>";
+  const candidate = "<p>alpha, bravo. charlie! delta? echo: foxtrot; golf hotel</p>";
+  const cov = contentCoverage(accepted, candidate);
+  assert.ok(cov !== null && cov >= MIN_CONTENT_COVERAGE, `expected >= ${MIN_CONTENT_COVERAGE}, got ${cov}`);
+});
+
+test("dropping half the content falls below the threshold", () => {
+  const accepted = "<p>one two three four five six seven eight nine ten</p>";
+  const candidate = "<p>one two three four five</p>";
+  const cov = contentCoverage(accepted, candidate);
+  assert.ok(cov !== null && cov < MIN_CONTENT_COVERAGE, `expected < ${MIN_CONTENT_COVERAGE}, got ${cov}`);
+});
+
+test("an empty candidate scores zero coverage against substantial accepted text", () => {
+  const accepted = "<p>one two three four five six seven eight nine ten</p>";
+  assert.equal(contentCoverage(accepted, ""), 0);
+});
+
+test("returns null when the accepted text is too short to judge", () => {
+  assert.equal(contentCoverage("<p>tiny bit here</p>", "<p>totally different words instead</p>"), null);
+});