From a4179c390e3ae88b88d8d9c750ebbc06910650a3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 May 2026 20:21:22 +0000 Subject: [PATCH 1/6] Initial plan From 69309254f5d4ce2875856833ae5cedc7ff0b07b7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 May 2026 20:31:02 +0000 Subject: [PATCH 2/6] feat: normalize outcome evaluation fields Co-authored-by: mnkiefer <8320933+mnkiefer@users.noreply.github.com> --- actions/setup/js/emit_outcome_spans.cjs | 14 ++- actions/setup/js/emit_outcome_spans.test.cjs | 24 ++++- actions/setup/js/evaluate_outcomes.cjs | 88 +++++++++++++++- pkg/cli/outcome_eval.go | 57 +++++++---- pkg/cli/outcome_eval_generic.go | 10 +- pkg/cli/outcome_eval_jsonl.go | 4 + pkg/cli/outcome_eval_test.go | 97 ++++++++++++++++++ pkg/cli/outcome_evaluation.go | 100 +++++++++++++++++++ 8 files changed, 365 insertions(+), 29 deletions(-) create mode 100644 pkg/cli/outcome_evaluation.go diff --git a/actions/setup/js/emit_outcome_spans.cjs b/actions/setup/js/emit_outcome_spans.cjs index 928f30728cb..0c3def7289a 100644 --- a/actions/setup/js/emit_outcome_spans.cjs +++ b/actions/setup/js/emit_outcome_spans.cjs @@ -136,6 +136,9 @@ async function main() { for (const eval_ of evaluations) { const type = typeof eval_.type === "string" ? eval_.type : ""; const result = typeof eval_.result === "string" ? eval_.result : "unknown"; + const outcomeStatus = typeof eval_.outcome_status === "string" ? eval_.outcome_status : result; + const evidenceStrength = typeof eval_.evidence_strength === "string" ? eval_.evidence_strength : "weak"; + const signal = typeof eval_.signal === "string" ? eval_.signal : ""; const detail = typeof eval_.detail === "string" ? eval_.detail : ""; const workflow = typeof eval_.workflow === "string" ? eval_.workflow : ""; const sourceRunId = typeof eval_.run_id === "number" ? eval_.run_id : 0; @@ -159,6 +162,8 @@ async function main() { buildAttr("gh-aw.exporter.name", "outcome-collector"), buildAttr("gh-aw.outcome.type", type), buildAttr("gh-aw.outcome.result", result), + buildAttr("gh-aw.outcome.outcome_status", outcomeStatus), + buildAttr("gh-aw.outcome.evidence_strength", evidenceStrength), buildAttr("gh-aw.outcome.workflow", workflow), buildAttr("gh-aw.outcome.run_id", sourceRunId), buildAttr("gh-aw.outcome.repo", repo), @@ -166,6 +171,7 @@ async function main() { if (url) attributes.push(buildAttr("gh-aw.outcome.url", url)); if (detail) attributes.push(buildAttr("gh-aw.outcome.detail", detail)); + if (signal) attributes.push(buildAttr("gh-aw.outcome.signal", signal)); if (timestamp) attributes.push(buildAttr("gh-aw.outcome.created_at", timestamp)); if (event) attributes.push(buildAttr("gh-aw.outcome.event", event)); if (resolutionSec !== null) attributes.push(buildAttr("gh-aw.outcome.resolution_sec", resolutionSec)); @@ -180,8 +186,8 @@ async function main() { if (comments !== null) attributes.push(buildAttr("gh-aw.outcome.comments", comments)); if (zeroTouch) attributes.push(buildAttr("gh-aw.outcome.zero_touch", true)); - // Map result to OTLP status: accepted=OK, rejected=ERROR, noop=UNSET, pending/ignored=UNSET - const statusCode = result === "rejected" ? 2 : result === "accepted" ? 1 : 0; + // Map normalized outcome_status to OTLP status: accepted=OK, rejected=ERROR, all others=UNSET + const statusCode = outcomeStatus === "rejected" ? 2 : outcomeStatus === "accepted" ? 1 : 0; itemSpans.push( buildOTLPSpan({ @@ -213,6 +219,10 @@ async function main() { buildAttr("gh-aw.outcome.ignored", getSummaryNumber("ignored", 0)), buildAttr("gh-aw.outcome.pending", getSummaryNumber("pending", 0)), buildAttr("gh-aw.outcome.noop", getSummaryNumber("noop", 0)), + buildAttr("gh-aw.outcome.accepted_strong", getSummaryNumber("accepted_strong", 0)), + buildAttr("gh-aw.outcome.accepted_medium", getSummaryNumber("accepted_medium", 0)), + buildAttr("gh-aw.outcome.accepted_weak", getSummaryNumber("accepted_weak", 0)), + buildAttr("gh-aw.outcome.fallback_exists_only_count", getSummaryNumber("fallback_exists_only_count", 0)), buildAttr("gh-aw.outcome.acceptance_rate", getSummaryNumber("acceptance_rate", 0)), buildAttr("gh-aw.outcome.waste_rate", getSummaryNumber("waste_rate", 0)), buildAttr("gh-aw.outcome.noop_rate", getSummaryNumber("noop_rate", 0)), diff --git a/actions/setup/js/emit_outcome_spans.test.cjs b/actions/setup/js/emit_outcome_spans.test.cjs index f13bb36f64e..715e300713d 100644 --- a/actions/setup/js/emit_outcome_spans.test.cjs +++ b/actions/setup/js/emit_outcome_spans.test.cjs @@ -183,6 +183,10 @@ describe("emit_outcome_spans.cjs", () => { ignored: 0, pending: 0, noop: 0, + accepted_strong: 1, + accepted_medium: 0, + accepted_weak: 0, + fallback_exists_only_count: 1, noop_rate: 0, zero_touch: 1, zero_touch_rate: 1, @@ -197,7 +201,10 @@ describe("emit_outcome_spans.cjs", () => { JSON.stringify({ type: "issue", result: "accepted", - detail: "created item", + outcome_status: "accepted", + evidence_strength: "strong", + signal: "merged", + detail: "merged", workflow: "triage", run_id: 101, url: "https://github.com/github/gh-aw/issues/1", @@ -216,6 +223,9 @@ describe("emit_outcome_spans.cjs", () => { JSON.stringify({ type: "comment", result: "rejected", + outcome_status: "unknown", + evidence_strength: "weak", + signal: "target_exists_only", workflow: "triage", run_id: 102, repo: "github/gh-aw", @@ -271,16 +281,21 @@ describe("emit_outcome_spans.cjs", () => { expect.objectContaining({ spanName: "gh-aw.outcome.evaluation", parentSpanId: summarySpan.spanId, - statusCode: 2, + statusCode: 0, }) ); expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.exporter.name", value: "outcome-collector" }); expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.date", value: "2026-05-13" }); expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.zero_touch_count", value: 1 }); + expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.accepted_strong", value: 1 }); + expect(summarySpan.attributes).toContainEqual({ key: "gh-aw.outcome.fallback_exists_only_count", value: 1 }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.exporter.name", value: "outcome-collector" }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.url", value: "https://github.com/github/gh-aw/issues/1" }); - expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.detail", value: "created item" }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.detail", value: "merged" }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.outcome_status", value: "accepted" }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.evidence_strength", value: "strong" }); + expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.signal", value: "merged" }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.created_at", value: "2026-05-13T09:00:00Z" }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.review_comments", value: 0 }); expect(spans[1].attributes).toContainEqual({ key: "gh-aw.outcome.changed_files", value: 3 }); @@ -300,6 +315,9 @@ describe("emit_outcome_spans.cjs", () => { expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.reactions_negative")).toBeUndefined(); expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.comments")).toBeUndefined(); expect(spans[2].attributes.find(attr => attr.key === "gh-aw.outcome.zero_touch")).toBeUndefined(); + expect(spans[2].attributes).toContainEqual({ key: "gh-aw.outcome.outcome_status", value: "unknown" }); + expect(spans[2].attributes).toContainEqual({ key: "gh-aw.outcome.evidence_strength", value: "weak" }); + expect(spans[2].attributes).toContainEqual({ key: "gh-aw.outcome.signal", value: "target_exists_only" }); expect(mockAppendToOTLPJSONL).toHaveBeenCalledOnce(); expect(mockSendOTLPToAllEndpoints).not.toHaveBeenCalled(); diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index 3fd30d87947..6c1dc2bd6e0 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -156,6 +156,9 @@ function secondsBetween(from, to) { /** * @typedef {object} EvalResult * @property {string} result + * @property {"accepted"|"rejected"|"pending"|"ignored"|"skipped"|"unknown"} outcome_status + * @property {"strong"|"medium"|"weak"} evidence_strength + * @property {string} signal * @property {string} detail * @property {number | null} resolution_sec * @property {number | null} pending_age_sec @@ -170,6 +173,44 @@ function secondsBetween(from, to) { * @property {boolean} zero_touch */ +/** + * Normalize legacy result/detail pairs into the shared outcome model. + * @param {string} result + * @param {string} detail + * @returns {{ outcome_status: "accepted"|"rejected"|"pending"|"ignored"|"skipped"|"unknown", evidence_strength: "strong"|"medium"|"weak", signal: string }} + */ +function normalizeOutcome(result, detail) { + const normalizedDetail = String(detail || "").toLowerCase().trim(); + + if (result === "noop") { + return { outcome_status: "skipped", evidence_strength: "weak", signal: "noop" }; + } + if (normalizedDetail === "object exists") { + return { outcome_status: "unknown", evidence_strength: "weak", signal: "target_exists_only" }; + } + if (result === "accepted" && normalizedDetail === "merged") { + return { outcome_status: "accepted", evidence_strength: "strong", signal: "merged" }; + } + if (result === "rejected" && normalizedDetail === "closed") { + return { outcome_status: "rejected", evidence_strength: "strong", signal: "closed" }; + } + if (result === "pending" && normalizedDetail === "open") { + return { outcome_status: "pending", evidence_strength: "medium", signal: "open" }; + } + switch (result) { + case "accepted": + return { outcome_status: "accepted", evidence_strength: "medium", signal: "acted_on" }; + case "rejected": + return { outcome_status: "rejected", evidence_strength: "medium", signal: "rejected" }; + case "ignored": + return { outcome_status: "ignored", evidence_strength: "medium", signal: "ignored" }; + case "pending": + return { outcome_status: "pending", evidence_strength: "medium", signal: "pending" }; + default: + return { outcome_status: "unknown", evidence_strength: "weak", signal: "unknown" }; + } +} + /** * Evaluate a single safe-output item against the GitHub API. * @param {object} item @@ -184,6 +225,9 @@ function evaluateItem(item, defaultRepo) { /** @type {EvalResult} */ const out = { result: "pending", + outcome_status: "pending", + evidence_strength: "medium", + signal: "pending", detail: "", resolution_sec: null, pending_age_sec: null, @@ -291,7 +335,10 @@ function evaluateItem(item, defaultRepo) { } // Comments, labels, etc. — if URL exists, the item was created - out.result = "accepted"; + out.result = "unknown"; + out.outcome_status = "unknown"; + out.evidence_strength = "weak"; + out.signal = "target_exists_only"; out.detail = "object exists"; return out; } @@ -349,11 +396,15 @@ function main() { let checked = 0; let accepted = 0; let rejected = 0; - const ignored = 0; + let ignored = 0; let pending = 0; let total = 0; let noop = 0; let zeroTouchCount = 0; + let acceptedStrong = 0; + let acceptedMedium = 0; + let acceptedWeak = 0; + let fallbackExistsOnlyCount = 0; /** @type {number[]} */ const resolutionTimes = []; @@ -398,6 +449,7 @@ function main() { // Write noop entries for (const n of noops) { + const normalized = normalizeOutcome("noop", n.type || ""); fs.appendFileSync( EVAL_JSONL, JSON.stringify({ @@ -405,6 +457,9 @@ function main() { url: "", repo, result: "noop", + outcome_status: normalized.outcome_status, + evidence_strength: normalized.evidence_strength, + signal: normalized.signal, detail: n.type, workflow, run_id: runId, @@ -430,10 +485,22 @@ function main() { // Evaluate each actionable item for (const item of actionable) { const evalResult = evaluateItem(item, repo); + const normalized = normalizeOutcome(evalResult.result, evalResult.detail); - switch (evalResult.result) { + switch (normalized.outcome_status) { case "accepted": accepted++; + switch (normalized.evidence_strength) { + case "strong": + acceptedStrong++; + break; + case "medium": + acceptedMedium++; + break; + case "weak": + acceptedWeak++; + break; + } if (evalResult.zero_touch === true) { zeroTouchCount++; } @@ -441,10 +508,16 @@ function main() { case "rejected": rejected++; break; - default: + case "ignored": + ignored++; + break; + case "pending": pending++; break; } + if (normalized.signal === "target_exists_only") { + fallbackExistsOnlyCount++; + } if (typeof evalResult.resolution_sec === "number" && evalResult.resolution_sec > 0) { resolutionTimes.push(evalResult.resolution_sec); } @@ -456,6 +529,9 @@ function main() { url: item.url || "", repo: item.repo || repo, result: evalResult.result, + outcome_status: normalized.outcome_status, + evidence_strength: normalized.evidence_strength, + signal: normalized.signal, detail: evalResult.detail, workflow, run_id: runId, @@ -511,6 +587,10 @@ function main() { ignored, pending, noop, + accepted_strong: acceptedStrong, + accepted_medium: acceptedMedium, + accepted_weak: acceptedWeak, + fallback_exists_only_count: fallbackExistsOnlyCount, acceptance_rate: Math.round(acceptanceRate * 10000) / 10000, waste_rate: Math.round(wasteRate * 10000) / 10000, noop_rate: Math.round(noopRate * 10000) / 10000, diff --git a/pkg/cli/outcome_eval.go b/pkg/cli/outcome_eval.go index a8394598cc8..8afc52f26a3 100644 --- a/pkg/cli/outcome_eval.go +++ b/pkg/cli/outcome_eval.go @@ -21,12 +21,14 @@ const ( OutcomeRejected OutcomeResult = "rejected" OutcomeIgnored OutcomeResult = "ignored" OutcomePending OutcomeResult = "pending" + OutcomeUnknown OutcomeResult = "unknown" OutcomeLifecycle OutcomeResult = "lifecycle" OutcomeError OutcomeResult = "error" ) // OutcomeReport is the result of evaluating one safe output item. type OutcomeReport struct { + OutcomeEvaluation Type string `json:"type" console:"header:Type"` ObjectURL string `json:"object_url,omitempty" console:"header:URL,omitempty"` ObjectNumber int `json:"object_number,omitempty" console:"header:#,omitempty"` @@ -45,19 +47,23 @@ type OutcomeReport struct { // OutcomeSummary aggregates outcomes across multiple safe output items. type OutcomeSummary struct { - Total int `json:"total" console:"header:Total"` - Accepted int `json:"accepted" console:"header:Accepted"` - Rejected int `json:"rejected" console:"header:Rejected"` - Ignored int `json:"ignored" console:"header:Ignored"` - Pending int `json:"pending" console:"header:Pending"` - Lifecycle int `json:"lifecycle" console:"header:Lifecycle"` - Errors int `json:"errors" console:"header:Errors"` - ZeroTouch int `json:"zero_touch" console:"header:Zero-touch"` - AcceptanceRate float64 `json:"acceptance_rate" console:"header:Acceptance Rate"` - WasteRate float64 `json:"waste_rate" console:"header:Waste Rate"` - ZeroTouchRate float64 `json:"zero_touch_rate" console:"header:Zero-touch Rate"` - MedianTimeToOutcome float64 `json:"median_time_to_outcome_hours,omitempty"` - CostPerAcceptedOutcome float64 `json:"cost_per_accepted_outcome,omitempty"` + Total int `json:"total" console:"header:Total"` + Accepted int `json:"accepted" console:"header:Accepted"` + Rejected int `json:"rejected" console:"header:Rejected"` + Ignored int `json:"ignored" console:"header:Ignored"` + Pending int `json:"pending" console:"header:Pending"` + AcceptedStrong int `json:"accepted_strong,omitempty"` + AcceptedMedium int `json:"accepted_medium,omitempty"` + AcceptedWeak int `json:"accepted_weak,omitempty"` + FallbackExistsOnlyCount int `json:"fallback_exists_only_count,omitempty"` + Lifecycle int `json:"lifecycle" console:"header:Lifecycle"` + Errors int `json:"errors" console:"header:Errors"` + ZeroTouch int `json:"zero_touch" console:"header:Zero-touch"` + AcceptanceRate float64 `json:"acceptance_rate" console:"header:Acceptance Rate"` + WasteRate float64 `json:"waste_rate" console:"header:Waste Rate"` + ZeroTouchRate float64 `json:"zero_touch_rate" console:"header:Zero-touch Rate"` + MedianTimeToOutcome float64 `json:"median_time_to_outcome_hours,omitempty"` + CostPerAcceptedOutcome float64 `json:"cost_per_accepted_outcome,omitempty"` } // outcomeEvaluator is a function that evaluates one safe output item. @@ -112,6 +118,7 @@ func EvaluateOutcomes(items []CreatedItemReport, repoOverride string) []OutcomeR report := eval(item, repo) report.CreatedAt = item.Timestamp report.CheckedAt = time.Now().UTC().Format(time.RFC3339) + report.OutcomeEvaluation = normalizeOutcomeEvaluation(report) reports = append(reports, report) } outcomeEvalLog.Printf("Outcome evaluation complete: reports=%d, skipped=%d", len(reports), skipped) @@ -123,18 +130,32 @@ func ComputeOutcomeSummary(reports []OutcomeReport, totalCost float64) OutcomeSu s := OutcomeSummary{Total: len(reports)} var times []float64 for _, r := range reports { - switch r.Result { - case OutcomeAccepted: + eval := normalizeOutcomeEvaluation(r) + switch eval.OutcomeStatus { + case OutcomeStatusAccepted: s.Accepted++ + switch eval.EvidenceStrength { + case EvidenceStrong: + s.AcceptedStrong++ + case EvidenceMedium: + s.AcceptedMedium++ + case EvidenceWeak: + s.AcceptedWeak++ + } if r.ZeroTouch { s.ZeroTouch++ } - case OutcomeRejected: + case OutcomeStatusRejected: s.Rejected++ - case OutcomeIgnored: + case OutcomeStatusIgnored: s.Ignored++ - case OutcomePending: + case OutcomeStatusPending: s.Pending++ + } + if eval.Signal == "target_exists_only" { + s.FallbackExistsOnlyCount++ + } + switch r.Result { case OutcomeLifecycle: s.Lifecycle++ case OutcomeError: diff --git a/pkg/cli/outcome_eval_generic.go b/pkg/cli/outcome_eval_generic.go index 16e7faa8039..d327e70c57f 100644 --- a/pkg/cli/outcome_eval_generic.go +++ b/pkg/cli/outcome_eval_generic.go @@ -7,6 +7,7 @@ import ( ) var outcomeEvalGenericLog = logger.New("cli:outcome_eval_generic") +var genericOutcomeGHAPIGet = ghAPIGet // evalCloseSticky checks whether a closed issue or PR stayed closed. func evalCloseSticky(item CreatedItemReport, repoOverride string) OutcomeReport { @@ -238,14 +239,19 @@ func evalGenericSticky(item CreatedItemReport, repoOverride string) OutcomeRepor return report } - _, err := ghAPIGet(fmt.Sprintf("issues/%d", num), repo) + _, err := genericOutcomeGHAPIGet(fmt.Sprintf("issues/%d", num), repo) if err != nil { report.Result = OutcomeError report.EvalError = err.Error() return report } - report.Result = OutcomeAccepted + report.Result = OutcomeUnknown report.Detail = "object still exists" + report.OutcomeEvaluation = OutcomeEvaluation{ + OutcomeStatus: OutcomeStatusUnknown, + EvidenceStrength: EvidenceWeak, + Signal: "target_exists_only", + } return report } diff --git a/pkg/cli/outcome_eval_jsonl.go b/pkg/cli/outcome_eval_jsonl.go index 7029b253bd9..81f648c2b52 100644 --- a/pkg/cli/outcome_eval_jsonl.go +++ b/pkg/cli/outcome_eval_jsonl.go @@ -26,10 +26,14 @@ func writeOutcomeJSONL(dir string, runID int64, reports []OutcomeReport) { defer f.Close() for _, r := range reports { + eval := normalizeOutcomeEvaluation(r) entry := map[string]any{ "run_id": runID, "type": r.Type, "result": r.Result, + "outcome_status": eval.OutcomeStatus, + "evidence_strength": eval.EvidenceStrength, + "signal": eval.Signal, "detail": r.Detail, "object_url": r.ObjectURL, "object_number": r.ObjectNumber, diff --git a/pkg/cli/outcome_eval_test.go b/pkg/cli/outcome_eval_test.go index 657226d0cd2..6cefcd5b119 100644 --- a/pkg/cli/outcome_eval_test.go +++ b/pkg/cli/outcome_eval_test.go @@ -3,9 +3,13 @@ package cli import ( + "encoding/json" + "os" + "path/filepath" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestComputeOutcomeSummary(t *testing.T) { @@ -27,6 +31,9 @@ func TestComputeOutcomeSummary(t *testing.T) { assert.Equal(t, 1, s.Pending, "pending count") assert.Equal(t, 1, s.Lifecycle, "lifecycle count") assert.Equal(t, 1, s.ZeroTouch, "zero-touch count") + assert.Equal(t, 0, s.AcceptedStrong, "accepted strong count") + assert.Equal(t, 2, s.AcceptedMedium, "accepted medium count") + assert.Equal(t, 0, s.AcceptedWeak, "accepted weak count") // AcceptanceRate = accepted / (accepted + rejected) = 2/3 assert.InDelta(t, 0.6667, s.AcceptanceRate, 0.01, "acceptance rate") @@ -198,3 +205,93 @@ func TestEvaluateOutcomesErrorOnMissingData(t *testing.T) { assert.Len(t, reports, 1, "should produce one report") assert.Equal(t, OutcomeError, reports[0].Result, "should error on missing repo and number") } + +func TestNormalizeOutcomeEvaluationTargetExistsOnly(t *testing.T) { + report := OutcomeReport{ + Type: "add_labels", + Result: OutcomeUnknown, + Detail: "object still exists", + } + + eval := normalizeOutcomeEvaluation(report) + assert.Equal(t, OutcomeStatusUnknown, eval.OutcomeStatus) + assert.Equal(t, EvidenceWeak, eval.EvidenceStrength) + assert.Equal(t, "target_exists_only", eval.Signal) +} + +func TestEvalGenericStickyTargetExistsOnlyFallback(t *testing.T) { + old := genericOutcomeGHAPIGet + t.Cleanup(func() { + genericOutcomeGHAPIGet = old + }) + genericOutcomeGHAPIGet = func(endpoint string, repo string) (map[string]any, error) { + return map[string]any{"state": "open"}, nil + } + + report := evalGenericSticky( + CreatedItemReport{Type: "add_labels", Number: 42, Repo: "owner/repo"}, + "owner/repo", + ) + + assert.Equal(t, OutcomeUnknown, report.Result) + assert.Equal(t, OutcomeStatusUnknown, report.OutcomeStatus) + assert.Equal(t, EvidenceWeak, report.EvidenceStrength) + assert.Equal(t, "target_exists_only", report.Signal) +} + +func TestComputeOutcomeSummaryDoesNotCountExistsOnlyFallbackAsAccepted(t *testing.T) { + reports := []OutcomeReport{ + { + Type: "add_labels", + Result: OutcomeUnknown, + OutcomeEvaluation: OutcomeEvaluation{ + OutcomeStatus: OutcomeStatusUnknown, + EvidenceStrength: EvidenceWeak, + Signal: "target_exists_only", + }, + }, + { + Type: "create_pull_request", + Result: OutcomeAccepted, + OutcomeEvaluation: OutcomeEvaluation{ + OutcomeStatus: OutcomeStatusAccepted, + EvidenceStrength: EvidenceStrong, + Signal: "merged", + }, + }, + } + + s := ComputeOutcomeSummary(reports, 0) + assert.Equal(t, 1, s.Accepted) + assert.Equal(t, 1, s.AcceptedStrong) + assert.Equal(t, 0, s.AcceptedWeak) + assert.Equal(t, 1, s.FallbackExistsOnlyCount) +} + +func TestWriteOutcomeJSONLEmitsNormalizedFields(t *testing.T) { + dir := t.TempDir() + reports := []OutcomeReport{ + { + Type: "add_labels", + Result: OutcomeUnknown, + OutcomeEvaluation: OutcomeEvaluation{ + OutcomeStatus: OutcomeStatusUnknown, + EvidenceStrength: EvidenceWeak, + Signal: "target_exists_only", + }, + CreatedAt: "2026-05-12T00:00:00Z", + CheckedAt: "2026-05-12T01:00:00Z", + }, + } + + writeOutcomeJSONL(dir, 123, reports) + + data, err := os.ReadFile(filepath.Join(dir, "outcomes-123.jsonl")) + require.NoError(t, err) + + var entry map[string]any + require.NoError(t, json.Unmarshal(data[:len(data)-1], &entry)) + assert.Equal(t, "unknown", entry["outcome_status"]) + assert.Equal(t, "weak", entry["evidence_strength"]) + assert.Equal(t, "target_exists_only", entry["signal"]) +} diff --git a/pkg/cli/outcome_evaluation.go b/pkg/cli/outcome_evaluation.go new file mode 100644 index 00000000000..d9f7c950bb2 --- /dev/null +++ b/pkg/cli/outcome_evaluation.go @@ -0,0 +1,100 @@ +package cli + +import "strings" + +// OutcomeStatus is the normalized classification for a safe output outcome. +type OutcomeStatus string + +const ( + OutcomeStatusAccepted OutcomeStatus = "accepted" + OutcomeStatusRejected OutcomeStatus = "rejected" + OutcomeStatusPending OutcomeStatus = "pending" + OutcomeStatusIgnored OutcomeStatus = "ignored" + OutcomeStatusSkipped OutcomeStatus = "skipped" + OutcomeStatusUnknown OutcomeStatus = "unknown" +) + +// EvidenceStrength describes how confidently the outcome can be inferred. +type EvidenceStrength string + +const ( + EvidenceStrong EvidenceStrength = "strong" + EvidenceMedium EvidenceStrength = "medium" + EvidenceWeak EvidenceStrength = "weak" +) + +// OutcomeEvaluation is the shared normalized outcome model. +type OutcomeEvaluation struct { + OutcomeStatus OutcomeStatus `json:"outcome_status"` + EvidenceStrength EvidenceStrength `json:"evidence_strength"` + Signal string `json:"signal,omitempty"` +} + +func normalizeOutcomeEvaluation(report OutcomeReport) OutcomeEvaluation { + if report.OutcomeStatus != "" { + return report.OutcomeEvaluation + } + + if report.EvalError != "" || report.Result == OutcomeError { + return OutcomeEvaluation{ + OutcomeStatus: OutcomeStatusUnknown, + EvidenceStrength: EvidenceWeak, + Signal: "evaluation_error", + } + } + + detail := strings.ToLower(strings.TrimSpace(report.Detail)) + + switch { + case strings.Contains(detail, "object still exists"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusUnknown, EvidenceStrength: EvidenceWeak, Signal: "target_exists_only"} + case strings.Contains(detail, "closed without merge"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceStrong, Signal: "closed_without_merge"} + case strings.Contains(detail, "closed as not planned"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceStrong, Signal: "closed_not_planned"} + case strings.Contains(detail, "closed by bot"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusUnknown, EvidenceStrength: EvidenceMedium, Signal: "lifecycle"} + case strings.Contains(detail, "merged"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusAccepted, EvidenceStrength: EvidenceStrong, Signal: "merged"} + case strings.Contains(detail, "reopened"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceStrong, Signal: "reopened"} + case strings.Contains(detail, "deleted"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceStrong, Signal: "deleted"} + case strings.Contains(detail, "completed"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusAccepted, EvidenceStrength: EvidenceStrong, Signal: "completed"} + case strings.Contains(detail, "milestone still assigned"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusAccepted, EvidenceStrength: EvidenceMedium, Signal: "milestone_assigned"} + case strings.Contains(detail, "milestone removed"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceMedium, Signal: "milestone_removed"} + case strings.Contains(detail, "reviews submitted"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusAccepted, EvidenceStrength: EvidenceMedium, Signal: "reviewed"} + case strings.Contains(detail, "awaiting review"), strings.Contains(detail, "no reviews yet"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusPending, EvidenceStrength: EvidenceMedium, Signal: "awaiting_review"} + case strings.Contains(detail, "no engagement"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusIgnored, EvidenceStrength: EvidenceMedium, Signal: "no_engagement"} + case strings.Contains(detail, "human comments"), strings.Contains(detail, "with comments"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusPending, EvidenceStrength: EvidenceMedium, Signal: "acted_on"} + case strings.Contains(detail, "open"): + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusPending, EvidenceStrength: EvidenceMedium, Signal: "open"} + case strings.Contains(detail, "closed"): + if report.Result == OutcomeRejected { + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceStrong, Signal: "closed"} + } + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusAccepted, EvidenceStrength: EvidenceStrong, Signal: "closed"} + } + + switch report.Result { + case OutcomeAccepted: + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusAccepted, EvidenceStrength: EvidenceMedium, Signal: "acted_on"} + case OutcomeRejected: + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusRejected, EvidenceStrength: EvidenceMedium, Signal: "rejected"} + case OutcomePending: + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusPending, EvidenceStrength: EvidenceMedium, Signal: "pending"} + case OutcomeIgnored: + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusIgnored, EvidenceStrength: EvidenceMedium, Signal: "ignored"} + case OutcomeUnknown: + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusUnknown, EvidenceStrength: EvidenceWeak, Signal: "unknown"} + default: + return OutcomeEvaluation{OutcomeStatus: OutcomeStatusUnknown, EvidenceStrength: EvidenceWeak, Signal: "unknown"} + } +} From 7918e7ed24c996f63799fbe9e42d8a2f0ac50a8f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 May 2026 20:41:26 +0000 Subject: [PATCH 3/6] fix: handle outcome JSONL write errors Co-authored-by: mnkiefer <8320933+mnkiefer@users.noreply.github.com> --- actions/setup/js/evaluate_outcomes.cjs | 4 +++- pkg/cli/outcome_eval_jsonl.go | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index 6c1dc2bd6e0..91af7350b57 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -180,7 +180,9 @@ function secondsBetween(from, to) { * @returns {{ outcome_status: "accepted"|"rejected"|"pending"|"ignored"|"skipped"|"unknown", evidence_strength: "strong"|"medium"|"weak", signal: string }} */ function normalizeOutcome(result, detail) { - const normalizedDetail = String(detail || "").toLowerCase().trim(); + const normalizedDetail = String(detail || "") + .toLowerCase() + .trim(); if (result === "noop") { return { outcome_status: "skipped", evidence_strength: "weak", signal: "noop" }; diff --git a/pkg/cli/outcome_eval_jsonl.go b/pkg/cli/outcome_eval_jsonl.go index 81f648c2b52..8d3c119006e 100644 --- a/pkg/cli/outcome_eval_jsonl.go +++ b/pkg/cli/outcome_eval_jsonl.go @@ -49,8 +49,14 @@ func writeOutcomeJSONL(dir string, runID int64, reports []OutcomeReport) { if err != nil { continue } - f.Write(line) - f.WriteString("\n") + if _, err := f.Write(line); err != nil { + outcomeEvalLog.Printf("Failed to write outcome entry to %s: %v", filePath, err) + return + } + if _, err := f.WriteString("\n"); err != nil { + outcomeEvalLog.Printf("Failed to write newline to %s: %v", filePath, err) + return + } } outcomeEvalLog.Printf("Wrote %d outcome entries to %s", len(reports), filePath) From 74cff65b555411e1a5b6f138d8aa65aa92a66276 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 May 2026 20:43:18 +0000 Subject: [PATCH 4/6] test: cover outcome normalization helpers Co-authored-by: mnkiefer <8320933+mnkiefer@users.noreply.github.com> --- actions/setup/js/emit_outcome_spans.cjs | 2 ++ actions/setup/js/evaluate_outcomes.cjs | 6 +++--- actions/setup/js/evaluate_outcomes.test.cjs | 23 +++++++++++++++++++++ pkg/cli/outcome_eval_test.go | 2 +- pkg/cli/outcome_evaluation.go | 2 +- 5 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 actions/setup/js/evaluate_outcomes.test.cjs diff --git a/actions/setup/js/emit_outcome_spans.cjs b/actions/setup/js/emit_outcome_spans.cjs index 0c3def7289a..fd2d5b105bb 100644 --- a/actions/setup/js/emit_outcome_spans.cjs +++ b/actions/setup/js/emit_outcome_spans.cjs @@ -136,6 +136,8 @@ async function main() { for (const eval_ of evaluations) { const type = typeof eval_.type === "string" ? eval_.type : ""; const result = typeof eval_.result === "string" ? eval_.result : "unknown"; + // Fall back to the legacy result field so older JSONL artifacts still render + // useful spans while newer artifacts carry explicit normalized fields. const outcomeStatus = typeof eval_.outcome_status === "string" ? eval_.outcome_status : result; const evidenceStrength = typeof eval_.evidence_strength === "string" ? eval_.evidence_strength : "weak"; const signal = typeof eval_.signal === "string" ? eval_.signal : ""; diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index 91af7350b57..ac3d15015d6 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -187,7 +187,7 @@ function normalizeOutcome(result, detail) { if (result === "noop") { return { outcome_status: "skipped", evidence_strength: "weak", signal: "noop" }; } - if (normalizedDetail === "object exists") { + if (normalizedDetail === "object still exists") { return { outcome_status: "unknown", evidence_strength: "weak", signal: "target_exists_only" }; } if (result === "accepted" && normalizedDetail === "merged") { @@ -341,7 +341,7 @@ function evaluateItem(item, defaultRepo) { out.outcome_status = "unknown"; out.evidence_strength = "weak"; out.signal = "target_exists_only"; - out.detail = "object exists"; + out.detail = "object still exists"; return out; } @@ -616,4 +616,4 @@ if (require.main === module) { main(); } -module.exports = { main, evaluateItem, readJSONL, secondsBetween, isoToEpoch }; +module.exports = { main, evaluateItem, normalizeOutcome, readJSONL, secondsBetween, isoToEpoch }; diff --git a/actions/setup/js/evaluate_outcomes.test.cjs b/actions/setup/js/evaluate_outcomes.test.cjs new file mode 100644 index 00000000000..024e3490d95 --- /dev/null +++ b/actions/setup/js/evaluate_outcomes.test.cjs @@ -0,0 +1,23 @@ +import { describe, expect, it } from "vitest"; +import { createRequire } from "module"; + +const req = createRequire(import.meta.url); +const { normalizeOutcome } = req("./evaluate_outcomes.cjs"); + +describe("evaluate_outcomes.cjs", () => { + it("maps existence-only fallback to weak unknown evidence", () => { + expect(normalizeOutcome("unknown", "object still exists")).toEqual({ + outcome_status: "unknown", + evidence_strength: "weak", + signal: "target_exists_only", + }); + }); + + it("maps merged outcomes to strong accepted evidence", () => { + expect(normalizeOutcome("accepted", "merged")).toEqual({ + outcome_status: "accepted", + evidence_strength: "strong", + signal: "merged", + }); + }); +}); diff --git a/pkg/cli/outcome_eval_test.go b/pkg/cli/outcome_eval_test.go index 6cefcd5b119..f9b86710844 100644 --- a/pkg/cli/outcome_eval_test.go +++ b/pkg/cli/outcome_eval_test.go @@ -239,7 +239,7 @@ func TestEvalGenericStickyTargetExistsOnlyFallback(t *testing.T) { assert.Equal(t, "target_exists_only", report.Signal) } -func TestComputeOutcomeSummaryDoesNotCountExistsOnlyFallbackAsAccepted(t *testing.T) { +func TestOutcomeSummaryExcludesExistsOnlyFromAccepted(t *testing.T) { reports := []OutcomeReport{ { Type: "add_labels", diff --git a/pkg/cli/outcome_evaluation.go b/pkg/cli/outcome_evaluation.go index d9f7c950bb2..ed732c58c6c 100644 --- a/pkg/cli/outcome_evaluation.go +++ b/pkg/cli/outcome_evaluation.go @@ -31,7 +31,7 @@ type OutcomeEvaluation struct { } func normalizeOutcomeEvaluation(report OutcomeReport) OutcomeEvaluation { - if report.OutcomeStatus != "" { + if report.OutcomeStatus != "" && report.EvidenceStrength != "" { return report.OutcomeEvaluation } From e217e0c0ef60c49d2d1399bf10f5a1893cf313ae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 May 2026 20:44:34 +0000 Subject: [PATCH 5/6] chore: polish outcome normalization validation Co-authored-by: mnkiefer <8320933+mnkiefer@users.noreply.github.com> --- actions/setup/js/emit_outcome_spans.cjs | 5 ++++- actions/setup/js/evaluate_outcomes.cjs | 4 +--- pkg/cli/outcome_eval_test.go | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/actions/setup/js/emit_outcome_spans.cjs b/actions/setup/js/emit_outcome_spans.cjs index fd2d5b105bb..4b2ed4a7689 100644 --- a/actions/setup/js/emit_outcome_spans.cjs +++ b/actions/setup/js/emit_outcome_spans.cjs @@ -39,6 +39,9 @@ const { const AW_INFO_PATH = "/tmp/gh-aw/aw_info.json"; const EVALUATIONS_PATH = "/tmp/gh-aw/outcome-evaluations.jsonl"; const SUMMARY_PATH = "/tmp/gh-aw/outcome-summary.json"; +const OTLP_STATUS_UNSET = 0; +const OTLP_STATUS_OK = 1; +const OTLP_STATUS_ERROR = 2; /** * Read a JSONL file, returning an array of parsed objects. @@ -189,7 +192,7 @@ async function main() { if (zeroTouch) attributes.push(buildAttr("gh-aw.outcome.zero_touch", true)); // Map normalized outcome_status to OTLP status: accepted=OK, rejected=ERROR, all others=UNSET - const statusCode = outcomeStatus === "rejected" ? 2 : outcomeStatus === "accepted" ? 1 : 0; + const statusCode = outcomeStatus === "rejected" ? OTLP_STATUS_ERROR : outcomeStatus === "accepted" ? OTLP_STATUS_OK : OTLP_STATUS_UNSET; itemSpans.push( buildOTLPSpan({ diff --git a/actions/setup/js/evaluate_outcomes.cjs b/actions/setup/js/evaluate_outcomes.cjs index ac3d15015d6..d60bd1008f3 100644 --- a/actions/setup/js/evaluate_outcomes.cjs +++ b/actions/setup/js/evaluate_outcomes.cjs @@ -338,10 +338,8 @@ function evaluateItem(item, defaultRepo) { // Comments, labels, etc. — if URL exists, the item was created out.result = "unknown"; - out.outcome_status = "unknown"; - out.evidence_strength = "weak"; - out.signal = "target_exists_only"; out.detail = "object still exists"; + Object.assign(out, normalizeOutcome(out.result, out.detail)); return out; } diff --git a/pkg/cli/outcome_eval_test.go b/pkg/cli/outcome_eval_test.go index f9b86710844..2f1cf06ccc4 100644 --- a/pkg/cli/outcome_eval_test.go +++ b/pkg/cli/outcome_eval_test.go @@ -3,6 +3,7 @@ package cli import ( + "bytes" "encoding/json" "os" "path/filepath" @@ -290,7 +291,7 @@ func TestWriteOutcomeJSONLEmitsNormalizedFields(t *testing.T) { require.NoError(t, err) var entry map[string]any - require.NoError(t, json.Unmarshal(data[:len(data)-1], &entry)) + require.NoError(t, json.Unmarshal(bytes.TrimSpace(data), &entry)) assert.Equal(t, "unknown", entry["outcome_status"]) assert.Equal(t, "weak", entry["evidence_strength"]) assert.Equal(t, "target_exists_only", entry["signal"]) From 5e19085e0bc4b9f8e3ee64c2ec7527f01851e385 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 11:20:26 +0000 Subject: [PATCH 6/6] fix: resolve merge conflict with main and fix test argument mismatch Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- pkg/cli/outcome_eval_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cli/outcome_eval_test.go b/pkg/cli/outcome_eval_test.go index 6f8806f90b3..469b5c32083 100644 --- a/pkg/cli/outcome_eval_test.go +++ b/pkg/cli/outcome_eval_test.go @@ -259,7 +259,7 @@ func TestOutcomeSummaryExcludesExistsOnlyFromAccepted(t *testing.T) { }, } - s := ComputeOutcomeSummary(reports, 0) + s := ComputeOutcomeSummary(reports) assert.Equal(t, 1, s.Accepted) assert.Equal(t, 1, s.AcceptedStrong) assert.Equal(t, 0, s.AcceptedWeak)