diff --git a/CHANGELOG.md b/CHANGELOG.md index e05453c..4091939 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to `@testsprite/testsprite-cli` are documented here. The for ## [Unreleased] +### Added + +- **`test failure triage --project <project-id>`** — groups all failed tests in a project into root-cause clusters using existing M2.1 analysis fields (`failureKind`, `recommendedFixTarget.reference`, `rootCauseHypothesis`). Returns a representative test per cluster, affected test ids, confidence score, and fix priority — without downloading failure bundles. Supports `--type`, `--filter`, and `--max-concurrency`. Client-side Phase-0 triage until native backend clustering ships. + ## [0.1.2] - 2026-06-19 ### Added diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index 3746f46..2e1fac7 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -248,6 +248,31 @@ testsprite test failure summary test_xxxxxxxx --output json testsprite test failure summary test_xxxxxxxx --dry-run --output json ``` +#### `testsprite test failure triage --project <project-id>` + +When many tests fail in the same project, triage them into a few root-cause clusters before downloading bundles. The CLI lists all failed tests, fetches a lightweight `failure/summary` per test (no screenshots or video), and groups them client-side by the first rule that matches: + +1. shared `recommendedFixTarget.reference` +2. env-wide `failureKind` (`infra`, `network`, `network_timeout`, `routing_404`) +3. normalized `rootCauseHypothesis` prefix +4. singleton (one test per cluster when no shared signal exists) + +Each cluster includes a `representativeTestId`, `memberTestIds`, `confidence`, and `fixPriority` (lower = fix first). 
After triage, pull one bundle from the representative test: + +```bash +# Triage all failed tests in a project +testsprite test failure triage --project proj_xxxxxxxx --output json + +# Limit to backend tests whose name contains "checkout" +testsprite test failure triage --project proj_xxxxxxxx --type backend --filter checkout --output json + +# Then investigate the representative test of the cluster with the lowest fixPriority +testsprite test failure get <representativeTestId> --out ./.testsprite/failure + +# Learn the JSON shape offline +testsprite test failure triage --project proj_xxxxxxxx --dry-run --output json +``` + ### Write commands Require the `write:tests` scope. diff --git a/README.md b/README.md index 2d90012..b1c9b1a 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,7 @@ Prefer to configure each step by hand (or learn the surface offline with `--dry- | | `test result` | Latest result; `--history` lists a test's prior runs | | | `test failure get` | The agent entry point: one self-contained latest-failure bundle | | | `test failure summary` | One-screen triage card (no media download) | +| | `test failure triage` | Group all failed tests in a project into root-cause clusters (no bundle download) | | **Write** | `test create` / `test create-batch` | Create a test (or bulk-create from a plan file); `--produces` / `--needs` / `--category` wire BE dependency metadata | | | `test update` / `test delete` / `test delete-batch` | Edit metadata / soft-delete | | | `test code put` | Replace generated code (etag-guarded) | diff --git a/skills/testsprite-verify.skill.md b/skills/testsprite-verify.skill.md index 34a7dcc..f7746ba 100644 --- a/skills/testsprite-verify.skill.md +++ b/skills/testsprite-verify.skill.md @@ -398,7 +398,22 @@ testing agent's observation; don't auto-fix on the recommendation alone. If you genuinely can't tell: report `inconclusive` with the signal that triggered the call and ask. -## 5. On failure → download the artifact +## 5. 
On failure → triage first, then download one bundle + +When **multiple tests failed** in the same project (batch run, regression, or +`test list --status failed` shows more than one red row), triage before pulling +every bundle: + +```bash +testsprite test failure triage --project <project-id> --output json +``` + +Read the clusters: each has a `representativeTestId`, `memberTestIds`, +`confidence`, and `fixPriority` (lower = fix first). Investigate the +representative test from the cluster with the lowest `fixPriority` — not an arbitrary failed +test. After a fix, rerun that representative before rerunning the full suite. + +For a **single** failed test, skip triage and go straight to the artifact: ```bash testsprite test artifact get --out ./.testsprite/runs// diff --git a/src/commands/test.test.ts b/src/commands/test.test.ts index fd4e6ad..0e1ceb6 100644 --- a/src/commands/test.test.ts +++ b/src/commands/test.test.ts @@ -29,6 +29,7 @@ import { runDelete, runFailureGet, runFailureSummary, + runFailureTriage, runGet, runList, runPlanPut, @@ -147,7 +148,7 @@ describe('createTestCommand — surface', () => { expect(failure).toBeDefined(); // M2.1 piece 3 adds `summary`. `get` is the bundle entry point; // `summary` is the lightweight analysis-only triage card. 
- expect(failure!.commands.map(c => c.name()).sort()).toEqual(['get', 'summary']); + expect(failure!.commands.map(c => c.name()).sort()).toEqual(['get', 'summary', 'triage']); }); it('list exposes the documented filter and pagination flags (including --cursor alias)', () => { @@ -283,6 +284,16 @@ describe('createTestCommand — surface', () => { expect(help).toContain('--dry-run'); }); + it('test failure triage --help includes GLOBAL_OPTS_HINT and --project', () => { + const test = createTestCommand(); + const failure = test.commands.find(c => c.name() === 'failure')!; + const failureTriage = failure.commands.find(c => c.name() === 'triage')!; + const help = captureHelp(failureTriage); + expect(help).toContain('testsprite --help'); + expect(help).toContain('--project'); + expect(help).toContain('--max-concurrency'); + }); + it('M2 sweep: all remaining leaf subcommands include GLOBAL_OPTS_HINT', () => { // Covers list, get, create, create-batch, steps, result, update, delete, // code get, code put, plan put — the full M2 surface that the dogfood @@ -3168,6 +3179,268 @@ describe('runFailureSummary', () => { }); }); +// ---------- runFailureTriage ---------- + +describe('runFailureTriage', () => { + const FAILED_TEST_A = { + id: 'test_a', + projectId: 'proj_1', + name: 'Checkout submit', + type: 'frontend' as const, + createdFrom: 'cli' as const, + status: 'failed' as const, + createdAt: '2026-06-26T10:00:00.000Z', + updatedAt: '2026-06-26T12:00:00.000Z', + }; + const FAILED_TEST_B = { + ...FAILED_TEST_A, + id: 'test_b', + name: 'Checkout validation', + updatedAt: '2026-06-26T12:01:00.000Z', + }; + const FAILED_TEST_C = { + ...FAILED_TEST_A, + id: 'test_c', + name: 'Health check', + type: 'backend' as const, + updatedAt: '2026-06-26T12:02:00.000Z', + }; + + const SHARED_REF = 'src/components/CheckoutForm.tsx:412'; + + function summaryFor(testId: string, overrides: Record = {}) { + return { + testId, + status: 'failed' as const, + failureKind: 'assertion' as const, + 
snapshotId: `snap_${testId}`, + rootCauseHypothesis: 'Submit button is disabled.', + recommendedFixTarget: { + kind: 'code' as const, + reference: SHARED_REF, + rationale: 'Fix validation predicate.', + }, + ...overrides, + }; + } + + it('JSON mode clusters failed tests by shared fix target', async () => { + const { credentialsPath } = makeCreds(); + const seen: string[] = []; + const fetchImpl = makeFetch(url => { + seen.push(url); + if (url.includes('/tests?') && url.includes('status=failed')) { + return { body: { items: [FAILED_TEST_A, FAILED_TEST_B, FAILED_TEST_C], nextToken: null } }; + } + if (url.includes('/tests/test_a/failure/summary')) { + return { body: summaryFor('test_a') }; + } + if (url.includes('/tests/test_b/failure/summary')) { + return { body: summaryFor('test_b') }; + } + if (url.includes('/tests/test_c/failure/summary')) { + return { + body: summaryFor('test_c', { + failureKind: 'network_timeout', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }), + }; + } + throw new Error(`unexpected url: ${url}`); + }); + const out: string[] = []; + const got = await runFailureTriage( + { + profile: 'default', + output: 'json', + debug: false, + projectId: 'proj_1', + maxConcurrency: 5, + }, + { credentialsPath, fetchImpl, stdout: line => out.push(line) }, + ); + + expect(seen.some(u => u.includes('status=failed'))).toBe(true); + expect(got.summary.totalFailed).toBe(3); + expect(got.clusters).toHaveLength(2); + + const codeCluster = got.clusters.find(c => c.groupReason === 'fix_target'); + expect(codeCluster?.memberTestIds).toEqual(['test_a', 'test_b']); + // test_b is fresher (updatedAt) and both members have a hypothesis + expect(codeCluster?.representativeTestId).toBe('test_b'); + + const envCluster = got.clusters.find(c => c.groupReason === 'failure_kind'); + expect(envCluster?.memberTestIds).toEqual(['test_c']); + + expect(JSON.parse(out[0]!).clusters).toHaveLength(2); + }); + + it('text mode renders cluster summary lines', async () => { + 
const { credentialsPath } = makeCreds(); + const fetchImpl = makeFetch(url => { + if (url.includes('/tests?')) { + return { body: { items: [FAILED_TEST_A], nextToken: null } }; + } + return { body: summaryFor('test_a') }; + }); + const out: string[] = []; + await runFailureTriage( + { + profile: 'default', + output: 'text', + debug: false, + projectId: 'proj_1', + maxConcurrency: 5, + }, + { credentialsPath, fetchImpl, stdout: line => out.push(line) }, + ); + const block = out.join('\n'); + expect(block).toContain('projectId: proj_1'); + expect(block).toContain('representative: test_a'); + expect(block).toContain('Shared fix target:'); + }); + + it('dry-run emits canned clusters without network', async () => { + const out: string[] = []; + const got = await runFailureTriage( + { + profile: 'default', + output: 'json', + debug: false, + dryRun: true, + projectId: 'proj_dry', + maxConcurrency: 5, + }, + { stdout: line => out.push(line) }, + ); + expect(got.summary.clusterCount).toBe(2); + expect(got.clusters[0]?.groupReason).toBe('failure_kind'); + expect(JSON.parse(out[0]!).projectId).toBe('proj_dry'); + }); + + it('returns empty clusters when no failed tests match', async () => { + const { credentialsPath } = makeCreds(); + const fetchImpl = makeFetch(() => ({ body: { items: [], nextToken: null } })); + const out: string[] = []; + const got = await runFailureTriage( + { + profile: 'default', + output: 'json', + debug: false, + projectId: 'proj_empty', + maxConcurrency: 5, + }, + { credentialsPath, fetchImpl, stdout: line => out.push(line) }, + ); + expect(got.clusters).toEqual([]); + expect(got.summary.totalFailed).toBe(0); + expect(JSON.parse(out[0]!).clusters).toEqual([]); + }); + + it('skips tests whose failure summary returns NOT_FOUND', async () => { + const { credentialsPath } = makeCreds(); + const stderrLines: string[] = []; + const fetchImpl = makeFetch(url => { + if (url.includes('/tests?')) { + return { body: { items: [FAILED_TEST_A, FAILED_TEST_B], 
nextToken: null } }; + } + if (url.includes('/tests/test_a/failure/summary')) { + return { body: summaryFor('test_a') }; + } + return { + status: 404, + body: { + error: { + code: 'NOT_FOUND', + message: 'Test has no failing run.', + nextAction: 'No failing run.', + requestId: 'req_test', + details: { resource: 'test', id: 'test_b', reason: 'no_failing_run' }, + }, + }, + }; + }); + const got = await runFailureTriage( + { + profile: 'default', + output: 'json', + debug: false, + projectId: 'proj_1', + maxConcurrency: 5, + }, + { + credentialsPath, + fetchImpl, + stdout: () => undefined, + stderr: line => stderrLines.push(line), + }, + ); + expect(got.summary.totalFailed).toBe(1); + expect(got.summary.skipped).toBe(1); + expect(got.skipped?.[0]).toEqual({ testId: 'test_b', reason: 'no_failing_run' }); + expect(stderrLines.some(l => l.includes('skipped'))).toBe(true); + }); + + it('rejects missing projectId with VALIDATION_ERROR (exit 5)', async () => { + await expect( + runFailureTriage( + { + profile: 'default', + output: 'json', + debug: false, + projectId: '', + maxConcurrency: 5, + }, + { stdout: () => undefined }, + ), + ).rejects.toMatchObject({ code: 'VALIDATION_ERROR', exitCode: 5 }); + }); + + it('rejects invalid --max-concurrency with VALIDATION_ERROR (exit 5)', async () => { + await expect( + runFailureTriage( + { + profile: 'default', + output: 'json', + debug: false, + projectId: 'proj_1', + maxConcurrency: 0, + }, + { stdout: () => undefined }, + ), + ).rejects.toMatchObject({ code: 'VALIDATION_ERROR', exitCode: 5 }); + }); + + it('--filter keeps only tests whose name matches (case-insensitive)', async () => { + const { credentialsPath } = makeCreds(); + const fetchImpl = makeFetch(url => { + if (url.includes('/tests?')) { + return { + body: { + items: [FAILED_TEST_A, { ...FAILED_TEST_B, name: 'Profile update flow' }], + nextToken: null, + }, + }; + } + return { body: summaryFor('test_a') }; + }); + const got = await runFailureTriage( + { + profile: 
'default', + output: 'json', + debug: false, + projectId: 'proj_1', + nameFilter: 'checkout', + maxConcurrency: 5, + }, + { credentialsPath, fetchImpl, stdout: () => undefined }, + ); + expect(got.summary.totalFailed).toBe(1); + expect(got.clusters[0]?.memberTestIds).toEqual(['test_a']); + }); +}); + // ---------- §6.7 runFailureGet ---------- const FAILED_STEPS: CliTestStep[] = [ diff --git a/src/commands/test.ts b/src/commands/test.ts index a86b929..3559eb7 100644 --- a/src/commands/test.ts +++ b/src/commands/test.ts @@ -65,6 +65,12 @@ import { createTicker } from '../lib/ticker.js'; import { RateThrottle } from '../lib/rate-throttle.js'; import { resolvePortalBase, resolvePortalUrl } from '../lib/facade.js'; import { loadConfig } from '../lib/config.js'; +import { + buildFailureClusters, + renderFailureTriageText, + type FailureTriageInput, + type FailureTriageResult, +} from '../lib/failure-triage.js'; /** * `details` debug block per the CLI OpenAPI `Test` schema @@ -1782,6 +1788,9 @@ export const DEFAULT_BATCH_RUN_CONCURRENCY = 50; /** Hard upper bound for --max-concurrency. Values above this are rejected with exit 5 (VALIDATION_ERROR). */ export const MAX_BATCH_CONCURRENCY = 100; +/** Default fan-out when fetching per-test failure summaries during triage. */ +export const DEFAULT_TRIAGE_CONCURRENCY = 5; + /** Client-side run-trigger throttle: 50 triggers per 60-second rolling window per key (sits just under the server's 60/min/key cap). */ export const BATCH_RUN_RATE_LIMIT = 50; /** Rolling window duration (ms) for the client-side trigger rate throttle. 
*/ @@ -3983,6 +3992,217 @@ interface FailureSummaryOptions extends CommonOptions { testId: string; } +interface FailureTriageOptions extends CommonOptions { + projectId: string; + type?: 'frontend' | 'backend'; + nameFilter?: string; + maxConcurrency: number; +} + +/** + * `test failure triage --project ` — groups failed tests in a + * project into root-cause clusters using existing M2.1 analysis + * fields (`failureKind`, `recommendedFixTarget.reference`, + * `rootCauseHypothesis`). Lightweight: one `failure/summary` call + * per failed test, no bundle downloads. + * + * Client-side Phase-0 triage — deterministic heuristics only. When + * the backend ships native clustering, this command becomes a thin + * wrapper over the new read API. + */ +export async function runFailureTriage( + opts: FailureTriageOptions, + deps: TestDeps = {}, +): Promise { + requireProjectId(opts.projectId); + + if ( + !Number.isInteger(opts.maxConcurrency) || + opts.maxConcurrency < 1 || + opts.maxConcurrency > MAX_BATCH_CONCURRENCY + ) { + throw localValidationError('max-concurrency', 'must be an integer between 1 and 100'); + } + + const out = makeOutput(opts.output, deps); + const stderrFn = deps.stderr ?? 
((line: string) => process.stderr.write(`${line}\n`)); + + if (opts.dryRun) { + const dryRunResult: FailureTriageResult = { + projectId: opts.projectId, + clusters: [ + { + clusterId: 'cluster_kind_network_timeout', + label: 'Environment issue (network_timeout)', + groupKey: 'kind:network_timeout', + groupReason: 'failure_kind', + failureKind: 'network_timeout', + representativeTestId: 'test_dryrun_a', + memberTestIds: ['test_dryrun_a', 'test_dryrun_b'], + members: [ + { + testId: 'test_dryrun_a', + testName: 'Dry-run checkout flow', + testType: 'frontend', + updatedAt: '2026-06-26T12:00:00.000Z', + status: 'failed', + failureKind: 'network_timeout', + snapshotId: 'snap_dryrun_a', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }, + { + testId: 'test_dryrun_b', + testName: 'Dry-run profile update', + testType: 'frontend', + updatedAt: '2026-06-26T12:01:00.000Z', + status: 'failed', + failureKind: 'network_timeout', + snapshotId: 'snap_dryrun_b', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }, + ], + canonicalRootCause: null, + confidence: 0.88, + fixPriority: 1, + }, + { + clusterId: 'cluster_ref_src_components_checkoutform_tsx_412', + label: 'Shared fix target: src/components/CheckoutForm.tsx:412', + groupKey: 'ref:src/components/CheckoutForm.tsx:412', + groupReason: 'fix_target', + failureKind: 'assertion', + representativeTestId: 'test_dryrun_c', + memberTestIds: ['test_dryrun_c'], + members: [ + { + testId: 'test_dryrun_c', + testName: 'Dry-run submit checkout', + testType: 'frontend', + updatedAt: '2026-06-26T12:02:00.000Z', + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap_dryrun_c', + rootCauseHypothesis: + 'Submit button is disabled because the credit-card field is empty.', + recommendedFixTarget: { + kind: 'code', + reference: 'src/components/CheckoutForm.tsx:412', + rationale: 'Disabled state originates from `isFormValid()`.', + }, + }, + ], + canonicalRootCause: 'Submit button is disabled because the 
credit-card field is empty.', + confidence: 0.7, + fixPriority: 6, // single-member fix_target cluster: tier 3 needs >= 2 members, so the live heuristic yields 6 + }, + ], + summary: { totalFailed: 3, clusterCount: 2, skipped: 0 }, + }; + out.print(dryRunResult, data => renderFailureTriageText(data as FailureTriageResult)); + return dryRunResult; + } + + const client = makeClient(opts, deps); + + const failedPage = await paginate( + async ({ pageSize, cursor }) => + client.get>('/tests', { + query: { + projectId: opts.projectId, + status: 'failed', + type: opts.type, + pageSize, + cursor, + }, + }), + {}, + ); + + // Re-check status client-side, then apply the optional case-insensitive name filter. + let failedTests = failedPage.items.filter(t => t.status === 'failed'); + if (opts.nameFilter !== undefined && opts.nameFilter !== '') { + const needle = opts.nameFilter.toLowerCase(); + failedTests = failedTests.filter(t => t.name.toLowerCase().includes(needle)); + } + + if (failedTests.length === 0) { + const empty: FailureTriageResult = { + projectId: opts.projectId, + clusters: [], + summary: { totalFailed: 0, clusterCount: 0, skipped: 0 }, + }; + out.print(empty, data => renderFailureTriageText(data as FailureTriageResult)); + return empty; + } + + stderrFn( + `Fetching failure summaries for ${failedTests.length} failed test${failedTests.length !== 1 ? 
's' : ''}…`, + ); + + // Outcomes are stored by list position so the triage inputs (and the skipped + // list) keep the order of the failed-test listing, whichever request finishes first. + const inputSlots: Array<FailureTriageInput | undefined> = []; + const skippedSlots: Array<{ testId: string; reason: string } | undefined> = []; + const concurrencyLimit = opts.maxConcurrency; + let nextIdx = 0; + let inFlight = 0; + // Set by the first fatal (non-NOT_FOUND) error so no further requests are started. + let aborted = false; + + await new Promise<void>((resolve, reject) => { + // Bookkeeping after a request settles: refill the pool, or resolve once drained. + function settleOne(): void { + inFlight--; + if (aborted) return; + startNext(); + if (inFlight === 0 && nextIdx >= failedTests.length) resolve(); + } + function startNext(): void { + while (!aborted && inFlight < concurrencyLimit && nextIdx < failedTests.length) { + const idx = nextIdx++; + const test = failedTests[idx]!; + inFlight++; + client + .get(`/tests/${encodeURIComponent(test.id)}/failure/summary`) + .then(summary => { + inputSlots[idx] = { + testId: test.id, + testName: test.name, + testType: test.type, + updatedAt: test.updatedAt, + summary: { + status: summary.status, + failureKind: summary.failureKind, + snapshotId: summary.snapshotId, + rootCauseHypothesis: summary.rootCauseHypothesis, + recommendedFixTarget: summary.recommendedFixTarget, + }, + }; + settleOne(); + }) + .catch(err => { + if (err instanceof ApiError && err.code === 'NOT_FOUND') { + // Listed as failed but no failure summary exists: record it and keep going. + skippedSlots[idx] = { testId: test.id, reason: 'no_failing_run' }; + if (opts.verbose) { + stderrFn(`[triage] skipped ${test.id} — no failing run (race or stale list row)`); + } + settleOne(); + } else { + // Any other error fails the whole command; stop scheduling new fetches. + aborted = true; + reject(err); + } + }); + } + } + startNext(); + // Nothing was scheduled (empty list): resolve immediately instead of hanging. + if (inFlight === 0) resolve(); + }); + + const triageInputs = inputSlots.filter( + (slot): slot is FailureTriageInput => slot !== undefined, + ); + const skipped = skippedSlots.filter( + (slot): slot is { testId: string; reason: string } => slot !== undefined, + ); + + const result = buildFailureClusters(opts.projectId, triageInputs); + if (skipped.length > 0) { + result.summary.skipped = skipped.length; + result.skipped = skipped; + stderrFn( + `[advisory] ${skipped.length} test${skipped.length !== 1 ? 's' : ''} skipped — listed as failed but had no failure summary (stale status or in-flight run).`, + ); + } + + out.print(result, data => renderFailureTriageText(data as FailureTriageResult)); + return result; +} + /** * `test failure summary ` — M2.1 piece 3. 
* @@ -8644,5 +8864,55 @@ function createTestFailureCommand(deps: TestDeps): Command { .action(async (testId: string, _cmdOpts, command: Command) => { await runFailureSummary({ ...resolveCommonOptions(command), testId }, deps); }); + failure + .command('triage') + .description( + 'Group all failed tests in a project into root-cause clusters (lightweight summary fan-out — no bundle downloads)', + ) + .requiredOption('--project ', 'project id (returned by `testsprite project list`)') + .option('--type ', 'filter by test type (frontend|backend)') + .option( + '--filter ', + 'only include tests whose name contains this substring (case-insensitive)', + ) + .option( + '--max-concurrency ', + `max parallel failure-summary fetches (1–${MAX_BATCH_CONCURRENCY}, default ${DEFAULT_TRIAGE_CONCURRENCY})`, + String(DEFAULT_TRIAGE_CONCURRENCY), + ) + .addHelpText( + 'after', + [ + 'Clusters are built client-side from existing M2.1 analysis fields:', + ' 1. shared recommendedFixTarget.reference', + ' 2. env-wide failureKind (infra, network, network_timeout, routing_404)', + ' 3. normalized rootCauseHypothesis prefix', + ' 4. singleton (one test per cluster)', + '', + 'After a batch run with many failures, triage first — then pull one bundle:', + ' testsprite test failure get --out ./.testsprite/failure', + '', + GLOBAL_OPTS_HINT, + ].join('\n'), + ) + .action( + async ( + cmdOpts: { project: string; type?: string; filter?: string; maxConcurrency?: string }, + command: Command, + ) => { + await runFailureTriage( + { + ...resolveCommonOptions(command), + projectId: cmdOpts.project, + type: parseEnumFlag(cmdOpts.type, 'type', TEST_TYPES), + nameFilter: cmdOpts.filter, + maxConcurrency: + parseNumericFlag(cmdOpts.maxConcurrency, 'max-concurrency') ?? 
+ DEFAULT_TRIAGE_CONCURRENCY, + }, + deps, + ); + }, + ); return failure; } diff --git a/src/lib/failure-triage.test.ts b/src/lib/failure-triage.test.ts new file mode 100644 index 0000000..3ada6c3 --- /dev/null +++ b/src/lib/failure-triage.test.ts @@ -0,0 +1,212 @@ +import { describe, expect, it } from 'vitest'; +import { + buildFailureClusters, + computeClusterConfidence, + computeFixPriority, + computeGroupKey, + normalizeHypothesis, + pickRepresentativeTestId, + type FailureTriageInput, + type FailureTriageMember, +} from './failure-triage.js'; + +function makeInput( + overrides: Partial & { testId: string }, +): FailureTriageInput { + return { + testName: overrides.testName ?? `Test ${overrides.testId}`, + testType: overrides.testType ?? 'frontend', + updatedAt: overrides.updatedAt ?? '2026-06-26T12:00:00.000Z', + summary: overrides.summary ?? { + status: 'failed', + failureKind: 'assertion', + snapshotId: `snap_${overrides.testId}`, + rootCauseHypothesis: 'Submit button is disabled.', + recommendedFixTarget: null, + }, + ...overrides, + }; +} + +describe('normalizeHypothesis', () => { + it('collapses whitespace and lowercases', () => { + expect(normalizeHypothesis(' Auth Token Expired ')).toBe('auth token expired'); + }); + + it('returns null for empty input', () => { + expect(normalizeHypothesis(null)).toBeNull(); + expect(normalizeHypothesis(' ')).toBeNull(); + }); +}); + +describe('computeGroupKey', () => { + it('groups by fix target reference first', () => { + const key = computeGroupKey( + makeInput({ + testId: 't1', + summary: { + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap', + rootCauseHypothesis: 'Different text', + recommendedFixTarget: { + kind: 'code', + reference: 'src/auth.ts:42', + rationale: 'Fix auth', + }, + }, + }), + ); + expect(key).toEqual({ groupKey: 'ref:src/auth.ts:42', groupReason: 'fix_target' }); + }); + + it('groups env-wide failure kinds', () => { + const key = computeGroupKey( + makeInput({ + testId: 't2', + 
summary: { + status: 'failed', + failureKind: 'network_timeout', + snapshotId: 'snap', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }, + }), + ); + expect(key).toEqual({ groupKey: 'kind:network_timeout', groupReason: 'failure_kind' }); + }); + + it('groups by normalized hypothesis when no ref or env kind', () => { + const key = computeGroupKey( + makeInput({ + testId: 't3', + summary: { + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap', + rootCauseHypothesis: 'Login form validation failed.', + recommendedFixTarget: null, + }, + }), + ); + expect(key.groupReason).toBe('hypothesis'); + expect(key.groupKey).toBe('hyp:login form validation failed.'); + }); + + it('falls back to singleton when no grouping signal', () => { + const key = computeGroupKey( + makeInput({ + testId: 't4', + summary: { + status: 'failed', + failureKind: 'unknown', + snapshotId: 'snap', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }, + }), + ); + expect(key).toEqual({ groupKey: 'singleton:t4', groupReason: 'singleton' }); + }); +}); + +describe('pickRepresentativeTestId', () => { + const members: FailureTriageMember[] = [ + { + testId: 't_old', + testName: 'Old', + testType: 'backend', + updatedAt: '2026-06-25T00:00:00.000Z', + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap1', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }, + { + testId: 't_rich', + testName: 'Rich', + testType: 'backend', + updatedAt: '2026-06-24T00:00:00.000Z', + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap2', + rootCauseHypothesis: 'Detailed root cause hypothesis.', + recommendedFixTarget: null, + }, + ]; + + it('prefers member with root-cause hypothesis', () => { + expect(pickRepresentativeTestId(members)).toBe('t_rich'); + }); +}); + +describe('computeClusterConfidence', () => { + it('scores multi-member fix_target clusters highest', () => { + expect(computeClusterConfidence('fix_target', 3)).toBe(0.92); + 
expect(computeClusterConfidence('singleton', 1)).toBe(0.4); + }); +}); + +describe('computeFixPriority', () => { + it('prioritizes infra failures first', () => { + expect(computeFixPriority('failure_kind', 'infra', 5)).toBe(1); + expect(computeFixPriority('singleton', 'assertion', 1)).toBe(10); + }); +}); + +describe('buildFailureClusters', () => { + it('merges tests sharing the same fix target into one cluster', () => { + const sharedRef = 'src/components/CheckoutForm.tsx:412'; + const result = buildFailureClusters('proj_abc', [ + makeInput({ + testId: 'test_a', + summary: { + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap_a', + rootCauseHypothesis: 'Button disabled.', + recommendedFixTarget: { kind: 'code', reference: sharedRef, rationale: 'Fix form' }, + }, + }), + makeInput({ + testId: 'test_b', + summary: { + status: 'failed', + failureKind: 'assertion', + snapshotId: 'snap_b', + rootCauseHypothesis: 'Cannot submit checkout.', + recommendedFixTarget: { kind: 'code', reference: sharedRef, rationale: 'Same file' }, + }, + }), + makeInput({ + testId: 'test_c', + summary: { + status: 'failed', + failureKind: 'network_timeout', + snapshotId: 'snap_c', + rootCauseHypothesis: null, + recommendedFixTarget: null, + }, + }), + ]); + + expect(result.summary).toEqual({ totalFailed: 3, clusterCount: 2, skipped: 0 }); + expect(result.clusters).toHaveLength(2); + + const envCluster = result.clusters.find(c => c.groupReason === 'failure_kind'); + expect(envCluster?.memberTestIds).toEqual(['test_c']); + expect(envCluster?.fixPriority).toBe(1); + + const codeCluster = result.clusters.find(c => c.groupReason === 'fix_target'); + expect(codeCluster?.memberTestIds).toEqual(['test_a', 'test_b']); + expect(codeCluster?.representativeTestId).toMatch(/^test_/); + expect(codeCluster?.confidence).toBe(0.92); + }); + + it('returns empty clusters when no inputs', () => { + const result = buildFailureClusters('proj_empty', []); + expect(result.clusters).toEqual([]); + 
expect(result.summary.totalFailed).toBe(0); + }); +}); diff --git a/src/lib/failure-triage.ts b/src/lib/failure-triage.ts new file mode 100644 index 0000000..961b9b1 --- /dev/null +++ b/src/lib/failure-triage.ts @@ -0,0 +1,343 @@ +/** + * Client-side failure triage — groups per-test failure summaries into + * root-cause clusters using deterministic heuristics over existing + * M2.1 analysis fields. No LLM calls; the backend remains the source + * of truth for per-test hypotheses. + * + * @see `runFailureTriage` in `src/commands/test.ts` + */ + +/** Failure kinds that usually indicate a shared environment outage. */ +export const ENV_WIDE_FAILURE_KINDS: ReadonlySet = new Set([ + 'infra', + 'network', + 'network_timeout', + 'routing_404', +]); + +export type FailureTriageGroupReason = 'fix_target' | 'failure_kind' | 'hypothesis' | 'singleton'; + +export interface FailureTriageMember { + testId: string; + testName: string; + testType: 'frontend' | 'backend'; + updatedAt: string; + status: string; + failureKind: string | null; + snapshotId: string; + rootCauseHypothesis: string | null; + recommendedFixTarget: { + kind: string; + reference: string | null; + rationale: string | null; + } | null; +} + +export interface FailureTriageCluster { + clusterId: string; + label: string; + groupKey: string; + groupReason: FailureTriageGroupReason; + failureKind: string | null; + representativeTestId: string; + memberTestIds: string[]; + members: FailureTriageMember[]; + canonicalRootCause: string | null; + confidence: number; + fixPriority: number; +} + +export interface FailureTriageResult { + projectId: string; + clusters: FailureTriageCluster[]; + summary: { + totalFailed: number; + clusterCount: number; + skipped: number; + }; + skipped?: Array<{ testId: string; reason: string }>; +} + +export interface FailureTriageInput { + testId: string; + testName: string; + testType: 'frontend' | 'backend'; + updatedAt: string; + summary: { + status: string; + failureKind: string | null; 
+ snapshotId: string; + rootCauseHypothesis: string | null; + recommendedFixTarget: { + kind: string; + reference: string | null; + rationale: string | null; + } | null; + }; +} + +export interface GroupKeyResult { + groupKey: string; + groupReason: FailureTriageGroupReason; +} + +/** + * Normalize a root-cause hypothesis for coarse grouping: trims, collapses + * runs of whitespace to one space, lowercases, and keeps at most the first + * 100 characters. Punctuation is preserved, so hypotheses that differ only + * in punctuation still produce different keys. + * + * @returns the normalized text, or null when the input is null or blank + */ +export function normalizeHypothesis(hypothesis: string | null): string | null { + if (hypothesis === null || hypothesis.trim() === '') return null; + const collapsed = hypothesis.trim().replace(/\s+/g, ' ').toLowerCase(); + return collapsed.length > 100 ? collapsed.slice(0, 100) : collapsed; +} + +/** + * Derive a deterministic group key for one failed test's summary. + * Priority: shared fix target → env-wide failure kind → hypothesis prefix → singleton. + */ +export function computeGroupKey(input: FailureTriageInput): GroupKeyResult { + const ref = input.summary.recommendedFixTarget?.reference?.trim(); + if (ref) { + return { groupKey: `ref:${ref}`, groupReason: 'fix_target' }; + } + + const kind = input.summary.failureKind; + if (kind !== null && ENV_WIDE_FAILURE_KINDS.has(kind)) { + return { groupKey: `kind:${kind}`, groupReason: 'failure_kind' }; + } + + const hyp = normalizeHypothesis(input.summary.rootCauseHypothesis); + if (hyp) { + return { groupKey: `hyp:${hyp}`, groupReason: 'hypothesis' }; + } + + return { groupKey: `singleton:${input.testId}`, groupReason: 'singleton' }; +} + +/** + * Pick the representative test for a cluster. Prefers a member with a + * non-blank root-cause hypothesis, then the most recently updated, then the + * smallest testId by code-unit order so the result does not depend on locale. + * + * @param members cluster members; expected to contain at least one entry + * @returns the testId of the chosen member + */ +export function pickRepresentativeTestId(members: FailureTriageMember[]): string { + // Blank hypotheses count as absent, matching normalizeHypothesis. + const hasHypothesis = (m: FailureTriageMember): number => + normalizeHypothesis(m.rootCauseHypothesis) !== null ? 1 : 0; + // An unparseable updatedAt yields NaN, which would make the comparator + // inconsistent; rank such members as the oldest instead. + const timeOf = (m: FailureTriageMember): number => { + const t = new Date(m.updatedAt).getTime(); + return Number.isNaN(t) ? Number.NEGATIVE_INFINITY : t; + }; + const sorted = [...members].sort((a, b) => { + const hypDiff = hasHypothesis(b) - hasHypothesis(a); + if (hypDiff !== 0) return hypDiff; + + const aTime = timeOf(a); + const bTime = timeOf(b); + // Compare instead of subtracting: -Infinity minus -Infinity would be NaN. + if (bTime !== aTime) return bTime > aTime ? 1 : -1; + + if (a.testId === b.testId) return 0; + return a.testId < b.testId ? -1 : 1; + }); + return sorted[0]!.testId; +} + +/** + * Confidence score for a cluster based on grouping signal strength and size. + * Returns 0 when memberCount is below 1; clusters with two or more members + * score higher than single-member clusters of the same group reason. + */ +export function computeClusterConfidence( + groupReason: FailureTriageGroupReason, + memberCount: number, +): number { + if (memberCount < 1) return 0; + const multi = memberCount >= 2; + + switch (groupReason) { + case 'fix_target': + return multi ? 0.92 : 0.7; + case 'failure_kind': + return multi ? 0.88 : 0.65; + case 'hypothesis': + return multi ? 0.78 : 0.55; + case 'singleton': + return 0.4; + default: + return 0.4; + } +} + +/** + * Lower fixPriority means "fix this cluster first". + * + * Tiers, checked in this order: + * 1 — failureKind is infra, network, or network_timeout + * 2 — failureKind is routing_404 + * 3 — fix_target cluster with two or more members + * 4 — failure_kind cluster with two or more members + * 5 — hypothesis cluster with two or more members + * 10 — singleton + * 6 — anything else (single-member fix_target, failure_kind, or hypothesis) + */ +export function computeFixPriority( + groupReason: FailureTriageGroupReason, + failureKind: string | null, + memberCount: number, +): number { + if (failureKind === 'infra' || failureKind === 'network' || failureKind === 'network_timeout') { + return 1; + } + if (failureKind === 'routing_404') { + return 2; + } + if (groupReason === 'fix_target' && memberCount >= 2) { + return 3; + } + if (groupReason === 'failure_kind' && memberCount >= 2) { + return 4; + } + if (groupReason === 'hypothesis' && memberCount >= 2) { + return 5; + } + if (groupReason === 'singleton') { + return 10; + } + return 6; +} + +function buildClusterLabel( + groupReason: FailureTriageGroupReason, + members: FailureTriageMember[], + failureKind: string | null, +): string { + const rep = members.find(m => m.recommendedFixTarget?.reference) ?? 
members[0]!; + const ref = rep.recommendedFixTarget?.reference; + + if (groupReason === 'fix_target' && ref) { + return `Shared fix target: ${ref}`; + } + if (groupReason === 'failure_kind' && failureKind !== null) { + return `Environment issue (${failureKind})`; + } + if (groupReason === 'hypothesis') { + const hyp = rep.rootCauseHypothesis; + if (hyp) { + return hyp.length > 80 ? `${hyp.slice(0, 77)}…` : hyp; + } + } + return `Independent failure: ${rep.testName}`; +} + +function slugifyClusterId(groupKey: string): string { + const slug = groupKey + .toLowerCase() + .replace(/[^a-z0-9]+/g, '_') + .replace(/^_|_$/g, '') + .slice(0, 48); + return slug.length > 0 ? slug : 'unknown'; +} + +function toMember(input: FailureTriageInput): FailureTriageMember { + return { + testId: input.testId, + testName: input.testName, + testType: input.testType, + updatedAt: input.updatedAt, + status: input.summary.status, + failureKind: input.summary.failureKind, + snapshotId: input.summary.snapshotId, + rootCauseHypothesis: input.summary.rootCauseHypothesis, + recommendedFixTarget: input.summary.recommendedFixTarget, + }; +} + +/** + * Group triage inputs into clusters. Deterministic: same inputs always + * produce the same cluster ids and representative tests. 
+ */ +export function buildFailureClusters( + projectId: string, + inputs: FailureTriageInput[], +): FailureTriageResult { + const groups = new Map< + string, + { reason: FailureTriageGroupReason; members: FailureTriageMember[] } + >(); + + for (const input of inputs) { + const { groupKey, groupReason } = computeGroupKey(input); + const member = toMember(input); + const existing = groups.get(groupKey); + if (existing) { + existing.members.push(member); + } else { + groups.set(groupKey, { reason: groupReason, members: [member] }); + } + } + + const clusters: FailureTriageCluster[] = []; + for (const [groupKey, { reason, members }] of groups) { + const representativeTestId = pickRepresentativeTestId(members); + const rep = members.find(m => m.testId === representativeTestId) ?? members[0]!; + const memberCount = members.length; + const failureKind = rep.failureKind; + + clusters.push({ + clusterId: `cluster_${slugifyClusterId(groupKey)}`, + label: buildClusterLabel(reason, members, failureKind), + groupKey, + groupReason: reason, + failureKind, + representativeTestId, + memberTestIds: members.map(m => m.testId).sort(), + members: [...members].sort((a, b) => a.testId.localeCompare(b.testId)), + canonicalRootCause: rep.rootCauseHypothesis, + confidence: computeClusterConfidence(reason, memberCount), + fixPriority: computeFixPriority(reason, failureKind, memberCount), + }); + } + + clusters.sort((a, b) => { + if (a.fixPriority !== b.fixPriority) return a.fixPriority - b.fixPriority; + if (b.memberTestIds.length !== a.memberTestIds.length) { + return b.memberTestIds.length - a.memberTestIds.length; + } + return a.clusterId.localeCompare(b.clusterId); + }); + + return { + projectId, + clusters, + summary: { + totalFailed: inputs.length, + clusterCount: clusters.length, + skipped: 0, + }, + }; +} + +/** + * Text renderer for `test failure triage` output. 
+ */ +export function renderFailureTriageText(result: FailureTriageResult): string { + const lines: string[] = []; + lines.push(`projectId: ${result.projectId}`); + lines.push( + `summary: ${result.summary.totalFailed} failed test(s) → ${result.summary.clusterCount} cluster(s)`, + ); + if (result.summary.skipped > 0) { + lines.push(`skipped: ${result.summary.skipped} test(s) could not be summarized`); + } + lines.push(''); + + if (result.clusters.length === 0) { + lines.push('No failed tests found — nothing to triage.'); + return lines.join('\n'); + } + + for (const [idx, cluster] of result.clusters.entries()) { + lines.push( + `[${idx + 1}] ${cluster.label} (confidence ${(cluster.confidence * 100).toFixed(0)}%, fix priority ${cluster.fixPriority})`, + ); + lines.push(` clusterId: ${cluster.clusterId}`); + lines.push(` groupReason: ${cluster.groupReason}`); + if (cluster.failureKind !== null) lines.push(` failureKind: ${cluster.failureKind}`); + lines.push(` representative: ${cluster.representativeTestId}`); + lines.push( + ` affected (${cluster.memberTestIds.length}): ${cluster.memberTestIds.join(', ')}`, + ); + if (cluster.canonicalRootCause !== null) { + const hyp = + cluster.canonicalRootCause.length > 120 + ? 
`${cluster.canonicalRootCause.slice(0, 117)}…` + : cluster.canonicalRootCause; + lines.push(` rootCause: ${hyp}`); + } + lines.push(''); + } + + return lines.join('\n').trimEnd(); +} diff --git a/test/__snapshots__/help.snapshot.test.ts.snap b/test/__snapshots__/help.snapshot.test.ts.snap index b4ea59b..911d11b 100644 --- a/test/__snapshots__/help.snapshot.test.ts.snap +++ b/test/__snapshots__/help.snapshot.test.ts.snap @@ -334,6 +334,35 @@ Global options (--dry-run, --output, --profile, --endpoint-url, --verbose, --deb " `; +exports[`--help snapshots > test failure triage 1`] = ` +"Usage: testsprite test failure triage [options] + +Group all failed tests in a project into root-cause clusters (lightweight +summary fan-out — no bundle downloads) + +Options: + --project project id (returned by \`testsprite project list\`) + --type filter by test type (frontend|backend) + --filter only include tests whose name contains this substring + (case-insensitive) + --max-concurrency max parallel failure-summary fetches (1–100, default + 5) (default: "5") + -h, --help display help for command +Clusters are built client-side from existing M2.1 analysis fields: + 1. shared recommendedFixTarget.reference + 2. env-wide failureKind (infra, network, network_timeout, routing_404) + 3. normalized rootCauseHypothesis prefix + 4. 
singleton (one test per cluster) + +After a batch run with many failures, triage first — then pull one bundle: + testsprite test failure get --out ./.testsprite/failure + + +Global options (--dry-run, --output, --profile, --endpoint-url, --verbose, --debug): + testsprite --help +" +`; + exports[`--help snapshots > test get 1`] = ` "Usage: testsprite test get [options] diff --git a/test/help.snapshot.test.ts b/test/help.snapshot.test.ts index 2448c91..f0ea318 100644 --- a/test/help.snapshot.test.ts +++ b/test/help.snapshot.test.ts @@ -38,6 +38,7 @@ const cases: Array<[string, string[]]> = [ ['test steps', ['test', 'steps', '--help']], ['test result', ['test', 'result', '--help']], ['test failure get', ['test', 'failure', 'get', '--help']], + ['test failure triage', ['test', 'failure', 'triage', '--help']], ['test rerun', ['test', 'rerun', '--help']], // R5: regression guard for commands that gained new flag wording ['test create-batch', ['test', 'create-batch', '--help']],