From 7c101c605887c82a7a5498a72d0adc36be2c36f7 Mon Sep 17 00:00:00 2001 From: saagpatel Date: Sat, 30 May 2026 01:27:36 -0700 Subject: [PATCH] feat: presence-claim audit workflow + README/lead-paragraph scoring fixes Adds an external dynamic-workflow audit that independently re-checks the portfolio-truth snapshot's six presence claims against on-disk ground truth, and uses it to find and verify two scoring fixes in the auditor itself. Audit (read-only): - src/run_instructions_audit.py: deterministic pre-step (stratified pilot selection, evidence prep, live tool_today recompute, git drift) + bucket logic - scripts/presence-claims-audit.workflow.js: Workflow that fans out one Haiku verifier per repo (judging all 6 claims), deterministic tally, Sonnet synthesis - scripts/run-instructions-audit.workflow.js: original single-claim version, superseded by presence-claims (kept as the simpler example) Auditor fixes in analyze_project_context, both verified by the audit: - README fallback: presence claims now consider the top-level README, not only the primary context file (wires the previously-dormant readme_text param) - lead-paragraph fallback: a project summary is detected as the prose under the H1 title, not only under an "## Overview" section Verified deterministically on a 16-repo pilot: overall agreement 79% -> 90%, project_summary 75% -> 100%, stack 75% -> 100%. Adds direct unit coverage for analyze_project_context (previously untested). Full suite: 2091 passed. Canonical portfolio-truth-latest.json intentionally NOT regenerated (the fixes shift context_quality portfolio-wide; that actualization is a separate step). 
--- ...29-run-instructions-external-audit-plan.md | 763 ++++++++++++++++++ ...6-05-29-run-instructions-external-audit.md | 386 +++++++++ scripts/presence-claims-audit.workflow.js | 219 +++++ scripts/run-instructions-audit.workflow.js | 141 ++++ src/portfolio_context_contract.py | 50 +- src/run_instructions_audit.py | 156 ++++ tests/test_portfolio_context_contract.py | 116 +++ tests/test_run_instructions_audit.py | 224 +++++ 8 files changed, 2052 insertions(+), 3 deletions(-) create mode 100644 docs/plans/2026-05-29-run-instructions-external-audit-plan.md create mode 100644 docs/plans/2026-05-29-run-instructions-external-audit.md create mode 100644 scripts/presence-claims-audit.workflow.js create mode 100644 scripts/run-instructions-audit.workflow.js create mode 100644 src/run_instructions_audit.py create mode 100644 tests/test_portfolio_context_contract.py create mode 100644 tests/test_run_instructions_audit.py diff --git a/docs/plans/2026-05-29-run-instructions-external-audit-plan.md b/docs/plans/2026-05-29-run-instructions-external-audit-plan.md new file mode 100644 index 0000000..ab89782 --- /dev/null +++ b/docs/plans/2026-05-29-run-instructions-external-audit-plan.md @@ -0,0 +1,763 @@ +# Run-Instructions External Audit — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. **Note:** Tasks 7–8 (build + run the `Workflow`) must run in the main session, since only it can call the `Workflow` tool. + +**Goal:** Independently re-check the snapshot's `run_instructions_present` claim against on-disk ground truth across a stratified pilot of ~19 repos, and produce a discrepancy report. 
+ +**Architecture:** A deterministic, read-only Python pre-step (`src/run_instructions_audit.py`, TDD'd) selects the pilot and computes per-repo metadata + a live `tool_today` recompute, emitting compact JSON. A `Workflow` (`scripts/run-instructions-audit.workflow.js`) fans out one Haiku subagent per repo to read the files and judge (blind to the tool's answer), tallies buckets in deterministic JS, and a single Sonnet call writes the markdown report. + +**Tech Stack:** Python 3.11+ (pytest), the `Workflow` tool (JS orchestration, Haiku verifiers, Sonnet synthesis), `ctx_execute` to run the pre-step. + +**Spec:** `docs/plans/2026-05-29-run-instructions-external-audit.md` + +--- + +## File Structure + +| File | Responsibility | +|---|---| +| `src/run_instructions_audit.py` (create) | Stage 0 pilot selection + Stage 1 evidence prep + Stage 3 bucket logic (pure fns reused by the workflow's JS mirror). Read-only. | +| `tests/test_run_instructions_audit.py` (create) | Unit tests for every pure fn + tmp_path tests for the IO fns. | +| `scripts/run-instructions-audit.workflow.js` (create) | Stage 2 verifier fan-out + Stage 3 JS tally + Stage 4 synthesis. | +| `output/run-instructions-audit-2026-05-29.md` (generated) | The report. Gitignored. | + +**Bucket / drift-bucket logic is defined once in Python and mirrored in the workflow JS.** The Python copy is the tested source of truth; the JS copy is a 6-line transcription verified against it in Task 7. 
+ +--- + +## Task 1: Module scaffold + `is_fork_junk` + `assign_bucket` (Stage 3 truth table) + +**Files:** +- Create: `src/run_instructions_audit.py` +- Test: `tests/test_run_instructions_audit.py` + +- [ ] **Step 1: Write the failing test** + +```python +# tests/test_run_instructions_audit.py +from src.run_instructions_audit import assign_bucket, is_fork_junk + + +def test_is_fork_junk_flags_known_patterns(): + assert is_fork_junk("AssistSupport-openssl-cve-2026-42327") + assert is_fork_junk("BrowserHistoryVisualizer-security-fix") + assert is_fork_junk("ApplyKit-private-history-backup-20260517.bundle") + assert not is_fork_junk("Fun:GamePrjs/BattleGrid") + assert not is_fork_junk("mcpforge") + + +def test_assign_bucket_truth_table(): + # agreement + assert assign_bucket(True, True, True) == "agree_present" + assert assign_bucket(False, False, False) == "agree_absent" + # false negatives (tool said absent, verifier found it) + assert assign_bucket(False, True, True) == "fn_alias_gap" # evidence in primary file + assert assign_bucket(False, True, False) == "fn_blind_spot" # evidence only in README/other + # false positive (tool over-claimed) + assert assign_bucket(True, False, False) == "fp_overclaim" + assert assign_bucket(True, False, True) == "fp_overclaim" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: FAIL — `ModuleNotFoundError: No module named 'src.run_instructions_audit'` + +- [ ] **Step 3: Write minimal implementation** + +```python +# src/run_instructions_audit.py +"""External audit of the snapshot's run_instructions_present claim (pre-step). + +Stage 0 (stratified pilot selection) + Stage 1 (evidence prep + live tool_today +recompute) run here as a deterministic, read-only pre-step. The compact JSON this +emits is consumed as `args` by scripts/run-instructions-audit.workflow.js, whose +Haiku subagents read the repo files and judge. Never writes repos/snapshot/git. 
+""" +from __future__ import annotations + +import json +import re +import subprocess +from datetime import datetime +from pathlib import Path + +from src.portfolio_context_contract import ( + analyze_project_context, + choose_primary_context_file, +) +from src.portfolio_truth_sources import _collect_context_files + +FORK_JUNK_PATTERNS = (r"-security-fix", r"-cve-", r"-backup-", r"\.bundle$", r"-openssl-") + + +def is_fork_junk(path: str) -> bool: + return any(re.search(pattern, path) for pattern in FORK_JUNK_PATTERNS) + + +def assign_bucket(tool_today: bool, verdict: bool, evidence_in_primary: bool) -> str: + if tool_today == verdict: + return "agree_present" if verdict else "agree_absent" + if verdict and not tool_today: + return "fn_alias_gap" if evidence_in_primary else "fn_blind_spot" + return "fp_overclaim" +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: PASS (2 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/run_instructions_audit.py tests/test_run_instructions_audit.py +git commit -m "feat: add run-instructions audit pre-step scaffold + bucket logic" +``` + +--- + +## Task 2: `assign_drift_bucket` (snapshot-vs-today drift) + +**Files:** +- Modify: `src/run_instructions_audit.py` +- Test: `tests/test_run_instructions_audit.py` + +- [ ] **Step 1: Write the failing test** + +```python +from src.run_instructions_audit import assign_drift_bucket + + +def test_assign_drift_bucket(): + # snapshot still matches today's recompute → no field drift + assert assign_drift_bucket(True, True, True) == "claim_same" + assert assign_drift_bucket(False, False, False) == "claim_same" + # field value changed AND repo has commits since snapshot → explained by drift + assert assign_drift_bucket(False, True, True) == "claim_changed_drift" + # field value changed with NO commits since snapshot → unexplained (snapshot was wrong) + assert assign_drift_bucket(False, True, False) == 
"claim_changed_nodrift" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `python -m pytest tests/test_run_instructions_audit.py::test_assign_drift_bucket -q` +Expected: FAIL — `ImportError: cannot import name 'assign_drift_bucket'` + +- [ ] **Step 3: Write minimal implementation** + +Add to `src/run_instructions_audit.py` after `assign_bucket`: + +```python +def assign_drift_bucket(snapshot_claim: bool, tool_today: bool, repo_drifted: bool) -> str: + if snapshot_claim == tool_today: + return "claim_same" + return "claim_changed_drift" if repo_drifted else "claim_changed_nodrift" +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: PASS (3 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/run_instructions_audit.py tests/test_run_instructions_audit.py +git commit -m "feat: add drift bucket logic to run-instructions audit" +``` + +--- + +## Task 3: `select_pilot` (Stage 0 — stratified, deterministic) + +**Files:** +- Modify: `src/run_instructions_audit.py` +- Test: `tests/test_run_instructions_audit.py` + +- [ ] **Step 1: Write the failing test** + +```python +from src.run_instructions_audit import select_pilot + + +def _project(key, quality, *, status="active", path=None): + return { + "identity": {"project_key": key, "path": path or key, "display_name": key.split("/")[-1]}, + "derived": { + "registry_status": status, + "context_quality": quality, + "context_files": ["CLAUDE.md"], + "run_instructions_present": False, + }, + } + + +def test_select_pilot_stratifies_sorts_and_filters(): + projects = ( + [_project(f"b{i}", "boilerplate") for i in range(6)] + + [_project("n1", "none"), _project("n2", "none")] + + [_project("arch", "full", status="archived")] + + [_project("junk-security-fix", "full")] + + [_project("z-full", "full"), _project("a-full", "full")] + ) + selected = select_pilot(projects, per_tier={"none": 3, "boilerplate": 4, "full": 4}) + keys = 
[p["identity"]["project_key"] for p in selected] + + # archived + fork-junk excluded + assert "arch" not in keys and "junk-security-fix" not in keys + # boilerplate capped at 4 of 6 + assert sum(k.startswith("b") for k in keys) == 4 + # full sorted by project_key → a-full before z-full + assert keys.index("a-full") < keys.index("z-full") + # both 'none' present (only 2 available, asked for 3) + assert {"n1", "n2"} <= set(keys) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `python -m pytest tests/test_run_instructions_audit.py::test_select_pilot_stratifies_sorts_and_filters -q` +Expected: FAIL — `ImportError: cannot import name 'select_pilot'` + +- [ ] **Step 3: Write minimal implementation** + +Add the `DEFAULT_PER_TIER` constant near the top (under `FORK_JUNK_PATTERNS`): + +```python +DEFAULT_PER_TIER = { + "none": 3, + "boilerplate": 4, + "minimum-viable": 4, + "standard": 4, + "full": 4, +} +``` + +Add the function: + +```python +def select_pilot(projects: list[dict], *, per_tier: dict[str, int] = DEFAULT_PER_TIER) -> list[dict]: + eligible = [ + p + for p in projects + if p["derived"]["registry_status"] != "archived" + and not is_fork_junk(p["identity"]["path"]) + ] + selected: list[dict] = [] + for tier, count in per_tier.items(): + tier_projects = sorted( + (p for p in eligible if p["derived"]["context_quality"] == tier), + key=lambda p: p["identity"]["project_key"], + ) + selected.extend(tier_projects[:count]) + return selected +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: PASS (4 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/run_instructions_audit.py tests/test_run_instructions_audit.py +git commit -m "feat: add stratified pilot selection (Stage 0)" +``` + +--- + +## Task 4: `build_record` (Stage 1a — compact record from snapshot, pure) + +**Files:** +- Modify: `src/run_instructions_audit.py` +- Test: `tests/test_run_instructions_audit.py` + 
+- [ ] **Step 1: Write the failing test** + +```python +from src.run_instructions_audit import build_record + + +def test_build_record_resolves_path_and_primary(): + project = { + "identity": { + "project_key": "Fun:GamePrjs/BattleGrid", + "path": "Fun:GamePrjs/BattleGrid", + "display_name": "BattleGrid", + }, + "derived": { + "context_files": ["AGENTS.md", "README.md"], + "run_instructions_present": False, + }, + } + record = build_record(project, "/Users/d/Projects") + + assert record["abs_path"] == "/Users/d/Projects/Fun:GamePrjs/BattleGrid" + assert record["primary_file_name"] == "AGENTS.md" # no CLAUDE.md → AGENTS.md + assert record["snapshot_claim"] is False + assert record["context_files"] == ["AGENTS.md", "README.md"] + assert record["project_key"] == "Fun:GamePrjs/BattleGrid" + + +def test_build_record_prefers_claude_md(): + project = { + "identity": {"project_key": "x", "path": "x", "display_name": "x"}, + "derived": {"context_files": ["AGENTS.md", "CLAUDE.md"], "run_instructions_present": True}, + } + assert build_record(project, "/w")["primary_file_name"] == "CLAUDE.md" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `python -m pytest tests/test_run_instructions_audit.py -k build_record -q` +Expected: FAIL — `ImportError: cannot import name 'build_record'` + +- [ ] **Step 3: Write minimal implementation** + +```python +def build_record(project: dict, workspace_root: str) -> dict: + path = project["identity"]["path"] + context_files = project["derived"]["context_files"] + return { + "project_key": project["identity"]["project_key"], + "display_name": project["identity"]["display_name"], + "abs_path": str(Path(workspace_root) / path), + "primary_file_name": choose_primary_context_file(context_files), + "context_files": context_files, + "snapshot_claim": bool(project["derived"]["run_instructions_present"]), + } +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: 
PASS (6 passed) + +- [ ] **Step 5: Commit** + +```bash +git add src/run_instructions_audit.py tests/test_run_instructions_audit.py +git commit -m "feat: add build_record (Stage 1a) for run-instructions audit" +``` + +--- + +## Task 5: IO fns — `is_after`, `compute_drifted`, `compute_tool_today` (Stage 1b/c) + +**Files:** +- Modify: `src/run_instructions_audit.py` +- Test: `tests/test_run_instructions_audit.py` + +- [ ] **Step 1: Write the failing test** + +```python +import os +import subprocess + +from src.run_instructions_audit import compute_drifted, compute_tool_today, is_after + + +def test_is_after_compares_tz_aware_iso(): + assert is_after("2026-05-25T19:25:00-07:00", "2026-05-17T05:01:39+00:00") + assert not is_after("2026-05-10T00:00:00+00:00", "2026-05-17T05:01:39+00:00") + + +def test_compute_tool_today_true_when_run_heading_matches_alias(tmp_path): + # "## Usage" IS a run_instructions alias → present + (tmp_path / "CLAUDE.md").write_text( + "# Proj\n\n## Usage\n\nRun the dev server with `npm run dev`. It serves on :3000.\n" + ) + assert compute_tool_today(str(tmp_path)) is True + + +def test_compute_tool_today_false_when_run_heading_outside_alias(tmp_path): + # "## Running" is NOT in the alias list → the tool misses it (alias-gap case) + (tmp_path / "CLAUDE.md").write_text( + "# Proj\n\n## Running\n\nStart it with `npm run dev`. 
This is genuine run guidance.\n" + ) + assert compute_tool_today(str(tmp_path)) is False + + +def _git_commit_at(path, iso): + env = { + **os.environ, + "GIT_AUTHOR_DATE": iso, "GIT_COMMITTER_DATE": iso, + "GIT_AUTHOR_NAME": "t", "GIT_AUTHOR_EMAIL": "t@t", + "GIT_COMMITTER_NAME": "t", "GIT_COMMITTER_EMAIL": "t@t", + } + subprocess.run(["git", "init", "-q"], cwd=path, check=True) + subprocess.run(["git", "commit", "-q", "--allow-empty", "-m", "x"], cwd=path, env=env, check=True) + + +def test_compute_drifted_true_when_commit_after_snapshot(tmp_path): + _git_commit_at(tmp_path, "2026-05-25T00:00:00+00:00") + assert compute_drifted(str(tmp_path), "2026-05-17T05:01:39+00:00") is True + + +def test_compute_drifted_false_when_commit_before_snapshot(tmp_path): + _git_commit_at(tmp_path, "2026-05-01T00:00:00+00:00") + assert compute_drifted(str(tmp_path), "2026-05-17T05:01:39+00:00") is False + + +def test_compute_drifted_false_for_non_git_dir(tmp_path): + assert compute_drifted(str(tmp_path), "2026-05-17T05:01:39+00:00") is False +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `python -m pytest tests/test_run_instructions_audit.py -k "is_after or tool_today or drifted" -q` +Expected: FAIL — `ImportError: cannot import name 'is_after'` + +- [ ] **Step 3: Write minimal implementation** + +```python +def compute_tool_today(abs_path: str) -> bool: + project_path = Path(abs_path) + analysis = analyze_project_context(project_path, _collect_context_files(project_path)) + return bool(analysis.run_instructions_present) + + +def is_after(commit_iso: str, generated_at_iso: str) -> bool: + return datetime.fromisoformat(commit_iso) > datetime.fromisoformat(generated_at_iso) + + +def compute_drifted(abs_path: str, generated_at: str) -> bool: + try: + result = subprocess.run( + ["git", "-C", abs_path, "log", "-1", "--format=%cI"], + capture_output=True, + text=True, + timeout=10, + ) + except (subprocess.SubprocessError, OSError): + return False + commit_iso = 
result.stdout.strip() + if result.returncode != 0 or not commit_iso: + return False + return is_after(commit_iso, generated_at) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: PASS (12 passed). If `compute_tool_today` cases fail on heading parsing, inspect `analyze_project_context` behavior on the fixture and adjust the fixture headings — not the assertion intent. + +- [ ] **Step 5: Commit** + +```bash +git add src/run_instructions_audit.py tests/test_run_instructions_audit.py +git commit -m "feat: add live tool_today recompute + git drift detection (Stage 1)" +``` + +--- + +## Task 6: `prepare_pilot` + `main` (Stage 0+1 orchestration) + +**Files:** +- Modify: `src/run_instructions_audit.py` +- Test: `tests/test_run_instructions_audit.py` + +- [ ] **Step 1: Write the failing test** + +```python +import json + +from src.run_instructions_audit import prepare_pilot + + +def test_prepare_pilot_builds_records_and_reports_missing_dirs(tmp_path): + workspace = tmp_path / "ws" + real = workspace / "RealRepo" + real.mkdir(parents=True) + (real / "CLAUDE.md").write_text("# R\n\n## Usage\n\nRun `npm run dev` to start the server.\n") + + snapshot = { + "workspace_root": str(workspace), + "generated_at": "2026-05-17T05:01:39+00:00", + "projects": [ + { + "identity": {"project_key": "RealRepo", "path": "RealRepo", "display_name": "RealRepo"}, + "derived": { + "registry_status": "active", "context_quality": "full", + "context_files": ["CLAUDE.md"], "run_instructions_present": True, + }, + }, + { + "identity": {"project_key": "GhostRepo", "path": "GhostRepo", "display_name": "GhostRepo"}, + "derived": { + "registry_status": "active", "context_quality": "full", + "context_files": ["CLAUDE.md"], "run_instructions_present": False, + }, + }, + ], + } + snap_path = tmp_path / "snap.json" + snap_path.write_text(json.dumps(snapshot)) + + result = prepare_pilot(str(snap_path), per_tier={"full": 4}) + + 
assert result["workspace_root"] == str(workspace) + assert len(result["records"]) == 1 + assert len(result["errors"]) == 1 + record = result["records"][0] + assert record["project_key"] == "RealRepo" + assert record["tool_today"] is True # live recompute on the fixture file + assert record["drifted"] is False # no git repo → not drifted + assert result["errors"][0]["error"] == "missing_dir" + assert result["errors"][0]["project_key"] == "GhostRepo" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `python -m pytest tests/test_run_instructions_audit.py -k prepare_pilot -q` +Expected: FAIL — `ImportError: cannot import name 'prepare_pilot'` + +- [ ] **Step 3: Write minimal implementation** + +```python +def prepare_pilot(snapshot_path: str, *, per_tier: dict[str, int] = DEFAULT_PER_TIER) -> dict: + snapshot = json.loads(Path(snapshot_path).read_text()) + workspace_root = snapshot["workspace_root"] + generated_at = snapshot["generated_at"] + records: list[dict] = [] + errors: list[dict] = [] + for project in select_pilot(snapshot["projects"], per_tier=per_tier): + record = build_record(project, workspace_root) + if not Path(record["abs_path"]).is_dir(): + errors.append( + { + "project_key": record["project_key"], + "abs_path": record["abs_path"], + "error": "missing_dir", + } + ) + continue + record["tool_today"] = compute_tool_today(record["abs_path"]) + record["drifted"] = compute_drifted(record["abs_path"], generated_at) + records.append(record) + return { + "generated_at": generated_at, + "workspace_root": workspace_root, + "records": records, + "errors": errors, + } + + +def main() -> None: + import sys + + snapshot_path = sys.argv[1] if len(sys.argv) > 1 else "output/portfolio-truth-latest.json" + print(json.dumps(prepare_pilot(snapshot_path), indent=2)) + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `python -m pytest tests/test_run_instructions_audit.py -q` +Expected: PASS (13 passed) + +- [ ] 
**Step 5: Verify ruff + run against the real snapshot (smoke, read-only)** + +Run: `python -m ruff check src/run_instructions_audit.py && python -m src.run_instructions_audit output/portfolio-truth-latest.json | python3 -c "import json,sys; d=json.load(sys.stdin); print('records', len(d['records']), 'errors', len(d['errors']))"` +Expected: ruff clean; ~15–20 records, errors listed (not crashed). Confirms real paths resolve. + +- [ ] **Step 6: Commit** + +```bash +git add src/run_instructions_audit.py tests/test_run_instructions_audit.py +git commit -m "feat: add prepare_pilot orchestrator + CLI entrypoint (Stage 0+1)" +``` + +--- + +## Task 7: The Workflow script (Stages 2–4) + 2-repo smoke + +**Files:** +- Create: `scripts/run-instructions-audit.workflow.js` + +This task is **main-session only** (it calls the `Workflow` tool). No pytest. + +- [ ] **Step 1: Write the workflow script** + +```javascript +// scripts/run-instructions-audit.workflow.js +export const meta = { + name: 'run-instructions-audit', + description: 'Audit snapshot run_instructions_present claim against on-disk ground truth', + phases: [ + { title: 'Verify', detail: 'one Haiku subagent per pilot repo reads files and judges' }, + { title: 'Synthesize', detail: 'one Sonnet call writes the markdown report' }, + ], +} + +const VERIFIER_SCHEMA = { + type: 'object', + additionalProperties: false, + required: ['verdict', 'evidence_in_primary', 'evidence_quote', 'evidence_location', 'confidence'], + properties: { + verdict: { type: 'boolean' }, + evidence_in_primary: { type: 'boolean' }, + evidence_quote: { type: 'string', maxLength: 240 }, + evidence_location: { type: 'string' }, + confidence: { type: 'string', enum: ['high', 'med', 'low'] }, + }, +} + +const { generated_at, records, errors } = args + +function verifierPrompt(rec) { + return [ + `You audit whether a project documents HOW TO RUN IT. 
Judge independently — you are NOT told the tool's answer.`, + `Project: ${rec.project_key}`, + `Directory (absolute): ${rec.abs_path}`, + `The tool treats "${rec.primary_file_name}" as the PRIMARY context file; it may be absent.`, + `Listed context files: ${JSON.stringify(rec.context_files)}.`, + ``, + `Do this:`, + `1. Read the primary file (if present), README.md, and the other listed context files, by absolute path under the directory.`, + `2. Decide: do these files genuinely tell a developer how to run/start the project — a run command, dev server, build+run steps, or quickstart? A bare dependency-install ("pip install", "## Installation" of deps) alone is NOT run instructions.`, + `3. If yes: verdict=true; quote the exact run command or heading (<=240 chars) in evidence_quote; set evidence_location like "CLAUDE.md §Usage" or "README §Getting Started".`, + `4. evidence_in_primary=true ONLY if that evidence is inside "${rec.primary_file_name}". If the primary file is absent, or the evidence is only in README/another file, set it false.`, + `5. If no run instructions exist anywhere, verdict=false with empty quote/location. Default to false when uncertain.`, + ].join('\n') +} + +// --- Stage 3 tally logic (mirror of src/run_instructions_audit.py) --- +function assignBucket(toolToday, verdict, inPrimary) { + if (toolToday === verdict) return verdict ? 'agree_present' : 'agree_absent' + if (verdict && !toolToday) return inPrimary ? 'fn_alias_gap' : 'fn_blind_spot' + return 'fp_overclaim' +} +function assignDrift(snapshotClaim, toolToday, drifted) { + if (snapshotClaim === toolToday) return 'claim_same' + return drifted ? 
'claim_changed_drift' : 'claim_changed_nodrift' +} + +phase('Verify') +const verified = await parallel( + records.map((rec) => () => + agent(verifierPrompt(rec), { + label: `verify:${rec.project_key}`, + phase: 'Verify', + model: 'haiku', + agentType: 'Explore', + schema: VERIFIER_SCHEMA, + }) + .then((v) => ({ rec, v })) + .catch(() => null) + ) +) + +const rows = verified.filter(Boolean).map(({ rec, v }) => ({ + project_key: rec.project_key, + primary_file_name: rec.primary_file_name, + snapshot_claim: rec.snapshot_claim, + tool_today: rec.tool_today, + drifted: rec.drifted, + ...v, + bucket: assignBucket(rec.tool_today, v.verdict, v.evidence_in_primary), + drift_bucket: assignDrift(rec.snapshot_claim, rec.tool_today, rec.drifted), +})) + +const counts = rows.reduce((acc, r) => ((acc[r.bucket] = (acc[r.bucket] || 0) + 1), acc), {}) +const disagreements = rows.filter((r) => !r.bucket.startsWith('agree')) +const agreementRate = rows.length ? (rows.length - disagreements.length) / rows.length : 0 +log(`Verified ${rows.length} repos — ${disagreements.length} disagreements, agreement ${(agreementRate * 100).toFixed(0)}%`) + +phase('Synthesize') +const synthesisPrompt = [ + `Write a markdown audit report for the snapshot claim "run_instructions_present". Return ONLY markdown, no preamble.`, + `Facts: snapshot generated_at=${generated_at}; repos verified=${rows.length}; agreement rate (verifier vs tool_today)=${(agreementRate * 100).toFixed(0)}%.`, + `Bucket counts: ${JSON.stringify(counts)}.`, + `Unresolved-path errors: ${JSON.stringify(errors)}.`, + `Disagreement rows (JSON): ${JSON.stringify(disagreements, null, 2)}`, + ``, + `Required sections:`, + `1. Headline — repos, agreement rate, counts per bucket.`, + `2. Disagreements — a table keyed by project_key with columns: bucket, evidence_quote, evidence_location, confidence, drifted.`, + `3. 
Drift summary — count rows where drift_bucket != "claim_same", split claim_changed_drift (explained) vs claim_changed_nodrift (snapshot likely wrong).`, + `4. Prescriptive fixes — for fn_alias_gap rows, the exact headings to add to CONTEXT_SECTION_ALIASES; if any fn_blind_spot rows, recommend choose_primary_context_file consider README.md; for fp_overclaim, flag the over-claim.`, +].join('\n') + +const report = await agent(synthesisPrompt, { label: 'synthesis', phase: 'Synthesize', model: 'sonnet' }) + +return { + report, + stats: { verified: rows.length, agreementRate, counts, disagreements: disagreements.length, errors: errors.length }, + rows, +} +``` + +- [ ] **Step 2: Sanity-check the JS bucket logic matches Python** + +Confirm by eye that `assignBucket`/`assignDrift` in the JS are line-for-line equivalent to `assign_bucket`/`assign_drift_bucket` in `src/run_instructions_audit.py` (same branch order, same string returns). They are the same six lines. + +- [ ] **Step 3: 2-repo smoke run (main session)** + +In the main session: +1. Run the pre-step and capture the JSON payload: + `python -m src.run_instructions_audit output/portfolio-truth-latest.json` +2. Slice the payload to its first 2 `records` (keep `generated_at`, `workspace_root`, `errors`). +3. Call `Workflow({ scriptPath: "scripts/run-instructions-audit.workflow.js", args: slicedPayload })`, where `slicedPayload` is the 2-record payload from step 2. + +Expected: 2 Haiku verifiers run + 1 Sonnet synthesis; the tool returns `{ report, stats, rows }` with `rows.length === 2` and each row carrying a `bucket` + `drift_bucket`. If schema validation errors occur, fix the prompt/schema and re-run only this smoke. + +- [ ] **Step 4: Commit the workflow script** + +```bash +git add scripts/run-instructions-audit.workflow.js +git commit -m "feat: add run-instructions audit Workflow (verify fan-out + tally + synthesis)" +``` + +--- + +## Task 8: Run the full pilot + write the report + hand-validate + +This task is **main-session only**. No pytest. 
+ +- [ ] **Step 1: Run the pilot end-to-end** + +1. `python -m src.run_instructions_audit output/portfolio-truth-latest.json` → full payload (~19 records). +2. `Workflow({ scriptPath: "scripts/run-instructions-audit.workflow.js", args: payload })`, where `payload` is the full JSON payload from step 1. + +Expected: ~19 Haiku verifiers + 1 Sonnet synthesis; returns `{ report, stats, rows }`. + +- [ ] **Step 2: Write the report + a rows sidecar for auditing** + +In the main session, `Write` the returned `report` to `output/run-instructions-audit-2026-05-29.md`, and `Write` `JSON.stringify(rows, null, 2)` to `output/run-instructions-audit-2026-05-29.rows.json` (so every verdict is inspectable). + +- [ ] **Step 3: Hand-validate (the whole point of a small pilot)** + +For **every** disagreement row, open the cited file at `evidence_location` in the repo and confirm the verifier's call by hand. Note any verifier error. Confirm buckets sum to the verified count and that `errors` (unresolved dirs) were reported, not dropped. + +- [ ] **Step 4: Record the outcome** + +Append a short "Pilot result" note to the spec (`docs/plans/2026-05-29-run-instructions-external-audit.md`): agreement rate, dominant bucket (tests the blind-spot hypothesis), and any verifier misses found during hand-validation. Commit: + +```bash +git add output/run-instructions-audit-2026-05-29.md output/run-instructions-audit-2026-05-29.rows.json docs/plans/2026-05-29-run-instructions-external-audit.md +git commit -m "chore: run-instructions audit pilot results" +``` + +> Note: `output/` is gitignored. If you want the report in git, add an explicit force-add (`git add -f`) — otherwise the commit captures only the spec note, which is fine. + +--- + +## Self-Review + +**Spec coverage:** +- §5 Stage 0 (pilot selection) → Task 3. Stage 1 (evidence prep + `tool_today` + drift) → Tasks 4–5. Stage 2 (verifier) → Task 7. Stage 3 (tally) → Tasks 1–2 (logic) + Task 7 (applied in JS). Stage 4 (synthesis + report) → Task 7 + Task 8. 
✓ +- §5 "where each stage runs" (pre-step in main session via `ctx_execute`/CLI; fan-out in workflow; FS reads in subagents) → Tasks 6–8. ✓ +- §6 data contract (record + verifier schema fields) → Task 4 (record), Task 7 (`VERIFIER_SCHEMA`). ✓ +- §7 guarantees: read-only (no writes in any fn), Haiku fan-out + Sonnet synthesis + no Opus in fan-out (Task 7 `model` pins), blind verification (prompt omits the claim). ✓ +- §8 gotchas: drift via `tool_today`+drift bucket (Tasks 2,5); README blind spot as `fn_blind_spot` (Task 1); polluted population filtered (Task 3); path encoding via snapshot `path` join + `is_dir()` pre-flight (Tasks 4,6); workflow JS no-FS handled by pre-step split (Tasks 6–8). ✓ +- §9 out of scope respected (no extra booleans, no numeric score, pilot-only N). ✓ +- §10 done criteria → Task 8 steps 2–3. ✓ + +**Placeholder scan:** No TBD/TODO; all steps have real code or exact commands. ✓ + +**Type consistency:** `assign_bucket(tool_today, verdict, evidence_in_primary)` / `assign_drift_bucket(snapshot_claim, tool_today, repo_drifted)` identical in Python (Tasks 1–2) and JS (`assignBucket`/`assignDrift`, Task 7). Record keys (`project_key, display_name, abs_path, primary_file_name, context_files, snapshot_claim, tool_today, drifted`) consistent across Tasks 4/6/7. Verifier schema fields (`verdict, evidence_in_primary, evidence_quote, evidence_location, confidence`) consistent Task 7 ↔ spec §6. Bucket strings (`agree_present/agree_absent/fn_alias_gap/fn_blind_spot/fp_overclaim`) and drift strings (`claim_same/claim_changed_drift/claim_changed_nodrift`) consistent. 
✓ diff --git a/docs/plans/2026-05-29-run-instructions-external-audit.md b/docs/plans/2026-05-29-run-instructions-external-audit.md new file mode 100644 index 0000000..6f6207f --- /dev/null +++ b/docs/plans/2026-05-29-run-instructions-external-audit.md @@ -0,0 +1,386 @@ +# Spec — External Audit of `run_instructions_present` (Dynamic Workflow) + +- **Date:** 2026-05-29 +- **Status:** Design approved; spec under review (pre-implementation) +- **Branch:** `feat/run-instructions-external-audit` +- **Owner:** d +- **Type:** Read-only verification workflow (first dynamic-workflow build for this project) + +## 1. Purpose + +`portfolio-truth-latest.json` makes ~22 derived claims about each of 132 projects on disk. +This workflow is a **second pair of eyes the tool cannot fool**: it independently re-checks one +high-value claim — `run_instructions_present` — against the actual files on disk, using LLM +judgment where the tool uses a brittle regex. It produces a discrepancy report for human review. +It never modifies repos, the snapshot, or git state. + +This is **not** a replacement for `portfolio_truth_validate.py` (which checks the snapshot's +internal schema/consistency). This audits the snapshot's claims against **ground truth**. + +## 2. What we are auditing (the heuristic under test) + +Every `derived` presence-boolean traces through one function: + +- `_inspect_project_dir` — `src/portfolio_truth_sources.py:200` +- → `analyze_project_context` — `src/portfolio_context_contract.py:135` +- → `choose_primary_context_file` — `src/portfolio_context_contract.py:128` + (returns `CLAUDE.md`, else `AGENTS.md` — **never `README.md`**) +- → `_section_has_meaningful_content` — `src/portfolio_context_contract.py:265` + (true iff a markdown heading matches a hardcoded alias in `CONTEXT_SECTION_ALIASES` **and** + has non-trivial text under it) + +So `run_instructions_present` is true **iff** the primary file has a heading matching the alias +list, with content. 
Two known failure modes follow directly from the code: + +- **Alias gap:** run instructions exist in the primary file under a heading the alias list does + not recognize. NB the list is already broad — `how to run`, `usage`, `getting started`, + `quick start`, `commands`, `local setup`, `local development`, `build run`, + `development commands`, `run instructions` — so realistic misses are headings *outside* it + (`## Running`, `## Run the app`, `## Develop`, `## Scripts`, `## Make targets`) or run steps + present only as prose / a code block under no recognized heading. +- **Input blind spot:** run instructions exist only in `README.md`, which the tool never reads + as the primary file → the project can score `context_quality: none` despite rich human docs. + +Because the alias list is broad, the **dominant discrepancy class is likely the blind spot +(README-only repos), not the alias gap** — a hypothesis this pilot will test. Either way, both +are exactly what an LLM reading the prose catches and a regex never will. + +## 3. Locked scope decisions + +| Decision | Choice | Rationale | +|---|---|---| +| Claim to verify | `run_instructions_present` | Atomic, semantic, fragile heuristic, demonstrable, cheap. Widening to all 6 booleans later is near-free (same file already read). | +| Evidence mode | Categorized ground-truth | Verifier judges whether run instructions truly exist anywhere, and the harness classifies *why* the tool missed: alias-gap vs blind-spot vs genuinely-absent. Each bucket implies a different fix. | +| Population (first run) | Stratified pilot ~15–20 | Small enough to hand-verify every disagreement and earn trust in the mechanism before scaling. Same script scales to full 132 as a one-line change. | + +## 4. 
Snapshot facts (as of generation) + +- `schema_version`: `0.4.0` +- `generated_at`: `2026-05-17T05:01:39Z` (**12 days stale as of this spec — drift is real, see §8**) +- `workspace_root`: `/Users/d/Projects` +- `projects`: 132 (a list; key on `identity.project_key`, **not** `display_name` — dupes exist: + `IncidentWorkbench`, `OrbitForge`, `StatusPage`) +- `context_quality_counts`: `none: 3, boilerplate: 17, minimum-viable: 66, standard: 27, full: 19` + +## 5. Architecture — four stages + live-recompute refinement + +Only **Stage 2** is LLM judgment. Everything else is deterministic code. The harness controls +the evidence; the LLM only judges it. + +### Where each stage physically runs (critical for a Workflow build) + +The Workflow script's JS sandbox has **no filesystem access** and cannot run Python. Therefore: + +| Stage | Runs in | Mechanism | +|---|---|---| +| 0 · Pilot selection | Main session (pre-step) | `ctx_execute(python)` — reads snapshot, emits compact records | +| 1 · Evidence prep + live recompute | Main session (pre-step) | `ctx_execute(python)` — per-repo metadata only, **no file bodies into context** | +| 2 · Per-repo verifier | Workflow `agent()` fan-out | Haiku subagents `Read` their own repo files | +| 3 · Tally | Workflow JS | pure arithmetic over verdicts + passed-in claims | +| 4 · Synthesis | Workflow `agent()` | one Sonnet call → report markdown | +| (write report) | Main session | `Write` to `output/…md` | + +The pre-step (Stages 0–1) emits a compact `evidence_packets` array passed to the Workflow as +`args`. **File bodies never enter the main Opus context** — they are read inside each Haiku +subagent. This is the context-hygiene win of the harness-controls-metadata / subagent-reads-body +split. + +### Stage 0 · Pilot selection (deterministic) +- Read `output/portfolio-truth-latest.json`. +- Exclude `derived.registry_status == "archived"` and fork-junk by path/name regex + (`-security-fix`, `-cve-`, `-backup-`, `.bundle`, `-openssl-`). 
+- Group remaining by `derived.context_quality`; within each tier sort by `project_key`. +- Take all 3 from `none`; 4 each from `boilerplate / minimum-viable / standard / full`. → ~19. +- Output one record per repo (no file bodies): + `{ project_key, display_name, abs_path, primary_file_name, context_files[], snapshot_claim }` + where `abs_path = workspace_root + "/" + path` and `snapshot_claim = + derived.run_instructions_present`. + +### Stage 1 · Evidence prep + live recompute (deterministic, read-only) +For each pilot record, compute and attach: +- `primary_file_name` = `choose_primary_context_file(context_files)`. +- `tool_today` = live recompute on **today's** files: + ```python + from src.portfolio_context_contract import analyze_project_context + from src.portfolio_truth_sources import _collect_context_files + tool_today = analyze_project_context( + abs_path, _collect_context_files(abs_path) + ).run_instructions_present + ``` +- `drifted` = repo has git commits after `generated_at` + (`git -C <abs_path> log -1 --format=%cI` > `2026-05-17T05:01:39Z`). +- Pre-flight: assert the **directory** `abs_path` resolves. The primary file **may legitimately + be absent** (README-only repos — `choose_primary_context_file` still returns `AGENTS.md`); that + is a `fn_blind_spot` candidate, **not** an error. Only an unresolvable `abs_path` (missing dir) + goes to a separate error bucket — never silently dropped. + +`tool_today` is the refinement: it lets us isolate **heuristic error** from **snapshot drift** +without depending on the snapshot being fresh. + +### Stage 2 · Per-repo verifier (Haiku, schema-locked — the only LLM call in the fan-out) +One subagent per record. Given `{abs_path, primary_file_name, context_files}`, it `Read`s **all +available context files** (the primary file flagged; the primary may be absent for README-only +repos), then judges. It is told which file is primary; it is **not** told the tool's claim (blind +verification). 
`evidence_in_primary` is `false` when the primary file is absent or the evidence +lives only in a non-primary file. Forced schema: + +```jsonc +{ + "verdict": true | false, // do run instructions genuinely exist in these files? + "evidence_in_primary": true | false,// is the evidence in the primary file (vs only README/other)? + "evidence_quote": "string<=240", // the actual run command / heading text, or "" + "evidence_location": "CLAUDE.md §Usage" | "README §Getting Started" | "", + "confidence": "high" | "med" | "low" +} +``` +Rule (anti-inflation): if no run instructions exist anywhere, return `verdict:false` and quote +nothing. Default-to-false on uncertainty. + +### Stage 3 · Tally (deterministic JS in the workflow) +Combine each verdict with `tool_today` (authoritative tool answer) to assign a bucket: + +| `tool_today` | `verdict` | `evidence_in_primary` | bucket | +|---|---|---|---| +| true | true | — | `agree_present` | +| false | false | — | `agree_absent` | +| false | true | true | `fn_alias_gap` → add an alias | +| false | true | false | `fn_blind_spot` → broaden primary-file selection | +| true | false | — | `fp_overclaim` → tool matched an empty/trivial heading | + +Separately, `snapshot_claim` vs `tool_today` → **drift bucket** (`claim_same` when they match; +otherwise `claim_changed_drift` or `claim_changed_nodrift`, split on the repo's `drifted` flag), so a stale snapshot never masquerades as a heuristic miss. + +### Stage 4 · Synthesis + report (one Sonnet call) +Given the tallied disagreements, name the **pattern** ("the tool systematically misses run +instructions documented under headings outside the alias set, e.g. `## Running`") and emit +markdown: +- **Headline:** N repos, agreement rate (verifier vs `tool_today`), counts per bucket. +- **Disagreement table**, keyed by `project_key`: bucket, verifier quote + location, confidence, + drift flag. +- **Drift summary:** how many snapshot claims differ from `tool_today`. 
+- **Prescriptive fixes:** which aliases to add to `CONTEXT_SECTION_ALIASES`; whether + `choose_primary_context_file` should consider `README.md`. + +Main session writes the report to `output/run-instructions-audit-2026-05-29.md`. + +## 6. Data contract — assembled per-repo record (post-tally) +```jsonc +{ + "project_key": "Fun:GamePrjs/BattleGrid", + "display_name": "BattleGrid", + "abs_path": "/Users/d/Projects/Fun:GamePrjs/BattleGrid", + "primary_file_name": "AGENTS.md", + "snapshot_claim": false, + "tool_today": false, + "drifted": false, + "verifier": { /* Stage 2 schema */ }, + "bucket": "fn_alias_gap", + "drift_bucket": "claim_same" +} +``` + +## 7. Guarantees +- **Read-only:** repos, snapshot, and git are never written. Only artifact is the report file in + `output/` (gitignored). +- **No Opus in the fan-out:** Stage 2 pinned `model: "haiku"`; Stage 4 `model: "sonnet"`; Opus is + orchestrator only (writes/edits the script, reviews results). +- **Blind verification:** verifier is not shown the tool's claim, removing anchoring bias. +- **Cost:** ~19 small Haiku calls + 1 Sonnet synthesis → pennies, ~1–2 min wall-clock. + +## 8. Known risks / gotchas +1. **Snapshot drift (12 days).** Handled by `tool_today` live recompute + drift bucket; raw + verifier-vs-snapshot would conflate heuristic error with change-since-snapshot. +2. **README blind spot is a whole discrepancy class** (`fn_blind_spot`) and is arguably the + biggest source of wrong claims — surfaced explicitly, not collapsed. +3. **Polluted population (132 incl. archived + fork-junk + colon-encoded nested paths).** Stage 0 + filters; pre-flight asserts path resolution. +4. **Path encoding.** Real dirs include colons (`Fun:GamePrjs/`) and near-duplicates + (`FunGamePrjs`, top-level `BattleGrid`). Always use snapshot `path` joined to `workspace_root`; + never reconstruct from `display_name`. +5. **Workflow JS sandbox has no FS/Python.** Drives the main-session-pre-step architecture (§5). + +## 9. 
Out of scope (explicit — future widening, not silent cuts) +- Verifying the other 5 presence booleans (same subagent file-read; trivial to add later). +- The numeric `context_quality_score` (Arc-H merge gate) — lives in `src/context_quality.py`, a + separate code path **not** in the snapshot. +- Full 132-repo run — invocation #2 after the pilot is hand-validated. +- Auditing mechanical claims (`has_tests`, `has_ci`, staleness, …) — a script reproduces those + exactly; no LLM value. + +## 10. Done criteria (first run) +- Report written to `output/run-instructions-audit-2026-05-29.md`. +- Every disagreement carries a verifier quote + location and a drift flag. +- Buckets sum to the pilot count; no path silently dropped (unresolved paths reported). +- Operator can open each flagged repo and confirm the verifier's call by hand. + +## Pilot result (run 2026-05-29) + +Pilot ran: 16 repos (0 unresolved paths), via a 2-repo smoke then the full fan-out +(16 Haiku verifiers + 1 Sonnet synthesis, ~56s, ~800k subagent tokens). Report: +`output/run-instructions-audit-2026-05-29.md`; per-row sidecar: +`output/run-instructions-audit-2026-05-29.rows.json`. + +**Headline: 75% agreement (12/16). All 4 disagreements are `fn_blind_spot`. Zero +`fn_alias_gap`, zero `fp_overclaim`.** The spec's hypothesis is confirmed: the alias list +is broad enough that the regex rarely misses *within* the primary file — the tool's only +real weakness on this claim is being structurally blind to `README.md`. + +**Hand-validation (every disagreement, by reading the files):** all 4 confirmed true. +`BattleGrid` / `OrbitForge` / `SlackIncidentBot` each have a generic Codex-OS bootstrap +`AGENTS.md` primary (Communication Contract / Definition of Done / Verification Contract — +no run content) while the real run instructions live in `README.md` (`make dev`, +`pnpm exec tauri dev`, `cargo run --release`). The verifier's semantic judgment and its +`evidence_in_primary=false` calls were accurate. 
+ +**Confound — auditing the auditor:** `GithubRepoAuditor` was scanned on this `feat` +branch (cut from `main`), where `main`'s "prepare public distribution path" had removed +`CLAUDE.md` **and** `AGENTS.md`. So its scanned tree is the stripped public state — the +`fn_blind_spot` is valid for that tree (README is the only run-doc) and `drifted=true` +fired correctly, but it does **not** reflect the codex working branch (which still has +`CLAUDE.md`) or the snapshot state. Lesson: run the pre-step from the repo's canonical +branch when the auditor audits itself. + +**Second-order nuance (not a bucket error):** 3 of the 12 `agree_present` rows +(`LoreKeeper`, `Afterimage`, +1) have `evidence_in_primary=false` — the tool reports +present (from a matched primary-file heading) but the verifier found the *runnable command* +in `README.md`. "Right for the wrong reason." Preserved in `rows.json`; doesn't change the +present/absent verdict. A v2 could escalate this to its own bucket. + +**Actionable fix surfaced:** broaden `choose_primary_context_file` (or add a `README.md` +fallback for structural claims like run instructions) — this resolves all 4 blind spots. +No additions to `CONTEXT_SECTION_ALIASES` are warranted by this pilot. + +**Mechanism verdict:** the workflow is trustworthy on this claim — verifier calls matched +ground truth on every disagreement. Ready to widen to all 6 presence booleans and/or the +full 132-repo population (run the pre-step from each repo's canonical branch; for the +auditor repo specifically, scan a non-stripped branch). + +## Widening to all 6 presence claims (run 2026-05-29, branch `feat/widen-audit-six-claims`) + +The pre-step now captures `snapshot_claims{}` and `tool_today{}` as dicts over all six +fields (`CLAIM_FIELDS`); the workflow (`scripts/presence-claims-audit.workflow.js`) has the +verifier judge all six in one file-read and the tally produces one cell per (repo × claim). 
+Report: `output/presence-claims-audit-2026-05-29.md`; cells: +`output/presence-claims-audit-2026-05-29.cells.json`. Same 16-repo sample, 2-repo smoke +first, then 16 Haiku verifiers + 1 Sonnet (~139s, ~864k subagent tokens). + +**Result: 96 cells, 80% agreement.** Per-claim scorecard: + +| claim | agreement | fn_blind_spot | fn_alias_gap | +|---|---|---|---| +| project_summary_present | 75% | 4 | 0 | +| current_state_present | 87.5% | 2 | 0 | +| stack_present | 75% | 4 | 0 | +| run_instructions_present | 75% | 4 | 0 | +| known_risks_present | 81% | 2 | 1 | +| next_recommended_move_present | 87.5% | 1 | 1 | + +Totals: 70 agree_present, 7 agree_absent, **17 fn_blind_spot, 2 fn_alias_gap** (0 overclaim). + +**Key finding — the README blind spot is not specific to run instructions; it dominates ALL +six claims.** The same generic-`AGENTS.md`-stub repos (BattleGrid, OrbitForge, +SlackIncidentBot) carry their real summary/stack/run docs in `README.md`, which the tool +never reads as primary. So a single fix — teach `choose_primary_context_file` to fall back +to `README.md` — would lift agreement across the whole claim set, not just one claim. + +**The 2 fn_alias_gap cells are NOT real alias gaps.** Both are LoreKeeper +(`known_risks`, `next_recommended_move`). Hand-verification: LoreKeeper's `AGENTS.md` has an +**unclosed ```bash fence under `## How To Run`** (no command, never closed) that swallows the +later `## Known Risks` and `## Next Recommended Move` headings — so the tool's markdown parser +correctly excludes them, while the verifier (reading raw text) counts them as documented. The +aliases `known risks` / `next recommended move` already exist; the real bug is the Codex-OS +`AGENTS.md` generator's malformed fence. The synthesis was hardened to flag fn_alias_gap as +**review candidates** (alias-may-already-exist / malformed-markdown / sub-threshold / +verifier-over-credit) rather than auto-prescribing alias additions — and the run's report did +so correctly. 
+ +**GithubRepoAuditor:** all 6 cells are `claim_changed_drift` — the same auditor-audits-itself +confound (CLAUDE.md/AGENTS.md stripped on the `feat` branch cut from `main`'s public- +distribution prep). Valid for that tree; not representative of the canonical branch. + +**Verifier-reliability note (carried from the smoke):** the verifier judges *human +readability*, so on malformed-markdown sections it can disagree with the tool's parser. That +makes fn_alias_gap a "review me" bucket, not a "trust me" bucket — the audit surfaces +candidates; a human (or a parser-aware follow-up) adjudicates. + +**Not yet done (deliberate):** scaling to all ~132 repos (a `per_tier`/"all" change in the +pre-step), fixing the README blindness, and hardening into a one-command runner — parked per +operator decision. + +## Fix + re-verify (branch `feat/readme-fallback-fix`, 2026-05-29) + +Closed the loop: fixed the README blindness, then used this audit as the regression test. + +**The fix** wires the dormant `readme_text` parameter of `analyze_project_context` +(`src/portfolio_context_contract.py`): each presence boolean is now "documented in the primary +file **OR** the top-level README"; the `context_quality == "none"` gate also considers the +README. Primary-file *identity* is unchanged (surgical). Added +`tests/test_portfolio_context_contract.py` (5 tests; the function previously had **no** direct +unit coverage). Full suite 2087 passed, 0 regressions. + +**Verification — measured three ways (the deterministic one is authoritative):** + +| measure | agreement | meaning | +|---|---|---| +| pre-fix | 80% (77/96) | baseline | +| **fix-only** (pre-fix verifier verdicts held constant, only `tool_today` recomputed) | **86% (83/96)** | the pure effect of the code change | +| post-fix live re-run | 85% (82/96) | fix effect minus verifier-variance noise | + +**The fix did exactly what was predicted: +6 cells** — `stack` 75%→**100%** (4 cells) and `run` +75%→87.5% (2 cells). 
These are the cases where the README uses a conventional alias heading +(`## Tech Stack`, `## Quick Start`, `## Getting Started`). + +**The fix is deliberately partial — it exposed a second, deeper layer.** "README blindness" was +really two problems: (1) *wrong file* (content in README, tool read AGENTS) — now fixed; and +(2) *no matching heading* — project summaries are the **lead paragraph under the `# Title`** +(not an `## Overview` section), and some content sits under bespoke headings +(`## Recommended Default Path`, `## What This Project Is Today`) or nested in sub-sections/code +blocks. Heading-alias matching cannot see those *regardless of which file it reads*. The 11 +residual `fn_blind_spot` cells are almost entirely this layer-2 problem; the 2 `fn_alias_gap` +remain the LoreKeeper malformed-fence artifact. + +**Methodological finding — the verifier (Haiku) is not perfectly deterministic.** Re-running on +identical files flipped exactly **1 of 96 cells** (Afterimage `next_recommended_move`, +`agree_present`→`fp_overclaim`; the verifier wobbled on whether a generic portfolio-context +block counts as a real "next move"). So a fresh re-run conflates the fix with ~1% verifier +noise. **To measure a fix, hold the verifier verdicts constant and recompute only `tool_today`** +(the deterministic 86% above) — re-running the full fan-out is for confirming the harness +re-runs, not for the before/after number. + +**Net:** the fix is verified, the audit functioned as a regression harness, and the loop caught +an over-claim (my initial "~98%, all blind spots close") before it could mislead. Residual work +is the layer-2 heading problem (a harder heuristic change: lead-paragraph summaries, bespoke +headings) — not a file-scope gap. Artifacts: `output/presence-claims-audit-2026-05-29-postfix.md` +and `…-postfix.cells.json`. 
+ +## Layer-2 fix: lead-paragraph summaries (same branch, 2026-05-29) + +Scoped to the single largest, lowest-risk layer-2 category: project summaries that live as the +**lead paragraph under the `# Title`** rather than under an `## Overview` section (4 of the +residual blind spots, a universal README convention). Deferred the riskier layer-2 cases +(bespoke-heading fuzzy matching, nested-code detection, supporting-file scan) as +false-positive-prone / lower-value. + +**The fix:** new `_has_lead_summary` / `_lead_paragraph_text` helpers extract the prose between +the H1 title and the first `##` heading (stripping badges/images, keeping link text); for +`project_summary` only, when no alias section matched, a non-trivial lead paragraph now counts. +4 new tests; full suite 2091 passed, 0 regressions. + +**Verified deterministically (verifier verdicts held constant, only `tool_today` recomputed — +the method lesson from the prior round, so no re-sampling noise):** + +| step | overall agreement | project_summary | +|---|---|---| +| baseline (no fix) | 76/96 = 79% | 12/16 | +| + README fallback | 82/96 = 85% | 12/16 | +| **+ lead-paragraph (both fixes)** | **86/96 = 90%** | **16/16 (100%)** | + +`project_summary` 75% → **100%** (all 4 lead-paragraph blind spots closed); overall **79% → 90%** +across the two fixes. (Baseline shows 79% here, not the earlier 80%, only because one consistent +verdict set is applied across all three columns — making the deltas pure tool-effects.) + +**Residual at 90% (≈10 cells) is now mostly non-heuristic or deliberately deferred:** the 2 +LoreKeeper malformed-fence cells (generator bug), the Afterimage boilerplate/variance cell, the +GithubRepoAuditor branch confound, and a few deferred bespoke-heading/nested cases. Further gains +need either the riskier layer-2 work or upstream doc/generator fixes — not more file/lead-paragraph +plumbing. 
diff --git a/scripts/presence-claims-audit.workflow.js b/scripts/presence-claims-audit.workflow.js new file mode 100644 index 0000000..d83bee2 --- /dev/null +++ b/scripts/presence-claims-audit.workflow.js @@ -0,0 +1,219 @@ +// scripts/presence-claims-audit.workflow.js +// External audit of the snapshot's SIX presence claims against on-disk ground truth. +// Stage 2 (verifier fan-out, Haiku — judges all 6 claims per repo in one read) +// + Stage 3 (deterministic per-(repo,claim) tally) + Stage 4 (Sonnet synthesis). +// args = output of `python -m src.run_instructions_audit` ({ generated_at, workspace_root, records, errors }), +// where each record carries snapshot_claims{} and tool_today{} dicts over the 6 fields. +export const meta = { + name: "presence-claims-audit", + description: + "Audit the snapshot's 6 presence claims against on-disk ground truth", + phases: [ + { + title: "Verify", + detail: "one Haiku subagent per repo judges all 6 claims from the files", + }, + { + title: "Synthesize", + detail: "one Sonnet call writes the per-claim scorecard report", + }, + ], +}; + +const CLAIM_FIELDS = [ + "project_summary_present", + "current_state_present", + "stack_present", + "run_instructions_present", + "known_risks_present", + "next_recommended_move_present", +]; + +// What each claim means, for the verifier (semantic judgment, not heading-matching). 
+const CLAIM_DEFS = { + project_summary_present: + "states what the project IS — its purpose or what it does (not merely its name/tagline)", + current_state_present: + "states where the project stands now — status, current phase, or what is done / in progress", + stack_present: + "names the technology stack — languages, frameworks, or key tools", + run_instructions_present: + "tells a developer how to run/start it — a run command, dev server, or build+run steps (dependency install alone does NOT count)", + known_risks_present: + "documents known risks, issues, limitations, or intentional constraints", + next_recommended_move_present: + "states the recommended next step / what to do next", +}; + +const CLAIM_VERDICT = { + type: "object", + additionalProperties: false, + required: [ + "verdict", + "evidence_in_primary", + "evidence_quote", + "evidence_location", + "confidence", + ], + properties: { + verdict: { type: "boolean" }, + evidence_in_primary: { type: "boolean" }, + evidence_quote: { type: "string", maxLength: 240 }, + evidence_location: { type: "string" }, + confidence: { type: "string", enum: ["high", "med", "low"] }, + }, +}; + +const VERIFIER_SCHEMA = { + type: "object", + additionalProperties: false, + required: CLAIM_FIELDS, + properties: Object.fromEntries(CLAIM_FIELDS.map((f) => [f, CLAIM_VERDICT])), +}; + +const { generated_at, records, errors } = args; + +function verifierPrompt(rec) { + const defs = CLAIM_FIELDS.map( + (f, i) => `${i + 1}. ${f}: ${CLAIM_DEFS[f]}`, + ).join("\n"); + return [ + `You audit whether a project's docs genuinely document SIX things. 
Judge each independently from the files — you are NOT told the tool's answers.`, + `Project: ${rec.project_key}`, + `Directory (absolute): ${rec.abs_path}`, + `The tool treats "${rec.primary_file_name}" as the PRIMARY context file; it may be absent.`, + `Listed context files: ${JSON.stringify(rec.context_files)}.`, + ``, + `Read the primary file (if present), README.md, and the other listed context files by absolute path under the directory. Then, for EACH of these six claims, decide whether the docs genuinely document it:`, + defs, + ``, + `For each claim return an object with:`, + `- verdict: true if genuinely documented, else false (default false when uncertain).`, + `- evidence_in_primary: true ONLY if the evidence is inside "${rec.primary_file_name}". If that file is absent or the evidence is only in README/another file, false.`, + `- evidence_quote: the exact heading or sentence (<=240 chars) that documents it, or "" if absent.`, + `- evidence_location: e.g. "CLAUDE.md §Usage" or "README §Status", or "" if absent.`, + `- confidence: "high" | "med" | "low".`, + `Return one object per claim, keyed by the exact field names above.`, + ].join("\n"); +} + +// --- Stage 3 tally logic (mirror of src/run_instructions_audit.py) --- +function assignBucket(toolToday, verdict, inPrimary) { + if (toolToday === verdict) return verdict ? "agree_present" : "agree_absent"; + if (verdict && !toolToday) + return inPrimary ? "fn_alias_gap" : "fn_blind_spot"; + return "fp_overclaim"; +} +function assignDrift(snapshotClaim, toolToday, drifted) { + if (snapshotClaim === toolToday) return "claim_same"; + return drifted ? 
"claim_changed_drift" : "claim_changed_nodrift"; +} + +phase("Verify"); +const verified = await parallel( + records.map( + (rec) => () => + agent(verifierPrompt(rec), { + label: `verify:${rec.project_key}`, + phase: "Verify", + model: "haiku", + agentType: "Explore", + schema: VERIFIER_SCHEMA, + }) + .then((v) => ({ rec, v })) + .catch(() => null), + ), +); + +// Stage 3: one cell per (repo, claim). +const cells = []; +for (const item of verified.filter(Boolean)) { + const { rec, v } = item; + for (const claim of CLAIM_FIELDS) { + const cv = v[claim]; + cells.push({ + project_key: rec.project_key, + claim, + primary_file_name: rec.primary_file_name, + snapshot_claim: rec.snapshot_claims[claim], + tool_today: rec.tool_today[claim], + drifted: rec.drifted, + verdict: cv.verdict, + evidence_in_primary: cv.evidence_in_primary, + evidence_quote: cv.evidence_quote, + evidence_location: cv.evidence_location, + confidence: cv.confidence, + bucket: assignBucket( + rec.tool_today[claim], + cv.verdict, + cv.evidence_in_primary, + ), + drift_bucket: assignDrift( + rec.snapshot_claims[claim], + rec.tool_today[claim], + rec.drifted, + ), + }); + } +} + +// Per-claim aggregates. +const perClaim = CLAIM_FIELDS.map((claim) => { + const rows = cells.filter((c) => c.claim === claim); + const counts = rows.reduce( + (a, c) => ((a[c.bucket] = (a[c.bucket] || 0) + 1), a), + {}, + ); + const dis = rows.filter((c) => !c.bucket.startsWith("agree")); + const agree = rows.length - dis.length; + return { + claim, + total: rows.length, + agree, + agreementRate: rows.length ? 
agree / rows.length : 0, + counts, + disagreements: dis.length, + }; +}); + +const repoCount = verified.filter(Boolean).length; +const disagreements = cells.filter((c) => !c.bucket.startsWith("agree")); +const totalCells = cells.length; +const overallAgree = totalCells - disagreements.length; +log( + `Verified ${repoCount} repos × ${CLAIM_FIELDS.length} claims = ${totalCells} cells — ${disagreements.length} disagreements, overall ${(totalCells ? (overallAgree / totalCells) * 100 : 0).toFixed(0)}%`, +); + +phase("Synthesize"); +const synthesisPrompt = [ + `Write a markdown audit report for the snapshot's SIX presence claims. Return ONLY markdown, no preamble.`, + `Snapshot generated_at=${generated_at}. Repos=${repoCount}, claims=${CLAIM_FIELDS.length}, cells=${totalCells}, overall agreement (verifier vs tool_today)=${(totalCells ? (overallAgree / totalCells) * 100 : 0).toFixed(0)}%.`, + `Per-claim aggregates (JSON): ${JSON.stringify(perClaim, null, 2)}`, + `Unresolved-path errors: ${JSON.stringify(errors)}.`, + `All disagreement cells (JSON): ${JSON.stringify(disagreements, null, 2)}`, + ``, + `Required sections:`, + `1. Headline — repos, claims, overall agreement; a per-claim scorecard table: claim | agreement% | agree | fn_blind_spot | fn_alias_gap | fp_overclaim.`, + `2. Weakest claims — which claims have the lowest agreement and the dominant failure bucket for each.`, + `3. Disagreement details — grouped by claim; each row: project_key, bucket, evidence_quote, evidence_location, confidence, drifted.`, + `4. Drift summary — cells where drift_bucket != "claim_same", split claim_changed_drift vs claim_changed_nodrift.`, + `5. Prescriptive fixes — per claim. fn_blind_spot → content lives in a non-primary file (usually README): recommend choose_primary_context_file consider README.md. fn_alias_gap → the verifier found it in the PRIMARY file but the tool missed it; treat these as REVIEW CANDIDATES, not confirmed alias bugs. 
The verifier judges human-readability, so it may count text the tool's markdown parser correctly excluded. For each, cite the heading and list causes to check by hand: (a) the alias may ALREADY exist (verify before proposing it), (b) the section may be trapped in malformed markdown such as an unclosed code fence swallowing later headings, (c) the content may be below the tool's non-trivial-text threshold, (d) the verifier may have over-credited boilerplate. Only propose an alias addition for a heading plainly outside a normal alias set. fp_overclaim → flag the over-claim.`, +].join("\n"); + +const report = await agent(synthesisPrompt, { + label: "synthesis", + phase: "Synthesize", + model: "sonnet", +}); + +return { + report, + stats: { + repos: repoCount, + claims: CLAIM_FIELDS.length, + cells: totalCells, + overallAgreement: totalCells ? overallAgree / totalCells : 0, + perClaim, + }, + cells, +} diff --git a/scripts/run-instructions-audit.workflow.js b/scripts/run-instructions-audit.workflow.js new file mode 100644 index 0000000..902b6ad --- /dev/null +++ b/scripts/run-instructions-audit.workflow.js @@ -0,0 +1,141 @@ +// scripts/run-instructions-audit.workflow.js +// External audit of the snapshot's run_instructions_present claim. +// Stage 2 (verifier fan-out, Haiku) + Stage 3 (deterministic tally) + Stage 4 (Sonnet synthesis). +// args = output of `python -m src.run_instructions_audit` ({ generated_at, workspace_root, records, errors }). 
+export const meta = { + name: "run-instructions-audit", + description: + "Audit snapshot run_instructions_present claim against on-disk ground truth", + phases: [ + { + title: "Verify", + detail: "one Haiku subagent per pilot repo reads files and judges", + }, + { + title: "Synthesize", + detail: "one Sonnet call writes the markdown report", + }, + ], +}; + +const VERIFIER_SCHEMA = { + type: "object", + additionalProperties: false, + required: [ + "verdict", + "evidence_in_primary", + "evidence_quote", + "evidence_location", + "confidence", + ], + properties: { + verdict: { type: "boolean" }, + evidence_in_primary: { type: "boolean" }, + evidence_quote: { type: "string", maxLength: 240 }, + evidence_location: { type: "string" }, + confidence: { type: "string", enum: ["high", "med", "low"] }, + }, +}; + +const { generated_at, records, errors } = args; + +function verifierPrompt(rec) { + return [ + `You audit whether a project documents HOW TO RUN IT. Judge independently — you are NOT told the tool's answer.`, + `Project: ${rec.project_key}`, + `Directory (absolute): ${rec.abs_path}`, + `The tool treats "${rec.primary_file_name}" as the PRIMARY context file; it may be absent.`, + `Listed context files: ${JSON.stringify(rec.context_files)}.`, + ``, + `Do this:`, + `1. Read the primary file (if present), README.md, and the other listed context files, by absolute path under the directory.`, + `2. Decide: do these files genuinely tell a developer how to run/start the project — a run command, dev server, build+run steps, or quickstart? A bare dependency-install ("pip install", "## Installation" of deps) alone is NOT run instructions.`, + `3. If yes: verdict=true; quote the exact run command or heading (<=240 chars) in evidence_quote; set evidence_location like "CLAUDE.md §Usage" or "README §Getting Started".`, + `4. evidence_in_primary=true ONLY if that evidence is inside "${rec.primary_file_name}". 
If the primary file is absent, or the evidence is only in README/another file, set it false.`, + `5. If no run instructions exist anywhere, verdict=false with empty quote/location. Default to false when uncertain.`, + ].join("\n"); +} + +// --- Stage 3 tally logic (mirror of src/run_instructions_audit.py) --- +function assignBucket(toolToday, verdict, inPrimary) { + if (toolToday === verdict) return verdict ? "agree_present" : "agree_absent"; + if (verdict && !toolToday) + return inPrimary ? "fn_alias_gap" : "fn_blind_spot"; + return "fp_overclaim"; +} +function assignDrift(snapshotClaim, toolToday, drifted) { + if (snapshotClaim === toolToday) return "claim_same"; + return drifted ? "claim_changed_drift" : "claim_changed_nodrift"; +} + +phase("Verify"); +const verified = await parallel( + records.map( + (rec) => () => + agent(verifierPrompt(rec), { + label: `verify:${rec.project_key}`, + phase: "Verify", + model: "haiku", + agentType: "Explore", + schema: VERIFIER_SCHEMA, + }) + .then((v) => ({ rec, v })) + .catch(() => null), + ), +); + +const rows = verified.filter(Boolean).map(({ rec, v }) => ({ + project_key: rec.project_key, + primary_file_name: rec.primary_file_name, + snapshot_claim: rec.snapshot_claim, + tool_today: rec.tool_today, + drifted: rec.drifted, + ...v, + bucket: assignBucket(rec.tool_today, v.verdict, v.evidence_in_primary), + drift_bucket: assignDrift(rec.snapshot_claim, rec.tool_today, rec.drifted), +})); + +const counts = rows.reduce( + (acc, r) => ((acc[r.bucket] = (acc[r.bucket] || 0) + 1), acc), + {}, +); +const disagreements = rows.filter((r) => !r.bucket.startsWith("agree")); +const agreementRate = rows.length + ? (rows.length - disagreements.length) / rows.length + : 0; +log( + `Verified ${rows.length} repos — ${disagreements.length} disagreements, agreement ${(agreementRate * 100).toFixed(0)}%`, +); + +phase("Synthesize"); +const synthesisPrompt = [ + `Write a markdown audit report for the snapshot claim "run_instructions_present". 
Return ONLY markdown, no preamble.`, + `Facts: snapshot generated_at=${generated_at}; repos verified=${rows.length}; agreement rate (verifier vs tool_today)=${(agreementRate * 100).toFixed(0)}%.`, + `Bucket counts: ${JSON.stringify(counts)}.`, + `Unresolved-path errors: ${JSON.stringify(errors)}.`, + `Disagreement rows (JSON): ${JSON.stringify(disagreements, null, 2)}`, + ``, + `Required sections:`, + `1. Headline — repos, agreement rate, counts per bucket.`, + `2. Disagreements — a table keyed by project_key with columns: bucket, evidence_quote, evidence_location, confidence, drifted.`, + `3. Drift summary — count rows where drift_bucket != "claim_same", split claim_changed_drift (explained) vs claim_changed_nodrift (snapshot likely wrong).`, + `4. Prescriptive fixes — for fn_alias_gap rows, the exact headings to add to CONTEXT_SECTION_ALIASES; if any fn_blind_spot rows, recommend choose_primary_context_file consider README.md; for fp_overclaim, flag the over-claim.`, +].join("\n"); + +const report = await agent(synthesisPrompt, { + label: "synthesis", + phase: "Synthesize", + model: "sonnet", +}); + +return { + report, + stats: { + verified: rows.length, + agreementRate, + counts, + disagreements: disagreements.length, + errors: errors.length, + }, + rows, +} diff --git a/src/portfolio_context_contract.py b/src/portfolio_context_contract.py index 4a05b5e..4138069 100644 --- a/src/portfolio_context_contract.py +++ b/src/portfolio_context_contract.py @@ -143,11 +143,32 @@ def analyze_project_context( if primary_exists: primary_text = _read_small_text(project_path / primary_context_file) - sections = _split_markdown_sections(primary_text) + # README fallback: a project's real docs often live in README.md, which is never chosen + # as the primary context file. When the primary file lacks a required section, also look + # in the top-level README so well-documented repos are not scored blind. 
An explicit + # readme_text argument overrides the on-disk read (used in tests). + if not readme_text: + readme_path = project_path / "README.md" + if readme_path.is_file(): + readme_text = _read_small_text(readme_path) + has_readme = bool(readme_text.strip()) + + primary_sections = _split_markdown_sections(primary_text) + readme_sections = _split_markdown_sections(readme_text) if has_readme else {} section_presence = { - field: _section_has_meaningful_content(sections, aliases) + field: ( + _section_has_meaningful_content(primary_sections, aliases) + or _section_has_meaningful_content(readme_sections, aliases) + ) for field, aliases in CONTEXT_SECTION_ALIASES.items() } + # Lead-paragraph fallback: a project summary conventionally lives as the prose under the + # title, not under an "## Overview" heading. If no summary heading matched, accept a + # non-trivial lead paragraph from the primary file or the README. + if not section_presence["project_summary"]: + section_presence["project_summary"] = _has_lead_summary(primary_text) or _has_lead_summary( + readme_text + ) missing_fields = [ DERIVED_KEY_TO_LABEL[REQUIRED_FIELD_TO_DERIVED_KEY[field]] for field, present in section_presence.items() @@ -159,7 +180,7 @@ def analyze_project_context( if Path(item).name in SUPPORTING_CONTEXT_FILES and Path(item).name != primary_context_file ) - if not primary_exists: + if not primary_exists and not has_readme: context_quality = "none" elif missing_fields: context_quality = "boilerplate" @@ -277,3 +298,26 @@ def _is_nontrivial_text(text: str) -> bool: return False words = re.findall(r"[A-Za-z0-9][A-Za-z0-9+./:_-]*", compact) return len(words) >= 4 and len(compact) >= 24 + + +def _lead_paragraph_text(text: str) -> str: + """Prose between the H1 title and the first level-2+ heading — a doc's lead/intro. + + Strips the title line and badge/image/link markdown so a wall of badges does not read as a + summary. 
Project summaries conventionally live here rather than under an "## Overview". + """ + lead_lines: list[str] = [] + for line in text.splitlines(): + if re.match(r"^#{2,}\s", line): # first '## ...' (or deeper) heading ends the lead + break + if re.match(r"^#\s", line): # the H1 title line itself + continue + lead_lines.append(line) + lead = "\n".join(lead_lines) + lead = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", lead) # drop images/badges + lead = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", lead) # keep link text, drop URLs + return lead + + +def _has_lead_summary(text: str) -> bool: + return _is_nontrivial_text(_lead_paragraph_text(text)) diff --git a/src/run_instructions_audit.py b/src/run_instructions_audit.py new file mode 100644 index 0000000..ff415a2 --- /dev/null +++ b/src/run_instructions_audit.py @@ -0,0 +1,156 @@ +"""External audit of the snapshot's run_instructions_present claim (pre-step). + +Stage 0 (stratified pilot selection) + Stage 1 (evidence prep + live tool_today +recompute) run here as a deterministic, read-only pre-step. The compact JSON this +emits is consumed as `args` by scripts/run-instructions-audit.workflow.js, whose +Haiku subagents read the repo files and judge. Never writes repos/snapshot/git. +""" + +from __future__ import annotations + +import json +import re +import subprocess +from datetime import datetime +from pathlib import Path + +from src.portfolio_context_contract import ( + analyze_project_context, + choose_primary_context_file, +) +from src.portfolio_truth_sources import _collect_context_files + +# The six presence claims the snapshot derives per repo, in logical reading order. +# These names match both snapshot `derived.` and ContextAnalysis attributes. 
# The six presence claims audited per repo. build_record reads each name from the
# snapshot's `derived` dict and compute_tool_today reads it as an analysis attribute.
CLAIM_FIELDS = (
    "project_summary_present",
    "current_state_present",
    "stack_present",
    "run_instructions_present",
    "known_risks_present",
    "next_recommended_move_present",
)

# Regexes matched against a project's path; any hit excludes the project from the pilot.
FORK_JUNK_PATTERNS = (r"-security-fix", r"-cve-", r"-backup-", r"\.bundle$", r"-openssl-")
# Default number of repos sampled from each context_quality tier.
DEFAULT_PER_TIER = {
    "none": 3,
    "boilerplate": 4,
    "minimum-viable": 4,
    "standard": 4,
    "full": 4,
}


def is_fork_junk(path: str) -> bool:
    """Return True when ``path`` matches any of FORK_JUNK_PATTERNS."""
    return any(re.search(pattern, path) for pattern in FORK_JUNK_PATTERNS)


def assign_bucket(tool_today: bool, verdict: bool, evidence_in_primary: bool) -> str:
    """Classify one verifier-vs-tool comparison.

    Returns:
        ``agree_present`` / ``agree_absent`` when both agree; ``fn_alias_gap`` when the
        verifier found evidence in the primary file that the tool missed;
        ``fn_blind_spot`` when that evidence lives outside the primary file;
        ``fp_overclaim`` when the tool claims presence the verifier did not confirm.
    """
    if tool_today == verdict:
        return "agree_present" if verdict else "agree_absent"
    if verdict and not tool_today:
        return "fn_alias_gap" if evidence_in_primary else "fn_blind_spot"
    return "fp_overclaim"


def assign_drift_bucket(snapshot_claim: bool, tool_today: bool, repo_drifted: bool) -> str:
    """Classify whether the snapshot's claim still matches today's recompute.

    Returns:
        ``claim_same`` when they match; otherwise ``claim_changed_drift`` if the repo
        has commits after the snapshot, else ``claim_changed_nodrift``.
    """
    if snapshot_claim == tool_today:
        return "claim_same"
    return "claim_changed_drift" if repo_drifted else "claim_changed_nodrift"


def select_pilot(
    projects: list[dict], *, per_tier: dict[str, int] | None = None
) -> list[dict]:
    """Pick a stratified, deterministic pilot sample from the snapshot's projects.

    Archived projects and fork-junk paths are excluded. Within each tier the
    projects are sorted by ``project_key`` and the first ``count`` are taken.

    Args:
        projects: Snapshot project dicts with ``identity`` and ``derived`` sub-dicts.
        per_tier: Map of context_quality tier -> sample size. Defaults to
            DEFAULT_PER_TIER. A tier absent from the map is not sampled.

    Returns:
        The selected project dicts, grouped in ``per_tier`` iteration order.
    """
    # Resolve the default here instead of sharing one mutable dict as a default argument.
    quotas = DEFAULT_PER_TIER if per_tier is None else per_tier
    eligible = [
        p
        for p in projects
        # .get(): a project missing registry_status is simply treated as not archived.
        if p["derived"].get("registry_status") != "archived"
        and not is_fork_junk(p["identity"]["path"])
    ]
    selected: list[dict] = []
    for tier, count in quotas.items():
        tier_projects = sorted(
            (p for p in eligible if p["derived"].get("context_quality") == tier),
            key=lambda p: p["identity"]["project_key"],
        )
        # max(): a negative quota must select nothing, not slice from the end.
        selected.extend(tier_projects[: max(count, 0)])
    return selected


def build_record(project: dict, workspace_root: str) -> dict:
    """Build the compact per-repo record handed to the verifier workflow.

    Args:
        project: One snapshot project dict (``identity`` + ``derived``).
        workspace_root: Directory the project's relative ``path`` is resolved against.

    Returns:
        A dict with the project key/name, absolute path, primary context file name,
        listed context files, and ``snapshot_claims`` — a bool per CLAIM_FIELDS entry
        (False when the snapshot lacks the field).
    """
    path = project["identity"]["path"]
    derived = project["derived"]
    context_files = derived["context_files"]
    return {
        "project_key": project["identity"]["project_key"],
        "display_name": project["identity"]["display_name"],
        "abs_path": str(Path(workspace_root) / path),
        "primary_file_name": choose_primary_context_file(context_files),
        "context_files": context_files,
        "snapshot_claims": {field: bool(derived.get(field, False)) for field in CLAIM_FIELDS},
    }


def compute_tool_today(abs_path: str) -> dict:
    """Re-run the context analysis on disk and return a bool per CLAIM_FIELDS entry."""
    project_path = Path(abs_path)
    analysis = analyze_project_context(project_path, _collect_context_files(project_path))
    return {field: bool(getattr(analysis, field)) for field in CLAIM_FIELDS}


def _parse_iso(value: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    A trailing ``Z`` is rewritten to ``+00:00`` because ``datetime.fromisoformat``
    only accepts it from Python 3.11 on. A timestamp without an offset is assumed to
    be UTC so that it can be compared with offset-carrying ones.

    Raises:
        ValueError: If ``value`` is not an ISO-8601 timestamp.
    """
    from datetime import timezone

    text = value.strip()
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def is_after(commit_iso: str, generated_at_iso: str) -> bool:
    """Return True when ``commit_iso`` is strictly later than ``generated_at_iso``.

    Both arguments are ISO-8601 strings; offsets may differ, may be written as ``Z``,
    or may be missing (then UTC is assumed).

    Raises:
        ValueError: If either argument cannot be parsed.
    """
    return _parse_iso(commit_iso) > _parse_iso(generated_at_iso)


def compute_drifted(abs_path: str, generated_at: str) -> bool:
    """Return True when the repo's latest commit is newer than the snapshot.

    Runs ``git log -1 --format=%cI`` in ``abs_path``. Best-effort: if git cannot be
    run, times out, exits non-zero (e.g. not a repository) or prints nothing, the
    repo is reported as not drifted.
    """
    try:
        result = subprocess.run(
            ["git", "-C", abs_path, "log", "-1", "--format=%cI"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (subprocess.SubprocessError, OSError):
        return False
    commit_iso = result.stdout.strip()
    if result.returncode != 0 or not commit_iso:
        return False
    return is_after(commit_iso, generated_at)


def prepare_pilot(snapshot_path: str, *, per_tier: dict[str, int] | None = None) -> dict:
    """Load the snapshot, select the pilot, and attach live evidence to each record.

    Args:
        snapshot_path: Path to the snapshot JSON (needs ``workspace_root``,
            ``generated_at`` and ``projects``).
        per_tier: Optional tier -> sample size override, forwarded to select_pilot.

    Returns:
        ``{"generated_at", "workspace_root", "records", "errors"}``. Each record is a
        build_record dict plus ``tool_today`` and ``drifted``. Projects whose directory
        does not exist are reported in ``errors`` with ``error == "missing_dir"``.
    """
    # Explicit encoding: do not depend on the platform's locale default.
    snapshot = json.loads(Path(snapshot_path).read_text(encoding="utf-8"))
    workspace_root = snapshot["workspace_root"]
    generated_at = snapshot["generated_at"]
    records: list[dict] = []
    errors: list[dict] = []
    for project in select_pilot(snapshot["projects"], per_tier=per_tier):
        record = build_record(project, workspace_root)
        if not Path(record["abs_path"]).is_dir():
            errors.append(
                {
                    "project_key": record["project_key"],
                    "abs_path": record["abs_path"],
                    "error": "missing_dir",
                }
            )
            continue
        record["tool_today"] = compute_tool_today(record["abs_path"])
        record["drifted"] = compute_drifted(record["abs_path"], generated_at)
        records.append(record)
    return {
        "generated_at": generated_at,
        "workspace_root": workspace_root,
        "records": records,
        "errors": errors,
    }


def main() -> None:
    """CLI entry point: print the pilot JSON for the snapshot given as argv[1]."""
    import sys

    snapshot_path = sys.argv[1] if len(sys.argv) > 1 else "output/portfolio-truth-latest.json"
    print(json.dumps(prepare_pilot(snapshot_path), indent=2))


if __name__ == "__main__":
    main()
# Stand-in for a generic Codex-OS-style AGENTS.md: it carries none of the
# Portfolio-Context sections and gives no run guidance.
_GENERIC_AGENTS = (
    "# codex-os-managed\n\n"
    "## Communication Contract\n\n"
    "Follow the global contract for all changes made in this repository.\n"
)
_RUN_SECTION = "## Usage\n\nRun the dev server with `npm run dev` to start the app.\n"


def _write(dir_path: Path, name: str, text: str) -> None:
    """Write ``text`` to the file ``name`` inside ``dir_path``."""
    target = dir_path / name
    target.write_text(text)


def test_readme_fallback_marks_run_instructions_present(tmp_path):
    """Primary (AGENTS.md) lacks run instructions; the README carries them."""
    _write(tmp_path, "AGENTS.md", _GENERIC_AGENTS)
    _write(tmp_path, "README.md", "# Proj\n\n" + _RUN_SECTION)
    analysis = analyze_project_context(tmp_path, ["AGENTS.md", "README.md"])
    assert analysis.run_instructions_present is True


def test_no_fallback_when_neither_documents_it(tmp_path):
    """Neither file says how to run it, so the fallback must not invent a positive."""
    _write(tmp_path, "AGENTS.md", _GENERIC_AGENTS)
    _write(tmp_path, "README.md", "# Proj\n\nA short blurb with no run guidance whatsoever.\n")
    analysis = analyze_project_context(tmp_path, ["AGENTS.md", "README.md"])
    assert analysis.run_instructions_present is False


def test_primary_still_detected_without_readme(tmp_path):
    """Run instructions in the primary file are still found when there is no README."""
    _write(tmp_path, "AGENTS.md", "# Proj\n\n" + _RUN_SECTION)
    analysis = analyze_project_context(tmp_path, ["AGENTS.md"])
    assert analysis.run_instructions_present is True


def test_context_quality_not_none_for_readme_only_repo(tmp_path):
    """No CLAUDE.md/AGENTS.md, but a rich README documents all six sections."""
    readme_sections = [
        "# Proj\n\n",
        "## Overview\n\nThis project does a specific useful thing for its users.\n\n",
        "## Status\n\nCurrently in active development with the core features done.\n\n",
        "## Tech Stack\n\nBuilt with Python, FastAPI, and SQLite for storage.\n\n",
        "## Usage\n\nRun it locally with `uvicorn app:main` after installing deps.\n\n",
        "## Known Risks\n\nRate limits and no auth yet are the main known gaps.\n\n",
        "## Next Steps\n\nAdd authentication and ship the first tagged release.\n",
    ]
    _write(tmp_path, "README.md", "".join(readme_sections))
    analysis = analyze_project_context(tmp_path, ["README.md"])
    assert analysis.context_quality != "none"  # README content now counts
    assert analysis.run_instructions_present is True


def test_explicit_readme_text_override_is_honored(tmp_path):
    """An explicit readme_text argument is used even though no README exists on disk."""
    _write(tmp_path, "AGENTS.md", _GENERIC_AGENTS)
    override = "# Proj\n\n" + _RUN_SECTION
    analysis = analyze_project_context(tmp_path, ["AGENTS.md"], readme_text=override)
    assert analysis.run_instructions_present is True


# --- Layer-2: lead-paragraph project summaries (no "## Overview" heading) ---


def test_lead_paragraph_counts_as_project_summary(tmp_path):
    """The summary is the tagline under the title; no Overview-style heading exists."""
    readme = (
        "# Proj\n\nA real-time strategy game where every decision happens at once.\n\n"
        "## Install\n\nRun npm install first to set things up.\n"
    )
    _write(tmp_path, "AGENTS.md", _GENERIC_AGENTS)
    _write(tmp_path, "README.md", readme)
    analysis = analyze_project_context(tmp_path, ["AGENTS.md", "README.md"])
    assert analysis.project_summary_present is True


def test_badge_only_lead_is_not_a_summary(tmp_path):
    """A wall of badges/links before the first section must not count as a summary."""
    readme = (
        "# Proj\n\n![CI](https://x/ci.svg) ![Coverage](https://x/cov.svg)\n\n"
        "## Install\n\nRun npm install first to set things up.\n"
    )
    _write(tmp_path, "AGENTS.md", _GENERIC_AGENTS)
    _write(tmp_path, "README.md", readme)
    analysis = analyze_project_context(tmp_path, ["AGENTS.md", "README.md"])
    assert analysis.project_summary_present is False


def test_overview_section_still_wins_without_lead(tmp_path):
    """The heading-alias path keeps working when there is no lead paragraph."""
    readme = "# Proj\n\n## Overview\n\nThis project does a specific, clearly described useful thing.\n"
    _write(tmp_path, "README.md", readme)
    analysis = analyze_project_context(tmp_path, ["README.md"])
    assert analysis.project_summary_present is True


def test_lead_paragraph_in_primary_file_counts(tmp_path):
    """A lead paragraph in the primary file (CLAUDE.md) also counts."""
    claude_md = (
        "# Proj\n\nA concise description of what this tool is and who it serves.\n\n"
        "## Setup\n\nInstall the dependencies to begin.\n"
    )
    _write(tmp_path, "CLAUDE.md", claude_md)
    analysis = analyze_project_context(tmp_path, ["CLAUDE.md"])
    assert analysis.project_summary_present is True
def test_is_fork_junk_flags_known_patterns():
    """Known fork/backup path shapes are flagged; ordinary project paths are not."""
    junk_paths = (
        "AssistSupport-openssl-cve-2026-42327",
        "BrowserHistoryVisualizer-security-fix",
        "ApplyKit-private-history-backup-20260517.bundle",
    )
    clean_paths = ("Fun:GamePrjs/BattleGrid", "mcpforge")
    for junk in junk_paths:
        assert is_fork_junk(junk)
    for clean in clean_paths:
        assert not is_fork_junk(clean)


def test_assign_bucket_truth_table():
    """Every (tool_today, verdict, evidence_in_primary) combination under test."""
    # agreement
    assert assign_bucket(True, True, True) == "agree_present"
    assert assign_bucket(False, False, False) == "agree_absent"
    # false negatives (tool said absent, verifier found it)
    assert assign_bucket(False, True, True) == "fn_alias_gap"  # evidence in primary file
    assert assign_bucket(False, True, False) == "fn_blind_spot"  # evidence only in README/other
    # false positive (tool over-claimed)
    assert assign_bucket(True, False, False) == "fp_overclaim"
    assert assign_bucket(True, False, True) == "fp_overclaim"


def test_assign_drift_bucket():
    """Snapshot-vs-recompute classification, with and without commits since the snapshot."""
    # snapshot still matches today's recompute → no field drift
    assert assign_drift_bucket(True, True, True) == "claim_same"
    assert assign_drift_bucket(False, False, False) == "claim_same"
    # field value changed AND repo has commits since snapshot → explained by drift
    assert assign_drift_bucket(False, True, True) == "claim_changed_drift"
    # field value changed with NO commits since snapshot → unexplained (snapshot was wrong)
    assert assign_drift_bucket(False, True, False) == "claim_changed_nodrift"


def _project(key, quality, *, status="active", path=None):
    """Build a minimal snapshot project dict for the selection tests."""
    identity = {"project_key": key, "path": path or key, "display_name": key.split("/")[-1]}
    derived = {
        "registry_status": status,
        "context_quality": quality,
        "context_files": ["CLAUDE.md"],
        "run_instructions_present": False,
    }
    return {"identity": identity, "derived": derived}


def test_select_pilot_stratifies_sorts_and_filters():
    """Selection caps each tier, sorts by key, and drops archived and fork-junk projects."""
    projects = [
        *(_project(f"b{i}", "boilerplate") for i in range(6)),
        _project("n1", "none"),
        _project("n2", "none"),
        _project("arch", "full", status="archived"),
        _project("junk-security-fix", "full"),
        _project("z-full", "full"),
        _project("a-full", "full"),
    ]
    selected = select_pilot(projects, per_tier={"none": 3, "boilerplate": 4, "full": 4})
    keys = [entry["identity"]["project_key"] for entry in selected]

    # archived + fork-junk excluded
    assert "arch" not in keys
    assert "junk-security-fix" not in keys
    # boilerplate capped at 4 of 6
    assert sum(k.startswith("b") for k in keys) == 4
    # full sorted by project_key → a-full before z-full
    assert keys.index("a-full") < keys.index("z-full")
    # both 'none' present (only 2 available, asked for 3)
    assert {"n1", "n2"} <= set(keys)


def test_build_record_resolves_path_and_all_six_claims():
    """The record resolves the absolute path and maps every claim field from `derived`."""
    project = {
        "identity": {
            "project_key": "Fun:GamePrjs/BattleGrid",
            "path": "Fun:GamePrjs/BattleGrid",
            "display_name": "BattleGrid",
        },
        "derived": {
            "context_files": ["AGENTS.md", "README.md"],
            "run_instructions_present": False,
            "project_summary_present": True,  # one set True to prove per-field mapping
        },
    }
    record = build_record(project, "/Users/d/Projects")
    claims = record["snapshot_claims"]

    assert record["abs_path"] == "/Users/d/Projects/Fun:GamePrjs/BattleGrid"
    assert record["primary_file_name"] == "AGENTS.md"  # no CLAUDE.md → AGENTS.md
    assert record["context_files"] == ["AGENTS.md", "README.md"]
    assert record["project_key"] == "Fun:GamePrjs/BattleGrid"
    # snapshot_claims is a dict over all 6 fields; missing derived fields default False
    assert set(claims) == set(CLAIM_FIELDS)
    assert claims["run_instructions_present"] is False
    assert claims["project_summary_present"] is True
    assert claims["known_risks_present"] is False  # absent in derived → False


def test_build_record_prefers_claude_md():
    """CLAUDE.md is chosen as the primary file when both candidates are listed."""
    project = {
        "identity": {"project_key": "x", "path": "x", "display_name": "x"},
        "derived": {"context_files": ["AGENTS.md", "CLAUDE.md"], "run_instructions_present": True},
    }
    record = build_record(project, "/w")
    assert record["primary_file_name"] == "CLAUDE.md"


def test_is_after_compares_tz_aware_iso():
    """Timestamps with different UTC offsets are compared as instants."""
    snapshot_time = "2026-05-17T05:01:39+00:00"
    assert is_after("2026-05-25T19:25:00-07:00", snapshot_time)
    assert not is_after("2026-05-10T00:00:00+00:00", snapshot_time)


def test_compute_tool_today_returns_all_six_claims_true_when_alias_matches(tmp_path):
    """ "## Usage" is a run_instructions alias, so run_instructions_present is True."""
    claude_md = tmp_path / "CLAUDE.md"
    claude_md.write_text(
        "# Proj\n\n## Usage\n\nRun the dev server with `npm run dev`. It serves on :3000.\n"
    )
    result = compute_tool_today(str(tmp_path))
    assert set(result) == set(CLAIM_FIELDS)  # dict over all 6 claims
    assert result["run_instructions_present"] is True
    assert result["known_risks_present"] is False  # not documented → False


def test_compute_tool_today_false_when_run_heading_outside_alias(tmp_path):
    """ "## Running" is not in the alias list, so the tool misses it (alias-gap case)."""
    claude_md = tmp_path / "CLAUDE.md"
    claude_md.write_text(
        "# Proj\n\n## Running\n\nStart it with `npm run dev`. This is genuine run guidance.\n"
    )
    result = compute_tool_today(str(tmp_path))
    assert result["run_instructions_present"] is False


def _git_commit_at(path, iso):
    """Initialise a git repo at ``path`` with one empty commit dated ``iso``."""
    identity = {
        "GIT_AUTHOR_DATE": iso,
        "GIT_COMMITTER_DATE": iso,
        "GIT_AUTHOR_NAME": "t",
        "GIT_AUTHOR_EMAIL": "t@t",
        "GIT_COMMITTER_NAME": "t",
        "GIT_COMMITTER_EMAIL": "t@t",
    }
    env = {**os.environ, **identity}
    subprocess.run(["git", "init", "-q"], cwd=path, check=True)
    subprocess.run(
        ["git", "commit", "-q", "--allow-empty", "-m", "x"], cwd=path, env=env, check=True
    )


def test_compute_drifted_true_when_commit_after_snapshot(tmp_path):
    """A commit dated after the snapshot marks the repo as drifted."""
    _git_commit_at(tmp_path, "2026-05-25T00:00:00+00:00")
    drifted = compute_drifted(str(tmp_path), "2026-05-17T05:01:39+00:00")
    assert drifted is True


def test_compute_drifted_false_when_commit_before_snapshot(tmp_path):
    """A commit dated before the snapshot does not count as drift."""
    _git_commit_at(tmp_path, "2026-05-01T00:00:00+00:00")
    drifted = compute_drifted(str(tmp_path), "2026-05-17T05:01:39+00:00")
    assert drifted is False


def test_compute_drifted_false_for_non_git_dir(tmp_path):
    """A directory that is not a git repository is reported as not drifted."""
    drifted = compute_drifted(str(tmp_path), "2026-05-17T05:01:39+00:00")
    assert drifted is False


def test_prepare_pilot_builds_records_and_reports_missing_dirs(tmp_path):
    """One project exists on disk and becomes a record; the other is a missing_dir error."""
    workspace = tmp_path / "ws"
    real = workspace / "RealRepo"
    real.mkdir(parents=True)
    (real / "CLAUDE.md").write_text("# R\n\n## Usage\n\nRun `npm run dev` to start the server.\n")

    def snapshot_project(name, run_claim):
        return {
            "identity": {
                "project_key": name,
                "path": name,
                "display_name": name,
            },
            "derived": {
                "registry_status": "active",
                "context_quality": "full",
                "context_files": ["CLAUDE.md"],
                "run_instructions_present": run_claim,
            },
        }

    snapshot = {
        "workspace_root": str(workspace),
        "generated_at": "2026-05-17T05:01:39+00:00",
        "projects": [
            snapshot_project("RealRepo", True),
            snapshot_project("GhostRepo", False),
        ],
    }
    snap_path = tmp_path / "snap.json"
    snap_path.write_text(json.dumps(snapshot))

    result = prepare_pilot(str(snap_path), per_tier={"full": 4})
    records, errors = result["records"], result["errors"]

    assert result["workspace_root"] == str(workspace)
    assert len(records) == 1
    assert len(errors) == 1
    record = records[0]
    assert record["project_key"] == "RealRepo"
    # tool_today + snapshot_claims are dicts over all 6 claims
    assert record["tool_today"]["run_instructions_present"] is True  # live recompute on fixture
    assert record["snapshot_claims"]["run_instructions_present"] is True
    assert set(record["tool_today"]) == set(CLAIM_FIELDS)
    assert record["drifted"] is False  # no git repo → not drifted
    assert errors[0]["error"] == "missing_dir"
    assert errors[0]["project_key"] == "GhostRepo"