From e0ece3efa8e19d067a34c58fb574f28923134716 Mon Sep 17 00:00:00 2001
From: Mickael Farina <farina.mickael@gmail.com>
Date: Mon, 4 May 2026 10:04:12 +0200
Subject: [PATCH] hotfix: plan-time retry on hallucinated skill names (PR #41)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User repro 2026-05-04 09:58:
  Project: "Read all markdown files in ~/codec-repo/docs/ and create
           an index.md that lists each file with its first heading and
           a one-line description"
  Result:  Plan failed: plan invalid: plan references unknown skills:
           ['file_read']

Same hallucination CLASS as PR #35 but at a different LAYER.
PR #35 fixed retries during execution (codec_agent_runner). This is
failing earlier — at plan validation, before the plan is even saved.
The user never even got to the approve-or-reject step.

Root cause:
Qwen drafts plans naming skills that don't exist. `file_read` and
`fetch_url` are the two we've seen. The actual file-reading skill is
`file_ops` (which reads, writes, appends, lists). The actual URL fetch
is `web_fetch`. The user-visible result was the same as PR #35 —
project mode dies before doing anything useful.

Fix (mirrors PR #35's pattern at planning layer):
1. After validate_plan_skills returns ok=False, instead of raising,
   build a corrective prompt listing the missing skills, the FULL
   allowed registry, and the three most common confusions
   (file_read→file_ops, fetch_url→web_fetch, read_file→file_ops).
2. Re-call _qwen_chat ONCE with the appended correction.
3. Re-validate the second draft. If valid, use it. If not, raise
   with BOTH attempts in the message so the user sees the model is
   consistently confused (vs a one-off transient miss).
4. If the retry call itself fails (Qwen flakes between attempts),
   raise with the ORIGINAL validation error — more diagnostic than
   "qwen flaked on retry".

Also:
- Strengthen _PLAN_SYSTEM_PROMPT with the same three confusion hints
  so the FIRST draft is more likely to succeed (cuts the retry rate).

Tests (3 new in tests/test_agent_plan.py — all pass):
- test_draft_plan_retries_on_hallucinated_skill_then_succeeds
  Reproduces the exact user case: file_read on attempt 1, file_ops
  on attempt 2, plan succeeds.
- test_draft_plan_retry_also_fails_raises_with_both_attempts
  Both attempts hallucinate (file_read, then read_file): error
  message contains both for diagnostic value.
- test_draft_plan_retry_qwen_unavailable_surfaces_original_error
  Retry call raises ConnectionError: original validation error
  surfaces with "retry failed" appended.

All 3 existing draft_plan tests still pass — backward-compat preserved.
The existing test_draft_plan_rejects_unknown_skill now exercises BOTH
attempts (fake_qwen_chat returns same bad plan each time) and still
raises with the missing skill in the message.

Total: 35/35 file pass + 7 pre-existing pynput env failures (unchanged).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 codec_agent_plan.py      |  69 +++++++++++++++++++++++--
 tests/test_agent_plan.py | 107 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+), 4 deletions(-)

diff --git a/codec_agent_plan.py b/codec_agent_plan.py
index adccec5..cd19f46 100644
--- a/codec_agent_plan.py
+++ b/codec_agent_plan.py
@@ -326,6 +326,7 @@ def _qwen_chat(user_prompt: str, system_prompt: str = "",
 Rules:
 - Output ONLY valid JSON. No prose before or after.
 - skills_needed MUST be skill names from the user-supplied registry list. Never invent skill names.
+  Common confusions to avoid: there is NO `file_read` skill (use `file_ops`, which reads, writes, appends, lists). There is NO `fetch_url` (use `web_fetch`). There is NO `read_file` (use `file_ops`). If you can't find an exact match in the registry list, pick the closest match — never invent.
 - write_paths default to ~/.codec/agents/{agent_id}/artifacts/** unless the project explicitly requires writing elsewhere.
 - destructive_ops list any irreversible operations (deletes, payments, sending emails on user's behalf). They will require additional consent at runtime.
 - estimated_duration_minutes is your best honest guess.
@@ -406,12 +407,72 @@ def draft_plan(agent_id: str, description: str, registry=None,
         raise PlanValidationError(f"plan schema invalid: {e}")
 
     ok, missing = validate_plan_skills(plan, registry=registry)
-    if not ok:
+    if ok:
+        return plan
+
+    # PR #41: plan-time hallucination retry. Mirror of the execution-time
+    # retry shipped in PR #35 (codec_agent_runner._build_correction_nudge).
+    # Real-world Qwen drift hits both layers — at execution it picks
+    # `fetch_url` instead of `web_fetch`; at planning it picks `file_read`
+    # instead of `file_ops`. Same cure: re-prompt ONCE with an explicit
+    # closed-world correction, fail hard if the second draft still misses.
+    log.info("[%s] plan referenced unknown skills %s; retrying with correction nudge",
+             agent_id, missing)
+    correction = (
+        f"\n\nYour previous draft referenced these skills which DO NOT EXIST "
+        f"in the registry: {sorted(missing)}.\n"
+        f"You MUST pick from this exact list — do not invent names, do not "
+        f"add suffixes (no _v2, no _read, no _write versions of unrelated "
+        f"skills). The full allowed set is:\n"
+        f"  {', '.join(available_skills)}\n\n"
+        f"Common confusions:\n"
+        f"  - Need to read a file? Use `file_ops` (it reads, writes, "
+        f"appends, lists). There is NO `file_read` skill.\n"
+        f"  - Need to fetch a URL? Use `web_fetch`. There is NO `fetch_url`.\n"
+        f"  - Need to search files? Use `file_search`.\n\n"
+        f"Re-emit the entire JSON plan with valid skill names only."
+    )
+    retry_prompt = user_prompt + correction
+    try:
+        raw2 = _qwen_chat(retry_prompt, _PLAN_SYSTEM_PROMPT)
+    except (QwenUnavailableError, ConnectionError, OSError, RuntimeError) as e:
+        # If the retry call itself fails, surface the ORIGINAL validation
+        # error — that's more diagnostic than "qwen flaked on retry".
         raise PlanValidationError(
-            f"plan references unknown skills: {missing}"
+            f"plan references unknown skills: {missing} "
+            f"(retry failed: {e})"
         )
-
-    return plan
+    raw2 = raw2.strip()
+    if raw2.startswith("```"):
+        raw2 = re.sub(r"^```(?:json)?\s*", "", raw2)
+        raw2 = re.sub(r"\s*```\s*$", "", raw2)
+    try:
+        d2 = json.loads(raw2)
+    except json.JSONDecodeError as e:
+        raise PlanValidationError(
+            f"plan references unknown skills: {missing} "
+            f"(retry returned non-JSON: {e})"
+        )
+    if d2.get("too_vague"):
+        raise PlanValidationError("too_vague: description needs clarification")
+    d2.setdefault("schema", PLAN_SCHEMA_VERSION)
+    d2.setdefault("agent_id", agent_id)
+    for cp in d2.get("checkpoints", []):
+        cp.setdefault("id", _stable_checkpoint_id(cp))
+    try:
+        plan2 = plan_from_dict(d2)
+    except (KeyError, ValueError, TypeError) as e:
+        raise PlanValidationError(f"retry plan schema invalid: {e}")
+    ok2, missing2 = validate_plan_skills(plan2, registry=registry)
+    if not ok2:
+        # Second miss — give up. Surface BOTH attempts so the user can see
+        # the model is consistently confused (e.g. truly unfixable phrasing).
+        raise PlanValidationError(
+            f"plan references unknown skills after retry: "
+            f"first={sorted(missing)}, second={sorted(missing2)}"
+        )
+    log.info("[%s] retry succeeded; using corrected plan", agent_id)
+    return plan2
 
 
 def _stable_checkpoint_id(cp_dict: Dict[str, Any]) -> str:
diff --git a/tests/test_agent_plan.py b/tests/test_agent_plan.py
index bc33490..e04bf4d 100644
--- a/tests/test_agent_plan.py
+++ b/tests/test_agent_plan.py
@@ -287,6 +287,113 @@ def raise_connection(*a, **k):
         )
 
 
+# ─────────────────────────────────────────────────────────────────────────────
+# PR #41 — Plan-time hallucination retry (3 tests)
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _make_plan_response(skill_name):
+    """Build a minimal valid-shape plan JSON using one skill name. Used by
+    the retry tests below."""
+    return json.dumps({
+        "goals": ["read markdown files"],
+        "checkpoints": [
+            {"title": "scan", "description": "list and read",
+             "skills_needed": [skill_name],
+             "expected_output": "index.md written", "step_budget": 30},
+        ],
+        "permission_manifest": {
+            "read_paths": ["~/codec-repo/docs/**"],
+            "write_paths": ["~/codec-repo/docs/index.md"],
+            "network_domains": [],
+            "skills": [skill_name], "destructive_ops": [],
+        },
+        "estimated_duration_minutes": 5, "assumptions": [],
+    })
+
+
+def test_draft_plan_retries_on_hallucinated_skill_then_succeeds(monkeypatch):
+    """Real reproducer from 2026-05-04 09:58: user asked CODEC to read
+    markdown files; Qwen drafted a plan with skill `file_read` (does not
+    exist; correct skill is `file_ops`). PR #41 retries ONCE with a
+    correction nudge — second draft picks `file_ops` and succeeds."""
+    import codec_agent_plan as cap
+
+    calls = {"n": 0}
+    def fake_qwen(prompt, *a, **k):
+        calls["n"] += 1
+        if calls["n"] == 1:
+            return _make_plan_response("file_read")    # hallucinated
+        return _make_plan_response("file_ops")          # corrected
+    monkeypatch.setattr(cap, "_qwen_chat", fake_qwen)
+
+    fake_registry = MagicMock()
+    fake_registry.names.return_value = ["file_ops", "file_search"]
+
+    plan = cap.draft_plan(
+        agent_id="agent_test",
+        description="Read markdown files in docs and create an index",
+        registry=fake_registry,
+    )
+    # The corrected plan was used
+    assert calls["n"] == 2
+    assert plan.checkpoints[0].skills_needed == ["file_ops"]
+
+
+def test_draft_plan_retry_also_fails_raises_with_both_attempts(monkeypatch):
+    """If the second attempt ALSO hallucinates, raise with both attempts
+    in the error message so the user can see Qwen is consistently confused
+    (vs a one-off transient miss)."""
+    import codec_agent_plan as cap
+
+    responses = iter([
+        _make_plan_response("file_read"),    # first miss
+        _make_plan_response("read_file"),    # second miss (different bad name)
+    ])
+    monkeypatch.setattr(cap, "_qwen_chat", lambda *a, **k: next(responses))
+
+    fake_registry = MagicMock()
+    fake_registry.names.return_value = ["file_ops"]
+
+    with pytest.raises(cap.PlanValidationError) as exc_info:
+        cap.draft_plan(
+            agent_id="agent_test",
+            description="some project",
+            registry=fake_registry,
+        )
+    msg = str(exc_info.value)
+    assert "after retry" in msg
+    assert "file_read" in msg     # first attempt
+    assert "read_file" in msg     # second attempt
+
+
+def test_draft_plan_retry_qwen_unavailable_surfaces_original_error(monkeypatch):
+    """If the retry call itself fails (Qwen flakes between attempts), surface
+    the ORIGINAL validation error — that's more diagnostic than 'qwen flaked
+    on retry'."""
+    import codec_agent_plan as cap
+
+    calls = {"n": 0}
+    def flaky(*a, **k):
+        calls["n"] += 1
+        if calls["n"] == 1:
+            return _make_plan_response("file_read")
+        raise ConnectionError("qwen died between attempts")
+    monkeypatch.setattr(cap, "_qwen_chat", flaky)
+
+    fake_registry = MagicMock()
+    fake_registry.names.return_value = ["file_ops"]
+
+    with pytest.raises(cap.PlanValidationError) as exc_info:
+        cap.draft_plan(
+            agent_id="agent_test",
+            description="x",
+            registry=fake_registry,
+        )
+    msg = str(exc_info.value)
+    assert "file_read" in msg            # original missing skill present
+    assert "retry failed" in msg         # cause noted
+
+
 # ─────────────────────────────────────────────────────────────────────────────
 # Task 7 — Vague-description clarifying loop (Q3) (2 tests)
 # ─────────────────────────────────────────────────────────────────────────────