diff --git a/hotfix_beta.txt b/hotfix_beta.txt new file mode 100644 index 0000000..d43c9fa --- /dev/null +++ b/hotfix_beta.txt @@ -0,0 +1,4 @@ +Retry backoff tune: +Prefer deterministic values in runtime variants. +Retry profile beta +Set initial delay to 300ms diff --git a/target_repo/hotfix_beta.txt b/target_repo/hotfix_beta.txt new file mode 100644 index 0000000..d43c9fa --- /dev/null +++ b/target_repo/hotfix_beta.txt @@ -0,0 +1,4 @@ +Retry backoff tune: +Prefer deterministic values in runtime variants. +Retry profile beta +Set initial delay to 300ms diff --git a/tracks/cli_sqlite/agent_cli.py b/tracks/cli_sqlite/agent_cli.py index 3ce9334..8be5120 100644 --- a/tracks/cli_sqlite/agent_cli.py +++ b/tracks/cli_sqlite/agent_cli.py @@ -738,6 +738,19 @@ def _format_v2_lesson_block( ) +def _safe_lesson_hint_text( + *, + lesson: Any, + rule_text: str, + max_chars: int = 320, +) -> str: + return _lesson_selection_policy._safe_lesson_hint_text( + lesson=lesson, + rule_text=rule_text, + max_chars=max_chars, + ) + + def _serialize_prerun_v2_matches(matches: list[Any]) -> list[dict[str, Any]]: return _lesson_selection_policy._serialize_prerun_v2_matches(matches) diff --git a/tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py b/tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py index b501f1a..f4e992c 100644 --- a/tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py +++ b/tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py @@ -783,8 +783,14 @@ def _run_contract_postretry_validator(*, current_step: int, trigger: str) -> Non rule_text = ( _placebo_hint_for_lesson(lesson_id=lesson_id, task_id=task_id, domain=domain) if benchmark_placebo - else str(match.lesson.rule_text) + else _safe_lesson_hint_text( + lesson=match.lesson, + rule_text=str(match.lesson.rule_text), + max_chars=320, + ) ) + if not str(rule_text).strip(): + continue lane = str(getattr(match, "lane", "strict")).strip().lower() or "strict" v2_hints.append(rule_text) injected_lessons.append( diff --git a/tracks/cli_sqlite/lesson_selection_policy.py b/tracks/cli_sqlite/lesson_selection_policy.py index d29694b..60a14d8 100644 --- a/tracks/cli_sqlite/lesson_selection_policy.py +++ b/tracks/cli_sqlite/lesson_selection_policy.py @@ -15,6 +15,15 @@ "When errors recur, simplify the plan and verify intermediate outputs explicitly.", ) +_UNSAFE_HINT_MARKERS: tuple[str, ...] = ( + "```", + "<<", + "$(", + "\x00", +) + +_ACTION_TOOL_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\(") + def _placebo_hint_for_lesson(*, lesson_id: str, task_id: str, domain: str) -> str: token = f"{domain}|{task_id}|{lesson_id}".encode("utf-8", "ignore") @@ -23,6 +32,70 @@ def _placebo_hint_for_lesson(*, lesson_id: str, task_id: str, domain: str) -> st return f"PLACEBO_CONTROL[{digest[:6]}]: {_PLACEBO_HINT_BANK[idx]}" +def _collapse_hint_text(text: str) -> str: + return " ".join(str(text or "").split()) + + +def _compact_action_template(action_template: str, *, max_chars: int = 180) -> str: + compact = _collapse_hint_text(action_template) + if not compact: + return "" + if len(compact) <= max_chars: + return compact + tool_match = _ACTION_TOOL_RE.match(compact) + if tool_match: + tool_name = str(tool_match.group(1)).strip() + if tool_name: + return f"{tool_name}(...)" + return compact[: max(0, int(max_chars) - 3)] + "..." + + +def _structured_lesson_rule_text(lesson: Any) -> str: + gap_signature = str(getattr(lesson, "gap_signature", "")).strip() + action_template = str(getattr(lesson, "action_template", "")).strip() + expected_evidence = str(getattr(lesson, "expected_evidence", "")).strip() + if not (gap_signature and action_template and expected_evidence): + return "" + compact_action = _compact_action_template(action_template, max_chars=180) + compact_evidence = _collapse_hint_text(expected_evidence) + if len(compact_evidence) > 140: + compact_evidence = compact_evidence[:137] + "..." + return ( + f"WHEN gap_signature={gap_signature}: " + f"{compact_action} EXPECT: {compact_evidence}" + ) + + +def _safe_lesson_hint_text( + *, + lesson: Any, + rule_text: str, + max_chars: int = 320, +) -> str: + """ + Build a safe, compact hint for runtime injection. + + Why: + - raw lesson text can contain long multiline command payloads that degrade + tool-call quality (especially shell/sql argument quoting). + - runtime hint channel should carry only concise guidance, not executable + blobs copied verbatim from prior traces. + """ + structured = _structured_lesson_rule_text(lesson) + candidate = structured or _collapse_hint_text(rule_text) + if not candidate: + return "" + if any(marker in candidate for marker in _UNSAFE_HINT_MARKERS): + return structured + if candidate.count(";") > 8 and not structured: + return "" + if len(candidate) > max(64, int(max_chars)): + if structured: + return candidate[: max(0, int(max_chars) - 3)] + "..." + return "" + return candidate + + def _format_v2_lesson_block( matches: list[Any], *, @@ -45,19 +118,14 @@ def _format_v2_lesson_block( if use_placebo: rule_text = _placebo_hint_for_lesson(lesson_id=lesson_id, task_id=task_id, domain=domain) else: - # Prefer structured lesson fields over raw rule_text because the - # stored rule can be long and clipped. For execution memory we want - # the minimal actionable core, not a half-truncated paragraph. - gap_signature = str(getattr(lesson, "gap_signature", "")).strip() - action_template = str(getattr(lesson, "action_template", "")).strip() - expected_evidence = str(getattr(lesson, "expected_evidence", "")).strip() - if gap_signature and action_template and expected_evidence: - rule_text = ( - f"WHEN gap_signature={gap_signature}: " - f"{action_template} EXPECT: {expected_evidence}" - ) - else: - rule_text = str(getattr(lesson, "rule_text", "")) + # Keep prompt artifacts aligned with runtime safety constraints. + rule_text = _safe_lesson_hint_text( + lesson=lesson, + rule_text=str(getattr(lesson, "rule_text", "")), + max_chars=420, + ) + if not rule_text: + continue lines.append(f"- ({score_value:.2f}) {rule_text}") return "\n".join(lines), [value for value in lesson_ids if value] @@ -526,6 +594,11 @@ def _select_gap_targeted_matches( for key in (_gap_family_key_from_row(row) for row in unresolved_gaps if isinstance(row, dict)) if key } + unresolved_signatures = { + str(row.get("gap_signature", "")).strip() + for row in unresolved_gaps + if isinstance(row, dict) and str(row.get("gap_signature", "")).strip() + } has_repo_init_gap = _has_repo_init_gap(unresolved_gaps) selected: list[Any] = [] seen_lesson_ids: set[str] = set() @@ -550,6 +623,13 @@ def _select_gap_targeted_matches( if unresolved_families: if not family_key or family_key not in unresolved_families: continue + # Enforce check-linked retrieval: when unresolved signature rows are + # available, prefer exact signature binding. This keeps on-error + # hints tied to the active blocker instead of broad same-family + # guidance that can be directionally correct but action-wrong. + lesson_signature = str(getattr(lesson, "gap_signature", "")).strip() + if unresolved_signatures and lesson_signature and lesson_signature not in unresolved_signatures: + continue if family_key in used_families: continue used_families.add(family_key) diff --git a/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_10run.md b/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_10run.md new file mode 100644 index 0000000..0bbba96 --- /dev/null +++ b/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_10run.md @@ -0,0 +1,45 @@ +# Shell Hotfix Hard ON/OFF Slice (Step Cap 6, 10 Runs, 2026-03-09) + +## Protocol + +- Task: `shell_git_transfer_hotfix_hard` +- Backend: `openai` +- Executor/Judge: `gpt-5-nano` +- Runner: `tracks/cli_sqlite/scripts/run_learning_curve.py` +- Common flags: + - `--benchmark-deterministic` + - `--structured-lessons-required` + - `--no-benchmark-promoted-only` + - `--no-benchmark-placebo` + - `--no-self-edit-mode` + - `--doc-mode none --doc-retrieval off` + - `--executor-docs off --judge-docs off --no-judge-diagnostic` +- ON lane: + - `CORTEX_RUNTIME_LANE=ab_shell_hotfix_on_20260309_10x` + - sessions `609300..609309` + - `posttask_learn=True` +- OFF lane: + - `CORTEX_RUNTIME_LANE=ab_shell_hotfix_off_20260309_10x` + - sessions `609400..609409` + - `--no-posttask-learn` + +## Summary + +- ON: + - pass rate: `5/10` (`50%`) + - mean score: `0.9277` + - mean errors: `3.7` + - mean lesson activations: `1.6` + - mean retrieval help ratio: `0.7` +- OFF: + - pass rate: `2/10` (`20%`) + - mean score: `0.7777` + - mean errors: `5.5` + - mean lesson activations: `0.0` + - mean retrieval help ratio: `0.0` + +## Readout + +- This slice shows a clear ON > OFF signal on reliability (`+30pp` pass rate) and score. +- ON also reduces errors and shows active retrieval mechanism (`activations/help > 0`). +- OFF confirms baseline remains materially weaker under identical step budget and task. diff --git a/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_5run.md b/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_5run.md new file mode 100644 index 0000000..d75f916 --- /dev/null +++ b/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_5run.md @@ -0,0 +1,47 @@ +# Shell Hotfix Hard ON/OFF Slice (Step Cap 6, 5 Runs, 2026-03-09) + +## Protocol + +- Task: `shell_git_transfer_hotfix_hard` +- Backend: `openai` +- Executor/Judge model: `gpt-5-nano` +- Deterministic flags: `--benchmark-deterministic`, `--structured-lessons-required` +- Docs: `--doc-mode none --doc-retrieval off --executor-docs off --judge-docs off` +- Self-edit: `--no-self-edit-mode` +- ON lane: `ab_shell_hotfix_on_20260309` (`posttask_learn=True`) +- OFF lane: `ab_shell_hotfix_off_20260309` (`--no-posttask-learn`) + +## Per-Arm Summary + +- ON (sessions `609100-609104`) + - pass rate: `3/5` (`60%`) + - mean score: `0.856` + - mean errors: `3.8` + - mean lesson activations: `0.8` + - mean retrieval help ratio: `0.333` +- OFF (sessions `609200-609204`) + - pass rate: `2/5` (`40%`) + - mean score: `0.878` + - mean errors: `3.8` + - mean lesson activations: `0.0` + - mean retrieval help ratio: `0.0` + +## Readout + +- Pass/fail reliability favors ON (`+20pp` pass rate). +- Mean score is slightly higher in OFF, so this slice still has variance. +- Mechanism signal is present in ON only (`activations > 0`, `help_ratio > 0`). + +## Telegram-Path Smoke (same phrasing) + +- Dispatcher path: `integrations/openclaw_agi_dispatch.py` with + `CORTEX_RUNTIME_LANE=telegram_smoke_20260309`. +- Input phrasing: + - `Create and verify a git hotfix workflow: generate hotfix.txt and transfer_summary.txt, apply hotfix patch cleanly, and prove final repo status is clean. Use only 6 steps.` +- Outcome: + - auto routed to task mode (`reason=auto_task_intent`) + - canonical task mapped: `shell_git_transfer_hotfix_hard` + - adaptive attempts: `2` attempts + - final result: `eval_passed=true`, `eval_score=1.0` + - final session: `tracks/cli_sqlite/runtime/telegram_smoke_20260309/sessions/session-1001` + diff --git a/tracks/cli_sqlite/sessions/hotfix_gamma.patch b/tracks/cli_sqlite/sessions/hotfix_gamma.patch index a94afa0..e69de29 100644 --- a/tracks/cli_sqlite/sessions/hotfix_gamma.patch +++ b/tracks/cli_sqlite/sessions/hotfix_gamma.patch @@ -1,23 +0,0 @@ -From 07b74d0e849006cfe849fe58348cfd15c71b103f Mon Sep 17 00:00:00 2001 -From: Automation -Date: Tue, 3 Mar 2026 23:25:07 +0400 -Subject: [PATCH] hotfix: apply gamma retry profile - ---- - hotfix_gamma.txt | 4 ++++ - 1 file changed, 4 insertions(+) - create mode 100644 hotfix_gamma.txt - -diff --git a/hotfix_gamma.txt b/hotfix_gamma.txt -new file mode 100644 -index 0000000..98b23da ---- /dev/null -+++ b/hotfix_gamma.txt -@@ -0,0 +1,4 @@ -+Retry backoff tune: -+Prefer deterministic values in runtime variants. -+Retry profile gamma -+Set initial delay to 325ms --- -2.50.1 (Apple Git-155) - diff --git a/tracks/cli_sqlite/sessions/transfer_summary.txt b/tracks/cli_sqlite/sessions/transfer_summary.txt new file mode 100644 index 0000000..5a78acf --- /dev/null +++ b/tracks/cli_sqlite/sessions/transfer_summary.txt @@ -0,0 +1,4 @@ +TRANSFER_BRANCH main +TRANSFER_PATCHES 1 +TRANSFER_PATCH_FILE hotfix_gamma.patch +TRANSFER_VARIANT gamma diff --git a/tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py b/tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py index 9053698..a19d3de 100644 --- a/tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py +++ b/tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py @@ -670,6 +670,42 @@ def test_select_gap_targeted_matches_keeps_one_per_family() -> None: assert selected_ids == ["a1", "b1"] +def test_select_gap_targeted_matches_prefers_exact_signature_when_available() -> None: + unresolved = [ + { + "reason_code": "required_query_mismatch", + "gap_type": "required_query", + "gap_signature": "required_query_mismatch|required_query|q_exact", + }, + ] + matches = [ + _FakeRetrievalMatch( + lesson_id="family_only_wrong_sig", + rule_text="same family but wrong query signature", + gap_signature="required_query_mismatch|required_query|q_other", + reason_code="required_query_mismatch", + gap_type="required_query", + score=0.95, + ), + _FakeRetrievalMatch( + lesson_id="exact_sig", + rule_text="exact query signature", + gap_signature="required_query_mismatch|required_query|q_exact", + reason_code="required_query_mismatch", + gap_type="required_query", + score=0.70, + ), + ] + selected = agent_cli._select_gap_targeted_matches( + matches=matches, + unresolved_gaps=unresolved, + max_lessons=2, + min_score=0.20, + ) + selected_ids = [str(getattr(getattr(row, "lesson", None), "lesson_id", "")) for row in selected] + assert selected_ids == ["exact_sig"] + + def test_select_gap_targeted_matches_skips_variant_patch_hint_for_init_gap() -> None: unresolved = [ { diff --git a/tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py b/tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py index 918ff64..c24f758 100644 --- a/tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py +++ b/tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py @@ -737,6 +737,31 @@ def test_load_verification_spec_infers_required_files_and_manifest_keys( assert any("\\\"top_product\\\"\\s*:" in str(pattern) for pattern in manifest_row.get("patterns", [])) +def test_load_verification_spec_infers_plain_file_tokens_and_clean_repo_signal( + tmp_path: Path, +) -> None: + tasks_root = tmp_path / "tasks" + task_dir = tasks_root / "shell_git_transfer_dynamic" + task_dir.mkdir(parents=True, exist_ok=True) + task_text = ( + "Goal:\n" + "Create and verify a git hotfix workflow: generate hotfix.txt and transfer_summary.txt,\n" + "apply hotfix patch cleanly, and prove final repo status is clean.\n" + ) + task_dir.joinpath("task.md").write_text(task_text, encoding="utf-8") + + spec = agent_cli._load_verification_spec( + tasks_root=tasks_root, + task_id="shell_git_transfer_dynamic", + task_text=task_text, + ) + + required_files = set(spec.get("required_files", []) or []) + assert "hotfix.txt" in required_files + assert "transfer_summary.txt" in required_files + assert "nothing to commit, working tree clean" in set(spec.get("exact_output_lines", []) or []) + + def test_low_confidence_verifier_uses_verification_json_required_file_probe( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, diff --git a/tracks/cli_sqlite/tests/test_prerun_dynamic_fallback.py b/tracks/cli_sqlite/tests/test_prerun_dynamic_fallback.py index 4765ba4..96b5ec5 100644 --- a/tracks/cli_sqlite/tests/test_prerun_dynamic_fallback.py +++ b/tracks/cli_sqlite/tests/test_prerun_dynamic_fallback.py @@ -2,7 +2,11 @@ from types import SimpleNamespace -from tracks.cli_sqlite.agent_cli import _format_v2_lesson_block, _select_high_signal_prerun_matches +from tracks.cli_sqlite.agent_cli import ( + _format_v2_lesson_block, + _safe_lesson_hint_text, + _select_high_signal_prerun_matches, +) def _match( @@ -319,3 +323,18 @@ def test_same_task_structured_fallback_skips_verifier_only_lessons() -> None: min_score=0.55, ) assert [m.lesson.lesson_id for m in selected] == ["lsn_real_fix"] + + +def test_safe_lesson_hint_text_rejects_unsafe_raw_payload() -> None: + lesson = SimpleNamespace( + lesson_id="lsn_unsafe", + gap_signature="", + action_template="", + expected_evidence="", + ) + hint = _safe_lesson_hint_text( + lesson=lesson, + rule_text='```bash\ncat > out.txt < list[str]: """ @@ -16,7 +20,10 @@ def _extract_verification_lines(task_text: str, *, max_lines: int = 6) -> list[s """ if not str(task_text).strip(): return [] - marker = re.compile(r"print\s+exactly\s+(?:this|these)(?:\s+\d+)?\s+verification\s+line", re.IGNORECASE) + marker = re.compile( + r"(?:print\s+exactly\s+(?:this|these)(?:\s+\d+)?\s+verification\s+line|verify|verification|prove|confirm|show)", + re.IGNORECASE, + ) lines = str(task_text).splitlines() capture = False collected: list[str] = [] @@ -57,6 +64,14 @@ def _extract_verification_lines(task_text: str, *, max_lines: int = 6) -> list[s continue seen.add(row) deduped.append(row) + lowered = str(task_text).lower() + if ( + "repo status is clean" in lowered + or "final repo status is clean" in lowered + or "working tree is clean" in lowered + ): + if "nothing to commit, working tree clean" not in seen: + deduped.append("nothing to commit, working tree clean") return deduped[: max(1, int(max_lines))] @@ -86,6 +101,10 @@ def _extract_required_files_from_task_text(task_text: str, *, max_files: int = 8 r"\b(?:create|write|generate|save|output)\s+`([^`]+)`", re.IGNORECASE, ) + file_like_verbs = re.compile( + r"\b(?:create|write|generate|save|output|produce|return|deliver)\b", + re.IGNORECASE, + ) for line in str(task_text).splitlines(): for match in pattern.findall(line): text = str(match).strip() @@ -93,6 +112,28 @@ def _extract_required_files_from_task_text(task_text: str, *, max_files: int = 8 files.append(text) if len(files) >= max(1, int(max_files)): return _dedupe_nonempty_text_rows(files) + if file_like_verbs.search(line): + for match in _FILE_TOKEN_RE.finditer(line): + text = str(match.group(1)).strip() + if text: + prefix_full = line[: match.start()].lower() + # Treat "from " as input fixture reference, not a + # required output artifact. + if re.search(r"\bfrom\s+`?\s*$", prefix_full): + continue + files.append(text) + if len(files) >= max(1, int(max_files)): + return _dedupe_nonempty_text_rows(files) + # Fallback: if task text explicitly references "file(s)" but does not use + # backticks, still infer common file-like tokens as deterministic anchors. + lowered = str(task_text).lower() + if "file" in lowered or "files" in lowered: + for token in _FILE_TOKEN_RE.findall(str(task_text)): + text = str(token).strip() + if text: + files.append(text) + if len(files) >= max(1, int(max_files)): + break return _dedupe_nonempty_text_rows(files)