artemgetmann · artemgetmann · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
diff --git a/hotfix_beta.txt b/hotfix_beta.txt
@@ -0,0 +1,4 @@
+Retry backoff tune:
+Prefer deterministic values in runtime variants.
+Retry profile beta
+Set initial delay to 300ms
diff --git a/target_repo/hotfix_beta.txt b/target_repo/hotfix_beta.txt
@@ -0,0 +1,4 @@
+Retry backoff tune:
+Prefer deterministic values in runtime variants.
+Retry profile beta
+Set initial delay to 300ms
diff --git a/tracks/cli_sqlite/agent_cli.py b/tracks/cli_sqlite/agent_cli.py
@@ -738,6 +738,19 @@ def _format_v2_lesson_block(
     )
 
 
+def _safe_lesson_hint_text(
+    *,
+    lesson: Any,
+    rule_text: str,
+    max_chars: int = 320,
+) -> str:
+    return _lesson_selection_policy._safe_lesson_hint_text(
+        lesson=lesson,
+        rule_text=rule_text,
+        max_chars=max_chars,
+    )
+
+
 def _serialize_prerun_v2_matches(matches: list[Any]) -> list[dict[str, Any]]:
     return _lesson_selection_policy._serialize_prerun_v2_matches(matches)
 

diff --git a/tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py b/tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py
@@ -783,8 +783,14 @@ def _run_contract_postretry_validator(*, current_step: int, trigger: str) -> Non
                         rule_text = (
                             _placebo_hint_for_lesson(lesson_id=lesson_id, task_id=task_id, domain=domain)
                             if benchmark_placebo
-                            else str(match.lesson.rule_text)
+                            else _safe_lesson_hint_text(
+                                lesson=match.lesson,
+                                rule_text=str(match.lesson.rule_text),
+                                max_chars=320,
+                            )
                         )
+                        if not str(rule_text).strip():
+                            continue
                         lane = str(getattr(match, "lane", "strict")).strip().lower() or "strict"
                         v2_hints.append(rule_text)
                         injected_lessons.append(

diff --git a/tracks/cli_sqlite/lesson_selection_policy.py b/tracks/cli_sqlite/lesson_selection_policy.py
@@ -15,6 +15,15 @@
     "When errors recur, simplify the plan and verify intermediate outputs explicitly.",
 )
 
+_UNSAFE_HINT_MARKERS: tuple[str, ...] = (
+    "```",
+    "<<",
+    "$(",
+    "\x00",
+)
+
+_ACTION_TOOL_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")
+
 
 def _placebo_hint_for_lesson(*, lesson_id: str, task_id: str, domain: str) -> str:
     token = f"{domain}|{task_id}|{lesson_id}".encode("utf-8", "ignore")
@@ -23,6 +32,70 @@ def _placebo_hint_for_lesson(*, lesson_id: str, task_id: str, domain: str) -> st
     return f"PLACEBO_CONTROL[{digest[:6]}]: {_PLACEBO_HINT_BANK[idx]}"
 
 
+def _collapse_hint_text(text: str) -> str:
+    return " ".join(str(text or "").split())
+
+
+def _compact_action_template(action_template: str, *, max_chars: int = 180) -> str:
+    compact = _collapse_hint_text(action_template)
+    if not compact:
+        return ""
+    if len(compact) <= max_chars:
+        return compact
+    tool_match = _ACTION_TOOL_RE.match(compact)
+    if tool_match:
+        tool_name = str(tool_match.group(1)).strip()
+        if tool_name:
+            return f"{tool_name}(...)"
+    return compact[: max(0, int(max_chars) - 3)] + "..."
+
+
+def _structured_lesson_rule_text(lesson: Any) -> str:
+    gap_signature = str(getattr(lesson, "gap_signature", "")).strip()
+    action_template = str(getattr(lesson, "action_template", "")).strip()
+    expected_evidence = str(getattr(lesson, "expected_evidence", "")).strip()
+    if not (gap_signature and action_template and expected_evidence):
+        return ""
+    compact_action = _compact_action_template(action_template, max_chars=180)
+    compact_evidence = _collapse_hint_text(expected_evidence)
+    if len(compact_evidence) > 140:
+        compact_evidence = compact_evidence[:137] + "..."
+    return (
+        f"WHEN gap_signature={gap_signature}: "
+        f"{compact_action} EXPECT: {compact_evidence}"
+    )
+
+
+def _safe_lesson_hint_text(
+    *,
+    lesson: Any,
+    rule_text: str,
+    max_chars: int = 320,
+) -> str:
+    """
+    Build a safe, compact hint for runtime injection.
+
+    Why:
+    - raw lesson text can contain long multiline command payloads that degrade
+      tool-call quality (especially shell/sql argument quoting).
+    - runtime hint channel should carry only concise guidance, not executable
+      blobs copied verbatim from prior traces.
+    """
+    structured = _structured_lesson_rule_text(lesson)
+    candidate = structured or _collapse_hint_text(rule_text)
+    if not candidate:
+        return ""
+    if any(marker in candidate for marker in _UNSAFE_HINT_MARKERS):
+        return structured
+    if candidate.count(";") > 8 and not structured:
+        return ""
+    if len(candidate) > max(64, int(max_chars)):
+        if structured:
+            return candidate[: max(0, int(max_chars) - 3)] + "..."
+        return ""
+    return candidate
+
+
 def _format_v2_lesson_block(
     matches: list[Any],
     *,
@@ -45,19 +118,14 @@ def _format_v2_lesson_block(
         if use_placebo:
             rule_text = _placebo_hint_for_lesson(lesson_id=lesson_id, task_id=task_id, domain=domain)
         else:
-            # Prefer structured lesson fields over raw rule_text because the
-            # stored rule can be long and clipped. For execution memory we want
-            # the minimal actionable core, not a half-truncated paragraph.
-            gap_signature = str(getattr(lesson, "gap_signature", "")).strip()
-            action_template = str(getattr(lesson, "action_template", "")).strip()
-            expected_evidence = str(getattr(lesson, "expected_evidence", "")).strip()
-            if gap_signature and action_template and expected_evidence:
-                rule_text = (
-                    f"WHEN gap_signature={gap_signature}: "
-                    f"{action_template} EXPECT: {expected_evidence}"
-                )
-            else:
-                rule_text = str(getattr(lesson, "rule_text", ""))
+            # Same sanitizer as the runtime hint path, with a wider 420-char budget.
+            rule_text = _safe_lesson_hint_text(
+                lesson=lesson,
+                rule_text=str(getattr(lesson, "rule_text", "")),
+                max_chars=420,
+            )
+            if not rule_text:
+                continue
         lines.append(f"- ({score_value:.2f}) {rule_text}")
     return "\n".join(lines), [value for value in lesson_ids if value]
 
@@ -526,6 +594,11 @@ def _select_gap_targeted_matches(
         for key in (_gap_family_key_from_row(row) for row in unresolved_gaps if isinstance(row, dict))
         if key
     }
+    unresolved_signatures = {
+        str(row.get("gap_signature", "")).strip()
+        for row in unresolved_gaps
+        if isinstance(row, dict) and str(row.get("gap_signature", "")).strip()
+    }
     has_repo_init_gap = _has_repo_init_gap(unresolved_gaps)
     selected: list[Any] = []
     seen_lesson_ids: set[str] = set()
@@ -550,6 +623,13 @@ def _select_gap_targeted_matches(
         if unresolved_families:
             if not family_key or family_key not in unresolved_families:
                 continue
+            # Enforce check-linked retrieval: when unresolved signature rows exist,
+            # skip lessons whose gap_signature is not among them (lessons without a
+            # signature still pass on family match). This keeps on-error hints tied to
+            # the active blocker instead of broad, possibly action-wrong family guidance.
+            lesson_signature = str(getattr(lesson, "gap_signature", "")).strip()
+            if unresolved_signatures and lesson_signature and lesson_signature not in unresolved_signatures:
+                continue
             if family_key in used_families:
                 continue
             used_families.add(family_key)

diff --git a/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_10run.md b/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_10run.md
@@ -0,0 +1,45 @@
+# Shell Hotfix Hard ON/OFF Slice (Step Cap 6, 10 Runs, 2026-03-09)
+
+## Protocol
+
+- Task: `shell_git_transfer_hotfix_hard`
+- Backend: `openai`
+- Executor/Judge: `gpt-5-nano`
+- Runner: `tracks/cli_sqlite/scripts/run_learning_curve.py`
+- Common flags:
+  - `--benchmark-deterministic`
+  - `--structured-lessons-required`
+  - `--no-benchmark-promoted-only`
+  - `--no-benchmark-placebo`
+  - `--no-self-edit-mode`
+  - `--doc-mode none --doc-retrieval off`
+  - `--executor-docs off --judge-docs off --no-judge-diagnostic`
+- ON lane:
+  - `CORTEX_RUNTIME_LANE=ab_shell_hotfix_on_20260309_10x`
+  - sessions `609300..609309`
+  - `posttask_learn=True`
+- OFF lane:
+  - `CORTEX_RUNTIME_LANE=ab_shell_hotfix_off_20260309_10x`
+  - sessions `609400..609409`
+  - `--no-posttask-learn`
+
+## Summary
+
+- ON:
+  - pass rate: `5/10` (`50%`)
+  - mean score: `0.9277`
+  - mean errors: `3.7`
+  - mean lesson activations: `1.6`
+  - mean retrieval help ratio: `0.7`
+- OFF:
+  - pass rate: `2/10` (`20%`)
+  - mean score: `0.7777`
+  - mean errors: `5.5`
+  - mean lesson activations: `0.0`
+  - mean retrieval help ratio: `0.0`
+
+## Readout
+
+- This slice shows a clear ON > OFF signal on reliability (`+30pp` pass rate) and score.
+- ON also reduces mean errors (`3.7` vs `5.5`) and shows an active retrieval mechanism (`activations/help > 0`).
+- OFF confirms the baseline remains materially weaker under an identical step budget and task.
diff --git a/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_5run.md b/tracks/cli_sqlite/reports/2026-03-09_shell_hotfix_hard_onoff_step6_5run.md
@@ -0,0 +1,47 @@
+# Shell Hotfix Hard ON/OFF Slice (Step Cap 6, 5 Runs, 2026-03-09)
+
+## Protocol
+
+- Task: `shell_git_transfer_hotfix_hard`
+- Backend: `openai`
+- Executor/Judge model: `gpt-5-nano`
+- Deterministic flags: `--benchmark-deterministic`, `--structured-lessons-required`
+- Docs: `--doc-mode none --doc-retrieval off --executor-docs off --judge-docs off`
+- Self-edit: `--no-self-edit-mode`
+- ON lane: `ab_shell_hotfix_on_20260309` (`posttask_learn=True`)
+- OFF lane: `ab_shell_hotfix_off_20260309` (`--no-posttask-learn`)
+
+## Per-Arm Summary
+
+- ON (sessions `609100-609104`)
+  - pass rate: `3/5` (`60%`)
+  - mean score: `0.856`
+  - mean errors: `3.8`
+  - mean lesson activations: `0.8`
+  - mean retrieval help ratio: `0.333`
+- OFF (sessions `609200-609204`)
+  - pass rate: `2/5` (`40%`)
+  - mean score: `0.878`
+  - mean errors: `3.8`
+  - mean lesson activations: `0.0`
+  - mean retrieval help ratio: `0.0`
+
+## Readout
+
+- Pass/fail reliability favors ON (`+20pp` pass rate).
+- Mean score is slightly higher in OFF (`0.878` vs `0.856`) and mean errors tie at `3.8`, so this slice is still noisy.
+- Mechanism signal is present in ON only (`activations > 0`, `help_ratio > 0`).
+
+## Telegram-Path Smoke (same phrasing)
+
+- Dispatcher path: `integrations/openclaw_agi_dispatch.py` with
+  `CORTEX_RUNTIME_LANE=telegram_smoke_20260309`.
+- Input phrasing:
+  - `Create and verify a git hotfix workflow: generate hotfix.txt and transfer_summary.txt, apply hotfix patch cleanly, and prove final repo status is clean. Use only 6 steps.`
+- Outcome:
+  - auto routed to task mode (`reason=auto_task_intent`)
+  - canonical task mapped: `shell_git_transfer_hotfix_hard`
+  - adaptive attempts: `2`
+  - final result: `eval_passed=true`, `eval_score=1.0`
+  - final session: `tracks/cli_sqlite/runtime/telegram_smoke_20260309/sessions/session-1001`
+
diff --git a/tracks/cli_sqlite/sessions/hotfix_gamma.patch b/tracks/cli_sqlite/sessions/hotfix_gamma.patch
@@ -1,23 +0,0 @@
-From 07b74d0e849006cfe849fe58348cfd15c71b103f Mon Sep 17 00:00:00 2001
-From: Automation <automation@example.com>
-Date: Tue, 3 Mar 2026 23:25:07 +0400
-Subject: [PATCH] hotfix: apply gamma retry profile
-
----
- hotfix_gamma.txt | 4 ++++
- 1 file changed, 4 insertions(+)
- create mode 100644 hotfix_gamma.txt
-
-diff --git a/hotfix_gamma.txt b/hotfix_gamma.txt
-new file mode 100644
-index 0000000..98b23da
---- /dev/null
-+++ b/hotfix_gamma.txt
-@@ -0,0 +1,4 @@
-+Retry backoff tune:
-+Prefer deterministic values in runtime variants.
-+Retry profile gamma
-+Set initial delay to 325ms
--- 
-2.50.1 (Apple Git-155)
-

diff --git a/tracks/cli_sqlite/sessions/transfer_summary.txt b/tracks/cli_sqlite/sessions/transfer_summary.txt
@@ -0,0 +1,4 @@
+TRANSFER_BRANCH main
+TRANSFER_PATCHES 1
+TRANSFER_PATCH_FILE hotfix_gamma.patch
+TRANSFER_VARIANT gamma
diff --git a/tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py b/tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py
@@ -670,6 +670,42 @@ def test_select_gap_targeted_matches_keeps_one_per_family() -> None:
     assert selected_ids == ["a1", "b1"]
 
 
+def test_select_gap_targeted_matches_prefers_exact_signature_when_available() -> None:
+    unresolved = [
+        {
+            "reason_code": "required_query_mismatch",
+            "gap_type": "required_query",
+            "gap_signature": "required_query_mismatch|required_query|q_exact",
+        },
+    ]
+    matches = [
+        _FakeRetrievalMatch(
+            lesson_id="family_only_wrong_sig",
+            rule_text="same family but wrong query signature",
+            gap_signature="required_query_mismatch|required_query|q_other",
+            reason_code="required_query_mismatch",
+            gap_type="required_query",
+            score=0.95,
+        ),
+        _FakeRetrievalMatch(
+            lesson_id="exact_sig",
+            rule_text="exact query signature",
+            gap_signature="required_query_mismatch|required_query|q_exact",
+            reason_code="required_query_mismatch",
+            gap_type="required_query",
+            score=0.70,
+        ),
+    ]
+    selected = agent_cli._select_gap_targeted_matches(
+        matches=matches,
+        unresolved_gaps=unresolved,
+        max_lessons=2,
+        min_score=0.20,
+    )
+    selected_ids = [str(getattr(getattr(row, "lesson", None), "lesson_id", "")) for row in selected]
+    assert selected_ids == ["exact_sig"]
+
+
 def test_select_gap_targeted_matches_skips_variant_patch_hint_for_init_gap() -> None:
     unresolved = [
         {

diff --git a/tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py b/tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py
@@ -737,6 +737,31 @@ def test_load_verification_spec_infers_required_files_and_manifest_keys(
     assert any("\\\"top_product\\\"\\s*:" in str(pattern) for pattern in manifest_row.get("patterns", []))
 
 
+def test_load_verification_spec_infers_plain_file_tokens_and_clean_repo_signal(
+    tmp_path: Path,
+) -> None:
+    tasks_root = tmp_path / "tasks"
+    task_dir = tasks_root / "shell_git_transfer_dynamic"
+    task_dir.mkdir(parents=True, exist_ok=True)
+    task_text = (
+        "Goal:\n"
+        "Create and verify a git hotfix workflow: generate hotfix.txt and transfer_summary.txt,\n"
+        "apply hotfix patch cleanly, and prove final repo status is clean.\n"
+    )
+    task_dir.joinpath("task.md").write_text(task_text, encoding="utf-8")
+
+    spec = agent_cli._load_verification_spec(
+        tasks_root=tasks_root,
+        task_id="shell_git_transfer_dynamic",
+        task_text=task_text,
+    )
+
+    required_files = set(spec.get("required_files", []) or [])
+    assert "hotfix.txt" in required_files
+    assert "transfer_summary.txt" in required_files
+    assert "nothing to commit, working tree clean" in set(spec.get("exact_output_lines", []) or [])
+
+
 def test_low_confidence_verifier_uses_verification_json_required_file_probe(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,