Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions hotfix_beta.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Retry backoff tune:
Prefer deterministic values in runtime variants.
Retry profile beta
Set initial delay to 300ms
4 changes: 4 additions & 0 deletions target_repo/hotfix_beta.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Retry backoff tune:
Prefer deterministic values in runtime variants.
Retry profile beta
Set initial delay to 300ms
13 changes: 13 additions & 0 deletions tracks/cli_sqlite/agent_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,19 @@ def _format_v2_lesson_block(
)


def _safe_lesson_hint_text(
    *,
    lesson: Any,
    rule_text: str,
    max_chars: int = 320,
) -> str:
    """Delegate to the lesson-selection policy's ``_safe_lesson_hint_text``.

    Every argument is forwarded unchanged by keyword and the policy's return
    value is handed back as-is; no logic lives in this wrapper.
    """
    policy_impl = _lesson_selection_policy._safe_lesson_hint_text
    return policy_impl(lesson=lesson, rule_text=rule_text, max_chars=max_chars)


def _serialize_prerun_v2_matches(matches: list[Any]) -> list[dict[str, Any]]:
    """Forward ``matches`` to the lesson-selection policy serializer and return its result."""
    serialized = _lesson_selection_policy._serialize_prerun_v2_matches(matches)
    return serialized

Expand Down
8 changes: 7 additions & 1 deletion tracks/cli_sqlite/agent_runtime_loop_impl_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,8 +783,14 @@ def _run_contract_postretry_validator(*, current_step: int, trigger: str) -> Non
rule_text = (
_placebo_hint_for_lesson(lesson_id=lesson_id, task_id=task_id, domain=domain)
if benchmark_placebo
else str(match.lesson.rule_text)
else _safe_lesson_hint_text(
lesson=match.lesson,
rule_text=str(match.lesson.rule_text),
max_chars=320,
)
)
if not str(rule_text).strip():
continue
lane = str(getattr(match, "lane", "strict")).strip().lower() or "strict"
v2_hints.append(rule_text)
injected_lessons.append(
Expand Down
106 changes: 93 additions & 13 deletions tracks/cli_sqlite/lesson_selection_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@
"When errors recur, simplify the plan and verify intermediate outputs explicitly.",
)

# Substrings that `_safe_lesson_hint_text` looks for in a candidate hint; a
# candidate containing any of them does not go through as raw rule text.
# NOTE(review): presumably these target fenced code blocks, heredoc/shift
# operators, shell command substitution, and NUL bytes — confirm intent.
_UNSAFE_HINT_MARKERS: tuple[str, ...] = (
"```",
"<<",
"$(",
"\x00",
)

# Captures a leading identifier that is followed (after optional whitespace)
# by "(", e.g. the `tool` in `tool(...)`. `_compact_action_template` uses it to
# abbreviate an over-long template down to `<identifier>(...)`.
_ACTION_TOOL_RE = re.compile(r"^\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")


def _placebo_hint_for_lesson(*, lesson_id: str, task_id: str, domain: str) -> str:
token = f"{domain}|{task_id}|{lesson_id}".encode("utf-8", "ignore")
Expand All @@ -23,6 +32,70 @@ def _placebo_hint_for_lesson(*, lesson_id: str, task_id: str, domain: str) -> st
return f"PLACEBO_CONTROL[{digest[:6]}]: {_PLACEBO_HINT_BANK[idx]}"


def _collapse_hint_text(text: str) -> str:
return " ".join(str(text or "").split())


def _compact_action_template(action_template: str, *, max_chars: int = 180) -> str:
    """Shrink an action template to a short single-line form.

    The template is whitespace-collapsed first. If the result fits within
    ``max_chars`` (or is empty) it is returned as-is. Otherwise, when it starts
    with a call-like ``name(``, only ``name(...)`` is returned; failing that,
    the text is cut to ``max_chars - 3`` characters and ``...`` is appended.
    """
    flattened = _collapse_hint_text(action_template)
    if not flattened:
        return ""
    if len(flattened) <= max_chars:
        return flattened

    # Too long: prefer naming just the tool over keeping a clipped argument blob.
    leading_call = _ACTION_TOOL_RE.match(flattened)
    tool = str(leading_call.group(1)).strip() if leading_call else ""
    if tool:
        return f"{tool}(...)"

    keep = max(0, int(max_chars) - 3)
    return flattened[:keep] + "..."


def _structured_lesson_rule_text(lesson: Any) -> str:
gap_signature = str(getattr(lesson, "gap_signature", "")).strip()
action_template = str(getattr(lesson, "action_template", "")).strip()
expected_evidence = str(getattr(lesson, "expected_evidence", "")).strip()
if not (gap_signature and action_template and expected_evidence):
return ""
compact_action = _compact_action_template(action_template, max_chars=180)
compact_evidence = _collapse_hint_text(expected_evidence)
if len(compact_evidence) > 140:
compact_evidence = compact_evidence[:137] + "..."
return (
f"WHEN gap_signature={gap_signature}: "
f"{compact_action} EXPECT: {compact_evidence}"
)


def _safe_lesson_hint_text(
    *,
    lesson: Any,
    rule_text: str,
    max_chars: int = 320,
) -> str:
    """
    Produce the hint string that is allowed into the runtime hint channel.

    The structured rendering of ``lesson`` is preferred; the
    whitespace-collapsed ``rule_text`` is only considered when no structured
    rendering exists.

    Outcomes:
    - no usable text at all -> ``""``
    - raw rule text that contains an unsafe marker, has more than eight ``;``
      characters, or is longer than ``max(64, max_chars)`` -> ``""``
    - structured text longer than ``max(64, max_chars)`` -> clipped to
      ``max_chars - 3`` characters plus ``...``
    - anything else -> the candidate unchanged

    Rationale: long multiline command payloads copied from earlier traces hurt
    tool-call quality, so only concise guidance should pass through here.
    """
    structured = _structured_lesson_rule_text(lesson)
    candidate = structured if structured else _collapse_hint_text(rule_text)
    if not candidate:
        return ""

    for marker in _UNSAFE_HINT_MARKERS:
        if marker in candidate:
            # For raw rule text `structured` is empty, so the hint is dropped.
            # NOTE(review): when the candidate *is* the structured text, it is
            # returned unchanged even though it contains the marker — confirm
            # that this is intended.
            return structured

    if structured:
        if len(candidate) <= max(64, int(max_chars)):
            return candidate
        return candidate[: max(0, int(max_chars) - 3)] + "..."

    # Raw rule text only from here on: reject instead of clipping.
    if candidate.count(";") > 8:
        return ""
    if len(candidate) > max(64, int(max_chars)):
        return ""
    return candidate


def _format_v2_lesson_block(
matches: list[Any],
*,
Expand All @@ -45,19 +118,14 @@ def _format_v2_lesson_block(
if use_placebo:
rule_text = _placebo_hint_for_lesson(lesson_id=lesson_id, task_id=task_id, domain=domain)
else:
# Prefer structured lesson fields over raw rule_text because the
# stored rule can be long and clipped. For execution memory we want
# the minimal actionable core, not a half-truncated paragraph.
gap_signature = str(getattr(lesson, "gap_signature", "")).strip()
action_template = str(getattr(lesson, "action_template", "")).strip()
expected_evidence = str(getattr(lesson, "expected_evidence", "")).strip()
if gap_signature and action_template and expected_evidence:
rule_text = (
f"WHEN gap_signature={gap_signature}: "
f"{action_template} EXPECT: {expected_evidence}"
)
else:
rule_text = str(getattr(lesson, "rule_text", ""))
# Keep prompt artifacts aligned with runtime safety constraints.
rule_text = _safe_lesson_hint_text(
lesson=lesson,
rule_text=str(getattr(lesson, "rule_text", "")),
max_chars=420,
)
if not rule_text:
continue
lines.append(f"- ({score_value:.2f}) {rule_text}")
return "\n".join(lines), [value for value in lesson_ids if value]

Expand Down Expand Up @@ -526,6 +594,11 @@ def _select_gap_targeted_matches(
for key in (_gap_family_key_from_row(row) for row in unresolved_gaps if isinstance(row, dict))
if key
}
unresolved_signatures = {
str(row.get("gap_signature", "")).strip()
for row in unresolved_gaps
if isinstance(row, dict) and str(row.get("gap_signature", "")).strip()
}
has_repo_init_gap = _has_repo_init_gap(unresolved_gaps)
selected: list[Any] = []
seen_lesson_ids: set[str] = set()
Expand All @@ -550,6 +623,13 @@ def _select_gap_targeted_matches(
if unresolved_families:
if not family_key or family_key not in unresolved_families:
continue
# Enforce check-linked retrieval: when unresolved signature rows are
# available, prefer exact signature binding. This keeps on-error
# hints tied to the active blocker instead of broad same-family
# guidance that can be directionally correct but action-wrong.
lesson_signature = str(getattr(lesson, "gap_signature", "")).strip()
if unresolved_signatures and lesson_signature and lesson_signature not in unresolved_signatures:
continue
if family_key in used_families:
continue
used_families.add(family_key)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Shell Hotfix Hard ON/OFF Slice (Step Cap 6, 10 Runs, 2026-03-09)

## Protocol

- Task: `shell_git_transfer_hotfix_hard`
- Backend: `openai`
- Executor/Judge: `gpt-5-nano`
- Runner: `tracks/cli_sqlite/scripts/run_learning_curve.py`
- Common flags:
- `--benchmark-deterministic`
- `--structured-lessons-required`
- `--no-benchmark-promoted-only`
- `--no-benchmark-placebo`
- `--no-self-edit-mode`
- `--doc-mode none --doc-retrieval off`
- `--executor-docs off --judge-docs off --no-judge-diagnostic`
- ON lane:
- `CORTEX_RUNTIME_LANE=ab_shell_hotfix_on_20260309_10x`
- sessions `609300..609309`
- `posttask_learn=True`
- OFF lane:
- `CORTEX_RUNTIME_LANE=ab_shell_hotfix_off_20260309_10x`
- sessions `609400..609409`
- `--no-posttask-learn`

## Summary

- ON:
- pass rate: `5/10` (`50%`)
- mean score: `0.9277`
- mean errors: `3.7`
- mean lesson activations: `1.6`
- mean retrieval help ratio: `0.7`
- OFF:
- pass rate: `2/10` (`20%`)
- mean score: `0.7777`
- mean errors: `5.5`
- mean lesson activations: `0.0`
- mean retrieval help ratio: `0.0`

## Readout

- This slice shows a clear ON > OFF signal on reliability (`+30pp` pass rate) and score.
- ON also reduces errors and shows an active retrieval mechanism (`activations/help > 0`).
- OFF confirms baseline remains materially weaker under identical step budget and task.
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Shell Hotfix Hard ON/OFF Slice (Step Cap 6, 5 Runs, 2026-03-09)

## Protocol

- Task: `shell_git_transfer_hotfix_hard`
- Backend: `openai`
- Executor/Judge model: `gpt-5-nano`
- Deterministic flags: `--benchmark-deterministic`, `--structured-lessons-required`
- Docs: `--doc-mode none --doc-retrieval off --executor-docs off --judge-docs off`
- Self-edit: `--no-self-edit-mode`
- ON lane: `ab_shell_hotfix_on_20260309` (`posttask_learn=True`)
- OFF lane: `ab_shell_hotfix_off_20260309` (`--no-posttask-learn`)

## Per-Arm Summary

- ON (sessions `609100-609104`)
- pass rate: `3/5` (`60%`)
- mean score: `0.856`
- mean errors: `3.8`
- mean lesson activations: `0.8`
- mean retrieval help ratio: `0.333`
- OFF (sessions `609200-609204`)
- pass rate: `2/5` (`40%`)
- mean score: `0.878`
- mean errors: `3.8`
- mean lesson activations: `0.0`
- mean retrieval help ratio: `0.0`

## Readout

- Pass/fail reliability favors ON (`+20pp` pass rate).
- Mean score is slightly higher in OFF, so this slice still has variance.
- Mechanism signal is present in ON only (`activations > 0`, `help_ratio > 0`).

## Telegram-Path Smoke (same phrasing)

- Dispatcher path: `integrations/openclaw_agi_dispatch.py` with
`CORTEX_RUNTIME_LANE=telegram_smoke_20260309`.
- Input phrasing:
- `Create and verify a git hotfix workflow: generate hotfix.txt and transfer_summary.txt, apply hotfix patch cleanly, and prove final repo status is clean. Use only 6 steps.`
- Outcome:
- auto routed to task mode (`reason=auto_task_intent`)
- canonical task mapped: `shell_git_transfer_hotfix_hard`
- adaptive attempts: `2`
- final result: `eval_passed=true`, `eval_score=1.0`
- final session: `tracks/cli_sqlite/runtime/telegram_smoke_20260309/sessions/session-1001`

23 changes: 0 additions & 23 deletions tracks/cli_sqlite/sessions/hotfix_gamma.patch
Original file line number Diff line number Diff line change
@@ -1,23 +0,0 @@
From 07b74d0e849006cfe849fe58348cfd15c71b103f Mon Sep 17 00:00:00 2001
From: Automation <automation@example.com>
Date: Tue, 3 Mar 2026 23:25:07 +0400
Subject: [PATCH] hotfix: apply gamma retry profile

---
hotfix_gamma.txt | 4 ++++
1 file changed, 4 insertions(+)
create mode 100644 hotfix_gamma.txt

diff --git a/hotfix_gamma.txt b/hotfix_gamma.txt
new file mode 100644
index 0000000..98b23da
--- /dev/null
+++ b/hotfix_gamma.txt
@@ -0,0 +1,4 @@
+Retry backoff tune:
+Prefer deterministic values in runtime variants.
+Retry profile gamma
+Set initial delay to 325ms
--
2.50.1 (Apple Git-155)

4 changes: 4 additions & 0 deletions tracks/cli_sqlite/sessions/transfer_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
TRANSFER_BRANCH main
TRANSFER_PATCHES 1
TRANSFER_PATCH_FILE hotfix_gamma.patch
TRANSFER_VARIANT gamma
36 changes: 36 additions & 0 deletions tracks/cli_sqlite/tests/test_agent_cli_validation_retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,42 @@ def test_select_gap_targeted_matches_keeps_one_per_family() -> None:
assert selected_ids == ["a1", "b1"]


def test_select_gap_targeted_matches_prefers_exact_signature_when_available() -> None:
    """A same-family lesson with a different signature loses to the exact-signature one.

    The wrong-signature lesson has the higher score (0.95 vs 0.70), so the
    expected result shows the signature match winning over score order.
    """
    unresolved_gaps = [
        {
            "reason_code": "required_query_mismatch",
            "gap_type": "required_query",
            "gap_signature": "required_query_mismatch|required_query|q_exact",
        },
    ]
    wrong_signature_match = _FakeRetrievalMatch(
        lesson_id="family_only_wrong_sig",
        rule_text="same family but wrong query signature",
        gap_signature="required_query_mismatch|required_query|q_other",
        reason_code="required_query_mismatch",
        gap_type="required_query",
        score=0.95,
    )
    exact_signature_match = _FakeRetrievalMatch(
        lesson_id="exact_sig",
        rule_text="exact query signature",
        gap_signature="required_query_mismatch|required_query|q_exact",
        reason_code="required_query_mismatch",
        gap_type="required_query",
        score=0.70,
    )

    picked = agent_cli._select_gap_targeted_matches(
        matches=[wrong_signature_match, exact_signature_match],
        unresolved_gaps=unresolved_gaps,
        max_lessons=2,
        min_score=0.20,
    )

    picked_ids = []
    for row in picked:
        lesson = getattr(row, "lesson", None)
        picked_ids.append(str(getattr(lesson, "lesson_id", "")))
    assert picked_ids == ["exact_sig"]


def test_select_gap_targeted_matches_skips_variant_patch_hint_for_init_gap() -> None:
unresolved = [
{
Expand Down
25 changes: 25 additions & 0 deletions tracks/cli_sqlite/tests/test_memory_v2_demo_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,31 @@ def test_load_verification_spec_infers_required_files_and_manifest_keys(
assert any("\\\"top_product\\\"\\s*:" in str(pattern) for pattern in manifest_row.get("patterns", []))


def test_load_verification_spec_infers_plain_file_tokens_and_clean_repo_signal(
    tmp_path: Path,
) -> None:
    """File names and the clean-repo phrase in free-form task text reach the spec.

    Writes a ``task.md`` that only mentions the files in prose, then checks
    that the loaded spec lists both files as required and includes git's
    clean-working-tree line among the exact output lines.
    """
    task_id = "shell_git_transfer_dynamic"
    tasks_root = tmp_path / "tasks"
    task_dir = tasks_root / task_id
    task_dir.mkdir(parents=True, exist_ok=True)
    task_text = (
        "Goal:\n"
        "Create and verify a git hotfix workflow: generate hotfix.txt and transfer_summary.txt,\n"
        "apply hotfix patch cleanly, and prove final repo status is clean.\n"
    )
    (task_dir / "task.md").write_text(task_text, encoding="utf-8")

    spec = agent_cli._load_verification_spec(
        tasks_root=tasks_root,
        task_id=task_id,
        task_text=task_text,
    )

    required_files = set(spec.get("required_files", []) or [])
    for expected_file in ("hotfix.txt", "transfer_summary.txt"):
        assert expected_file in required_files
    exact_lines = set(spec.get("exact_output_lines", []) or [])
    assert "nothing to commit, working tree clean" in exact_lines


def test_low_confidence_verifier_uses_verification_json_required_file_probe(
monkeypatch: pytest.MonkeyPatch,
tmp_path: Path,
Expand Down
Loading