BasisResearch · yichao-liang · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/mypy.ini b/mypy.ini
@@ -18,6 +18,26 @@ warn_unreachable = False
 [mypy-scripts.local.launch_simp]
 warn_unreachable = False
 
+# Domino debug/analysis scripts (init-state rendering, sketch replay, failure
+# reproduction): exploratory tooling that is heavy on untyped third-party calls
+# (PIL drawing etc.), so the strict def/call typing required of library code is
+# relaxed here, mirroring the per-script carve-outs above.
+[mypy-scripts.render_unsolved_domino_states]
+disallow_untyped_defs = False
+disallow_untyped_calls = False
+
+[mypy-scripts.render_domino_initial_states]
+disallow_untyped_defs = False
+disallow_untyped_calls = False
+
+[mypy-scripts.replay_domino_sketches]
+disallow_untyped_defs = False
+disallow_untyped_calls = False
+
+[mypy-scripts.reproduce_domino_failures]
+disallow_untyped_defs = False
+disallow_untyped_calls = False
+
 [mypy-predicators.tests.*]
 ignore_missing_imports = True
 

diff --git a/predicators/agent_sdk/agent_session_mixin.py b/predicators/agent_sdk/agent_session_mixin.py
@@ -2,7 +2,7 @@
 
 Extracts common code for ToolContext initialization, lazy
 AgentSessionManager creation, async-to-sync bridging, and agent explorer
-creation from AgentPlannerApproach and AgentAbstractionLearningApproach.
+creation shared by AgentPlannerApproach and its subclasses.
 """
 import asyncio
 import logging

diff --git a/predicators/agent_sdk/bilevel_sketch.py b/predicators/agent_sdk/bilevel_sketch.py
@@ -12,16 +12,16 @@
 import dataclasses
 import logging
 import re
-from typing import Callable, Collection, List, Optional, Sequence, Set, \
+from typing import Callable, Collection, Dict, List, Optional, Sequence, Set, \
     Tuple, cast
 
 import numpy as np
 
 from predicators import utils
 from predicators.option_model import _OptionModelBase
 from predicators.planning import run_backtracking_refinement
-from predicators.structs import GroundAtom, Object, ParameterizedOption, \
-    Predicate, State, Task, Type, _Option
+from predicators.structs import GroundAtom, Object, OptionSampler, \
+    ParameterizedOption, Predicate, State, Task, Type, _Option
 
 # Signature of an info-gain scorer: given a candidate post-state and the
 # atoms whose truth the step is meant to establish, return a scalar where
@@ -101,11 +101,18 @@ def build_solve_prompt(
     trajectory_summary: str = "",
     tool_names: Optional[Sequence[str]] = None,
     experiment_guidance: str = "",
+    prior_failures: str = "",
 ) -> str:
     """Build the bilevel solve/explore prompt asking for a plan sketch.
 
     Mirrors ``AgentBilevelApproach._build_solve_prompt`` but takes
     dependencies explicitly so explorers can reuse it.
+
+    ``prior_failures`` is a pre-formatted block summarizing earlier
+    sketch attempts that the backtracking search could not refine (with a
+    pointer to the full per-step log in the sandbox). Injected so a
+    re-query produces a *different* skeleton instead of re-emitting the
+    dead one.
     """
     init_state = task.init
     objects = list(init_state)
@@ -157,6 +164,18 @@ def build_solve_prompt(
         experiment_section = (f"\n## Experiment Guidance\n"
                               f"{experiment_guidance}\n")
 
+    prior_failures_section = ""
+    if prior_failures:
+        prior_failures_section = (
+            "\n## Previous Sketch Attempts (FAILED — do NOT repeat them)\n"
+            "Each block below is a sketch you already tried and the "
+            "backtracking search could NOT refine, with where it got stuck "
+            "and a pointer to the full per-step refinement log (read it with "
+            "`Read` for details). Produce a DIFFERENT skeleton that avoids "
+            "the failure — change the step that got stuck (object choice, "
+            "ordering, an intermediate step, or its subgoal annotation).\n"
+            f"{prior_failures}\n")
+
     goal_nl_section = ""
     if task.goal_nl:
         goal_nl_section = f"\n## Goal Description\n{task.goal_nl}\n"
@@ -168,7 +187,11 @@ def build_solve_prompt(
     pred_strs = []
     for pred in sorted(all_predicates, key=lambda p: p.name):
         type_sig = ", ".join(t.name for t in pred.types)
-        pred_strs.append(f"  {pred.name}({type_sig})")
+        line = f"  {pred.name}({type_sig})"
+        if pred.natural_language_assertion is not None:
+            names = [t.name for t in pred.types]
+            line += f" — {pred.natural_language_assertion(names)}"
+        pred_strs.append(line)
 
     prompt = f"""You are solving a task. \
 Generate a plan sketch to achieve the goal.
@@ -187,7 +210,7 @@ def build_solve_prompt(
 
 ## Available Predicates (for subgoal annotations)
 {chr(10).join(pred_strs)}
-{trajectory_summary}{tools_str}
+{trajectory_summary}{tools_str}{prior_failures_section}
 ## Instructions
 Use your available tools to inspect the environment before producing the plan.
 
@@ -246,7 +269,11 @@ def parse_subgoal_annotations(
     results: List[Optional[Tuple[Set[GroundAtom], Set[GroundAtom]]]] = []
 
     for line in text.split('\n'):
-        stripped = line.strip()
+        # Mirror the enumeration-prefix tolerance in the option-plan
+        # parser so the per-line subgoal results stay index-parallel with
+        # the parsed options (a numbered "0: Pick(...)" line must be seen
+        # as an option line here too, else annotations misalign).
+        stripped = utils.strip_enumeration_prefix(line.strip())
         if not stripped:
             continue
         first_token = stripped.split('(')[0]
@@ -368,6 +395,7 @@ def refine_sketch(
     elapsed_holder: Optional[List[float]] = None,
     info_scorer: Optional[InfoScorer] = None,
     info_n_feasible_target: int = 1,
+    option_samplers: Optional[Dict[str, OptionSampler]] = None,
 ) -> Tuple[List[_Option], bool, int]:
     """Backtracking search over continuous parameters for a plan sketch.
 
@@ -415,6 +443,14 @@ def refine_sketch(
     from the sketch's subgoal annotations into ``grounded.memory`` so
     that ``WaitOption`` terminates on the intended atom change rather
     than the first incidental one.
+
+    ``option_samplers`` maps an option name to a per-skill sampler
+    ``(state, subgoal_atoms, rng, objects) -> params`` (the NSRTSampler
+    signature, with the step subgoal in the atoms slot), used on both
+    plain and info-seeking draws to aim that option's parameters at the
+    subgoal instead of drawing uniformly. The return is clipped to the
+    option's box; a missing or misbehaving sampler falls back to uniform
+    sampling.
     """
     if not sketch:
         return [], False, 0
@@ -431,6 +467,42 @@ def refine_sketch(
     deepest_fail_idx: List[int] = [-1]
     deepest_fail_prefix: List[List[Optional[_Option]]] = [[]]
 
+    # Options whose synthesized sampler already misbehaved once — so the
+    # per-draw fallback warning fires at most once per option, not on every
+    # one of the (potentially thousands of) draws during backtracking.
+    _sampler_warned: Set[str] = set()
+
+    def _draw_params(step: SketchStep, state: State,
+                     rng_: np.random.Generator) -> np.ndarray:
+        """Draw continuous params for a step's option.
+
+        Prefers the per-skill sampler registered under the option's name;
+        falls back to uniform ``sample_params`` when none is registered or
+        when it raises or returns a wrong-shaped or NaN-containing vector.
+        """
+        sampler = (option_samplers or {}).get(step.option.name)
+        if sampler is not None:
+            box = step.option.params_space
+            expected = box.shape[0]
+            try:
+                raw = sampler(state, step.subgoal_atoms or set(), rng_,
+                              list(step.objects))
+                params = np.asarray(raw, dtype=np.float32).reshape(-1)
+                has_nan = bool(np.isnan(params).any())  # NaN survives clip
+                if params.shape == (expected, ) and not has_nan:
+                    return np.clip(params, box.low, box.high)
+                reason = (f"returned shape {params.shape} (NaN: {has_nan}), "
+                          f"expected ({expected},) with no NaN")
+            except Exception as e:  # pylint: disable=broad-except
+                reason = f"raised {type(e).__name__}: {e}"
+            if step.option.name not in _sampler_warned:
+                _sampler_warned.add(step.option.name)
+                logging.warning(
+                    "[%s] synthesized sampler for %s %s; falling back to "
+                    "uniform sampling for this option.", run_id,
+                    step.option.name, reason)
+        return sample_params(step.option, rng_)
+
     def _ground(step: SketchStep, params: np.ndarray) -> _Option:
         grounded = step.option.ground(list(step.objects), params)
         if grounded.name == "Wait":
@@ -458,10 +530,21 @@ def _info_seeking_applies(step: SketchStep) -> bool:
     # step exhausts precisely when every pooled candidate has been tried
     # (with 1-draw fillers for attempts left over when the pool came up
     # short of the target).
+    def _is_deterministic(step: SketchStep) -> bool:
+        # A sampler exposing a truthy ``deterministic`` attribute returns
+        # constant params, so re-drawing cannot yield a new option: give the
+        # step one attempt and let backtracking move straight past it.
+        if not option_samplers:
+            return False
+        registered = option_samplers.get(step.option.name)
+        return bool(getattr(registered, "deterministic", False))
+
     max_tries = []
     for _step in sketch:
         if _step.option.params_space.shape[0] == 0:
             max_tries.append(1)
+        elif _is_deterministic(_step):
+            max_tries.append(1)
         elif _info_seeking_applies(_step):
             max_tries.append(info_n_feasible_target)
         else:
@@ -538,7 +621,7 @@ def _sample_info_seeking(step: SketchStep, state: State,
         first_candidate: Optional[_Option] = None
         n_draws = 0
         while len(scored) < info_n_feasible_target and n_draws < draw_cap:
-            grounded = _ground(step, sample_params(step.option, rng_))
+            grounded = _ground(step, _draw_params(step, state, rng_))
             n_draws += 1
             if first_candidate is None:
                 first_candidate = grounded
@@ -610,7 +693,7 @@ def sample_fn(idx: int, state: State,
                           f"{state.pretty_str()}")
         if _info_seeking_applies(step):
             return _sample_info_seeking(step, state, rng_, idx)
-        return _ground(step, sample_params(step.option, rng_))
+        return _ground(step, _draw_params(step, state, rng_))
 
     def validate_fn(idx: int, _pre_state: State, _option: _Option,
                     post_state: State, _num_actions: int) -> Tuple[bool, str]:
@@ -861,3 +944,146 @@ def validate_fn(i: int, _pre: State, _opt: _Option, post: State,
             completed, opt_str, last_err or "unknown reason")
 
     return False, diagnosis_holder[0] or "validation failed"
+
+
+def resolve_refine_timeout(
+    timeout: Optional[float],
+    n_steps: int,
+    *,
+    per_step: float,
+    minimum: float,
+) -> Tuple[float, str]:
+    """Pick the refinement timeout, scaling with sketch length if unset.
+
+    An explicit ``timeout`` is returned unchanged (as a float) and tagged
+    ``"explicit"``. A ``None`` timeout resolves to
+    ``max(minimum, per_step * n_steps)`` and is tagged ``"auto"``, so
+    longer sketches receive a larger budget. ``per_step`` and ``minimum``
+    are supplied by the caller rather than read from ``CFG``.
+    """
+    if timeout is not None:
+        return float(timeout), "explicit"
+    return float(max(minimum, per_step * n_steps)), "auto"
+
+
+def refine_and_validate_report(
+    task: Task,
+    sketch: List[SketchStep],
+    option_model: _OptionModelBase,
+    *,
+    predicates: Set[Predicate],
+    timeout: float,
+    rng: np.random.Generator,
+    max_samples_per_step: int,
+    check_subgoals: bool,
+    log_state: bool = False,
+    option_samplers: Optional[Dict[str, OptionSampler]] = None,
+    run_id: str = "refine",
+    timeout_source: str = "explicit",
+    extra_summary_lines: Optional[List[str]] = None,
+) -> Tuple[bool, str]:
+    """Refine a sketch, forward-validate on success, return a report.
+
+    Runs ``refine_sketch`` (backtracking search over continuous params)
+    and, when refinement succeeds, ``validate_plan_forward`` (continuous
+    re-execution). Returns ``(overall_success, human_readable_report)``;
+    ``overall_success`` is True only if both stages pass, and an exception
+    raised by forward validation is logged and counted as a failure. The
+    report gives the verdict (SUCCESS, or FAILURE with an optional
+    TIMEOUT / SAMPLE_EXHAUSTED / FORWARD_VALIDATION_FAILED suffix),
+    per-step sample counts, the stuck step, and the validation outcome.
+
+    ``extra_summary_lines`` are appended verbatim after the time line
+    (e.g. a caller-specific ``Post-fit SSE`` line). Config-derived knobs
+    are passed explicitly so this module stays free of ``CFG``.
+    """
+    step_samples_cumulative: List[int] = [0] * len(sketch)
+    termination_reason: List[str] = []
+    elapsed_holder: List[float] = []
+    plan, success, n_samples = refine_sketch(
+        task,
+        sketch,
+        option_model,
+        predicates=predicates,
+        timeout=timeout,
+        rng=rng,
+        max_samples_per_step=max_samples_per_step,
+        check_subgoals=check_subgoals,
+        log_state=log_state,
+        run_id=run_id,
+        step_samples_cumulative=step_samples_cumulative,
+        termination_reason=termination_reason,
+        elapsed_holder=elapsed_holder,
+        option_samplers=option_samplers,
+    )
+
+    reason = termination_reason[0] if termination_reason else (
+        "success" if success else "exhausted")
+    elapsed = elapsed_holder[0] if elapsed_holder else 0.0
+    if success:
+        verdict = "SUCCESS"
+    elif reason == "timeout":
+        verdict = "FAILURE: TIMEOUT"
+    elif reason == "exhausted":
+        verdict = "FAILURE: SAMPLE_EXHAUSTED"
+    else:
+        verdict = "FAILURE"
+
+    lines = [
+        verdict,
+        f"  Sketch: {len(sketch)} steps  Refined: {len(plan)} steps  "
+        f"Samples: {n_samples} total",
+        f"  Per-step samples: {step_samples_cumulative}  "
+        f"(cap {max_samples_per_step}/step)",
+        f"  Time: {elapsed:.1f}s used / {timeout:.1f}s allotted "
+        f"(timeout source: {timeout_source})",
+    ]
+    if extra_summary_lines:
+        lines.extend(extra_summary_lines)
+    if not success and len(plan) < len(sketch):
+        stuck_idx = len(plan)
+        stuck = sketch[stuck_idx]
+        objs = ", ".join(f"{o.name}:{o.type.name}" for o in stuck.objects)
+        lines.append(f"  Stuck at step {stuck_idx}: "
+                     f"{stuck.option.name}({objs})")
+        if stuck.subgoal_atoms:
+            atoms = ", ".join(str(a) for a in stuck.subgoal_atoms)
+            lines.append(f"    subgoals: {atoms}")
+
+    # Forward validation: re-execute the refined plan continuously (state
+    # carries forward across all options). Refinement's per-step resets
+    # and resampling can mask drift the real env will hit at test time.
+    if success:
+        try:
+            fv_ok, fv_reason = validate_plan_forward(
+                task,
+                plan,
+                option_model,
+                predicates=predicates,
+                sketch=sketch,
+                run_id=run_id,
+            )
+        except Exception as e:  # pylint: disable=broad-except
+            logging.exception("[%s] forward validation raised", run_id)
+            fv_ok = False
+            fv_reason = f"forward validation raised {type(e).__name__}: {e}"
+        if fv_ok:
+            lines.append("  Forward validation: SUCCESS")
+        else:
+            # Demote the headline verdict: refinement passed but the plan
+            # does not survive continuous execution, which is what the
+            # real env will see at test time.
+            success = False
+            lines[0] = "FAILURE: FORWARD_VALIDATION_FAILED"
+            lines.append(f"  Forward validation: FAIL — {fv_reason}")
+            lines.append(
+                "    (Refinement resets state between options and "
+                "resamples up to the per-step cap; forward validation "
+                "runs the same plan once continuously. A divergence here "
+                "means the refined plan does not survive continuous "
+                "execution — accumulated drift, or (when the model is "
+                "learned) a rule/threshold more permissive than the env's "
+                "effective behavior. See the INFO log for the step-by-step "
+                "divergence.)")
+
+    return success, "\n".join(lines)