
Commit f7485d4 ("attempt 1")
Parent: a612201

7 files changed: 213 additions, 157 deletions

verifiers/envs/environment.py

Lines changed: 33 additions & 23 deletions
@@ -7,6 +7,7 @@
 import json
 import logging
 import signal
+import sys
 import time
 import uuid
 from abc import ABC, abstractmethod
@@ -28,6 +29,7 @@
 
 from openai import AsyncOpenAI, BadRequestError, OpenAI
 
+from verifiers.utils.eval_utils import filter_inputs
 from verifiers.utils.worker_utils import get_free_port
 from verifiers.workers.client.zmq_env_client import ZMQEnvClient
 from verifiers.workers.server.zmq_env_server import ZMQEnvServer
@@ -71,7 +73,10 @@
 from verifiers.utils.save_utils import (
     GenerateOutputsBuilder,
     make_dataset,
-    save_generate_outputs,
+    push_results_to_hf_hub,
+    save_metadata,
+    save_new_outputs,
+    save_outputs,
     state_to_output,
 )
 from verifiers.utils.token_utils import (
@@ -673,12 +678,15 @@ async def init_state(
             state_input["info"] = json.loads(state_input["info"])
         if "task" not in state_input:
             state_input["task"] = self.env_id or "default"
+        # Extract rollout_idx before creating RolloutInput (it's not part of RolloutInput)
+        rollout_idx = state_input.pop("rollout_idx", 0)
         state = State(input=RolloutInput(**state_input))  # type: ignore[missing-typed-dict-key]
         state["client"] = client
         state["model"] = model
         state["sampling_args"] = sampling_args
         state["is_completed"] = False
         state["is_truncated"] = False
+        state["rollout_idx"] = rollout_idx
         state["oai_tools"] = None
         if "info" in state and hasattr(state["info"], "oai_tools"):
             state["oai_tools"] = state["info"]["oai_tools"]
@@ -845,7 +853,6 @@ async def generate(
         results_path: Path | None = None,
         state_columns: list[str] | None = None,
         save_results: bool = False,
-        save_every: int = -1,
         push_to_hf_hub: bool = False,
         hf_hub_dataset_name: str | None = None,
         use_tqdm: bool = True,
@@ -865,6 +872,10 @@
         elif isinstance(inputs, list):
             inputs_list = inputs
 
+        if not inputs_list:
+            self.logger.info("No inputs to generate")
+            sys.exit(0)
+
         # notify caller of actual total count (useful when num_examples=-1)
         if on_start is not None:
             on_start(len(inputs_list))
@@ -879,7 +890,7 @@
         else:
             sampling_args = default_sampling_args
 
-        # Initialize builder for incremental serialization
+        # initialize generate outputs builder
        builder = GenerateOutputsBuilder(
            env_id=self.env_id,
            env_args=self.env_args,
@@ -947,15 +958,14 @@
         # process tasks as they complete
         reward_sum, reward_count = 0, 0
         groups_or_rollouts_completed = 0
+        outputs: list[RolloutOutput] = []
         try:
             for coro in asyncio.as_completed(tasks.keys()):
                 result = await coro
 
                 # normalize: independent_scoring returns RolloutOutput, group returns list[RolloutOutput]
-                outputs = [result] if independent_scoring else result
-
-                # Serialize states to outputs immediately (serialization happens once here)
-                new_outputs = builder.add_outputs(outputs)
+                new_outputs = [result] if independent_scoring else result
+                builder.add_outputs(new_outputs)
                 groups_or_rollouts_completed += 1
 
                 # track reward for rolling average (from outputs)
@@ -971,19 +981,15 @@
                     if reward_count > 0:
                         pbar.set_postfix(reward=f"{reward_sum / reward_count:.3f}")
                 elif on_progress is not None:
-                    on_progress(builder.outputs, new_outputs)
-
-                # save intermediate results (outputs already serialized, no redundant work)
-                if (
-                    save_results
-                    and save_every > 0
-                    and groups_or_rollouts_completed % save_every == 0
-                ):
-                    intermediate_results = builder.build()
+                    on_progress(outputs, new_outputs)
+
+                if save_results:
+                    # incrementally save outputs
+                    save_new_outputs(new_outputs, builder.results_path)
+                    save_metadata(builder.build_metadata(), builder.results_path)
                     self.logger.debug(
-                        f"Saving intermediate results to {intermediate_results['metadata']['path_to_save']}"
+                        f"Saved {len(new_outputs)} new outputs to {builder.results_path}"
                     )
-                    save_generate_outputs(intermediate_results)
         finally:
             # cancel all outstanding tasks and await their completion
             pending = [task for task in tasks.keys() if not task.done()]
@@ -999,7 +1005,10 @@
 
         # save if requested
         if save_results:
-            save_generate_outputs(results, push_to_hf_hub, hf_hub_dataset_name)
+            save_outputs(results["outputs"], builder.results_path)
+            save_metadata(results["metadata"], builder.results_path)
+            if push_to_hf_hub:
+                push_results_to_hf_hub(results, hf_hub_dataset_name)
         if on_log is not None:
             on_log(f"Saved final outputs to {results['metadata']['path_to_save']}")
 
@@ -1063,7 +1072,7 @@ async def evaluate(
         results_path: Path | None = None,
         state_columns: list[str] | None = None,
         save_results: bool = False,
-        save_every: int = -1,
+        resume_path: Path | None = None,
         push_to_hf_hub: bool = False,
         hf_hub_dataset_name: str | None = None,
         use_tqdm: bool = True,
@@ -1078,6 +1087,8 @@
         Evaluate model on the Environment evaluation dataset.
         """
         inputs = self._get_eval_inputs(num_examples, rollouts_per_example)
+        if resume_path is not None:
+            inputs = filter_inputs(inputs, resume_path, rollouts_per_example)
         return await self.generate(
             inputs,
             client=client,
@@ -1087,7 +1098,6 @@
             results_path=results_path,
             state_columns=state_columns,
             save_results=save_results,
-            save_every=save_every,
             push_to_hf_hub=push_to_hf_hub,
             hf_hub_dataset_name=hf_hub_dataset_name,
             use_tqdm=use_tqdm,
@@ -1110,7 +1120,7 @@ def evaluate_sync(
         results_path: Path | None = None,
         state_columns: list[str] | None = None,
         save_results: bool = False,
-        save_every: int = -1,
+        resume_path: Path | None = None,
         push_to_hf_hub: bool = False,
         hf_hub_dataset_name: str | None = None,
         independent_scoring: bool = False,
@@ -1129,7 +1139,7 @@
             results_path=results_path,
             state_columns=state_columns,
             save_results=save_results,
-            save_every=save_every,
+            resume_path=resume_path,
             push_to_hf_hub=push_to_hf_hub,
             hf_hub_dataset_name=hf_hub_dataset_name,
             independent_scoring=independent_scoring,
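
The change above replaces the periodic save_every snapshot with an append-as-you-go scheme: each completed group goes through save_new_outputs, and save_metadata rewrites the small metadata file. Those helpers live in verifiers/utils/save_utils.py, which this commit does not show; the sketch below is one plausible shape for them, with the file names taken from is_valid_eval_results_path further down and everything else an assumption.

# Hypothetical sketch of the save_utils helpers imported above; the real
# implementations are not part of this commit. The file names match what
# is_valid_eval_results_path checks for; the JSON layout is an assumption.
import json
from pathlib import Path

def save_new_outputs(new_outputs: list[dict], results_path: Path) -> None:
    # append-only: each completed group adds lines and never rewrites old ones,
    # which is what makes per-group saving cheap compared to periodic snapshots
    results_path.mkdir(parents=True, exist_ok=True)
    with open(results_path / "results.jsonl", "a") as f:
        for output in new_outputs:
            f.write(json.dumps(output) + "\n")

def save_metadata(metadata: dict, results_path: Path) -> None:
    # metadata stays small, so rewriting the whole file on each update is cheap
    with open(results_path / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)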

verifiers/scripts/eval.py

Lines changed: 17 additions & 7 deletions
@@ -1,5 +1,7 @@
 import os
 
+from verifiers.utils.path_utils import is_valid_eval_results_path
+
 # Suppress tokenizers parallelism warning (only prints when env var is unset)
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
 
@@ -212,15 +214,13 @@ def main():
         help="Save results to disk",
     )
     parser.add_argument(
-        "--save-every",
-        "-f",
-        type=int,
-        default=DEFAULT_SAVE_EVERY,
-        help="Save dataset every n rollouts (-1 to disable)",
+        "--resume-path",
+        type=str,
+        default=None,
+        help="Resume from a previous run.",
     )
     parser.add_argument(
         "--independent-scoring",
-        "-R",
         default=False,
         action="store_true",
         help="Score each rollout individually instead of scoring by group",
@@ -389,6 +389,16 @@ def build_eval_config(raw: dict) -> EvalConfig:
         extra_headers=merged_headers,
     )
 
+    # handle resume path resolution
+    resume_path = raw.get("resume_path")
+    if resume_path is not None:
+        resume_path = Path(resume_path)
+        if not is_valid_eval_results_path(resume_path):
+            raise ValueError(
+                f"Resume path {resume_path} is not a valid evaluation results path"
+            )
+        logger.info(f"Resuming from: {resume_path}")
+
     return EvalConfig(
         env_id=env_id,
         env_args=raw.get("env_args", {}),
@@ -404,7 +414,7 @@
         verbose=raw.get("verbose", False),
         state_columns=raw.get("state_columns", []),
         save_results=raw.get("save_results", False),
-        save_every=raw.get("save_every", DEFAULT_SAVE_EVERY),
+        resume_path=resume_path,
         independent_scoring=raw.get("independent_scoring", False),
         save_to_hf_hub=raw.get("save_to_hf_hub", False),
         hf_hub_dataset_name=raw.get("hf_hub_dataset_name", ""),
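
Downstream, build_eval_config is what wires the new flag into EvalConfig. A minimal sketch of driving it directly: the keys mirror the raw.get(...) calls above, while the env_id value and the path are made up, and other required config keys are elided.

# sketch: resume keys as build_eval_config reads them; values are hypothetical
raw = {
    "env_id": "my-env",                           # made-up environment id
    "save_results": True,
    "resume_path": "outputs/evals/my-env/run-1",  # must contain results.jsonl and metadata.json
}
config = build_eval_config(raw)  # raises ValueError if resume_path is not a valid results dir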

verifiers/types.py

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ class EvalConfig(BaseModel):
     # saving
     state_columns: list[str] | None = None
     save_results: bool = False
-    save_every: int = -1
+    resume_path: Path | None = None
     save_to_hf_hub: bool = False
     hf_hub_dataset_name: str | None = None

verifiers/utils/eval_display.py

Lines changed: 0 additions & 4 deletions
@@ -318,10 +318,6 @@ def fmt_concurrency(val: int) -> str:
     if config.save_results:
         config_line.append(" | ", style="dim")
         config_line.append("saving results", style="white")
-        if config.save_every > 0:
-            config_line.append(" every ", style="dim")
-            config_line.append(str(config.save_every), style="white")
-            config_line.append(" steps", style="dim")
 
     # create progress bar with timing
     # use env_state.total which gets updated by on_start callback

verifiers/utils/eval_utils.py

Lines changed: 28 additions & 2 deletions
@@ -13,6 +13,8 @@
 from datasets import disable_progress_bar, enable_progress_bar
 from datasets.utils import logging as ds_logging
 
+from verifiers.utils.save_utils import load_outputs
+
 try:
     import tomllib  # type: ignore[import-not-found]
 except ImportError:
@@ -31,6 +33,7 @@
     GenerateOutputs,
     LogCallback,
     ProgressCallback,
+    RolloutInput,
     RolloutOutput,
     StartCallback,
 )
@@ -181,6 +184,30 @@ def load_toml_config(path: Path) -> list[dict]:
     return merged_eval_list
 
 
+def filter_inputs(
+    inputs: list[RolloutInput], results_path: Path, rollouts_per_example: int
+) -> list[RolloutInput]:
+    """Drop inputs whose example already has enough saved rollouts."""
+    saved_outputs = load_outputs(results_path)
+
+    inputs_by_example_id, outputs_by_example_id = defaultdict(list), defaultdict(list)
+    for input in inputs:
+        inputs_by_example_id[input["example_id"]].append(input)
+    for output in saved_outputs:
+        outputs_by_example_id[output["example_id"]].append(output)
+
+    filtered_inputs = []
+    for example_id in inputs_by_example_id.keys():
+        example_inputs = inputs_by_example_id[example_id]
+        example_outputs = outputs_by_example_id[example_id]
+        # number of rollouts still missing for this example
+        rollouts_left = rollouts_per_example - len(example_outputs)
+        if rollouts_left > 0:
+            filtered_inputs.extend(example_inputs[:rollouts_left])
+
+    return filtered_inputs
+
+
 def to_col_order(list_of_dicts: list[Mapping[str, float]]) -> dict[str, list[float]]:
     """Convert a list of mappings to a dictionary of lists, ordered by the keys of the first mapping."""
     if not list_of_dicts:
@@ -339,7 +365,7 @@ async def run_evaluation(
         await vf_env.start_server(extra_env_kwargs=config.extra_env_kwargs)
 
     # run evaluation
-    results_path = get_eval_results_path(config)
+    results_path = config.resume_path or get_eval_results_path(config)
     logger.debug(f"Starting evaluation with model: {config.model}")
     logger.debug(
         f"Configuration: num_examples={config.num_examples}, rollouts_per_example={config.rollouts_per_example}, max_concurrent={config.max_concurrent}"
@@ -356,7 +382,7 @@
         results_path=results_path,
         state_columns=config.state_columns,
         save_results=config.save_results,
-        save_every=config.save_every,
+        resume_path=config.resume_path,
         push_to_hf_hub=config.save_to_hf_hub,
         hf_hub_dataset_name=config.hf_hub_dataset_name,
         use_tqdm=use_tqdm,
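
Concretely, filter_inputs keeps only the rollouts an example is still missing. A toy walkthrough of the same arithmetic, with inputs and saved outputs as plain dicts keyed by example_id (load_outputs, which normally reads results.jsonl, is replaced here by a hard-coded list):

from collections import defaultdict

# 2 examples x 3 rollouts requested; example "a" already has 2 saved rollouts
inputs = [{"example_id": e, "rollout_idx": i} for e in ("a", "b") for i in range(3)]
saved = [{"example_id": "a"}, {"example_id": "a"}]

outputs_by_id = defaultdict(list)
for o in saved:
    outputs_by_id[o["example_id"]].append(o)

kept = []
for e in ("a", "b"):
    rollouts_left = 3 - len(outputs_by_id[e])  # rollouts_per_example - saved
    kept.extend([i for i in inputs if i["example_id"] == e][:rollouts_left])

assert len(kept) == 4  # 1 more rollout for "a", all 3 for "b"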

verifiers/utils/path_utils.py

Lines changed: 13 additions & 0 deletions
@@ -1,8 +1,11 @@
+import logging
 import uuid
 from pathlib import Path
 
 from verifiers.types import EvalConfig
 
+logger = logging.getLogger(__name__)
+
 
 def get_results_path(
     env_id: str,
@@ -28,6 +31,16 @@ def get_eval_results_path(config: EvalConfig) -> Path:
     return results_path
 
 
+def is_valid_eval_results_path(path: Path) -> bool:
+    """Check whether a path points to a saved evaluation results directory."""
+    return (
+        path.exists()
+        and path.is_dir()
+        and (path / "results.jsonl").exists()
+        and (path / "metadata.json").exists()
+    )
+
+
 def get_gepa_results_path(
     env_id: str,
     model: str,
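
The guard is purely structural: any directory containing both files counts as resumable, regardless of their contents. A quick sanity check against a scratch directory (the path is made up):

from pathlib import Path

run_dir = Path("outputs/evals/demo-run")  # hypothetical location
run_dir.mkdir(parents=True, exist_ok=True)
assert not is_valid_eval_results_path(run_dir)  # empty dir: not resumable

(run_dir / "results.jsonl").touch()
(run_dir / "metadata.json").touch()
assert is_valid_eval_results_path(run_dir)  # both files present: resumable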
