From 479b24b8b0acfdb6d9a7ca4f4c930a895c48d3b0 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 8 Mar 2026 00:25:52 -0500 Subject: [PATCH 01/46] Add periodic safe eval during training with reward conditioning Periodically run policy with safe/law-abiding reward conditioning values (high collision, offroad, overspeed penalties) and render videos + collect metrics under the eval/ wandb tab. - Add [safe_eval] config section with safe reward conditioning values - Add --config flag to C visualizer for alternative INI files - Generate temp INI with min=max bounds for deterministic safe conditioning - Render safe eval videos and collect metrics via subprocess - Refactor eval subprocess functions to share common helper - Fix visualize.c: guard rand()%0 crash, propagate eval_gif return value - Fix safe_eval(): call policy.eval(), pass actual LSTM done/reward signals --- pufferlib/config/ocean/drive.ini | 31 +++ pufferlib/ocean/drive/visualize.c | 25 ++- pufferlib/pufferl.py | 116 ++++++++++- pufferlib/utils.py | 336 +++++++++++++++--------------- 4 files changed, 329 insertions(+), 179 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index bae7e23972..1f13bde320 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -192,6 +192,37 @@ human_replay_eval = False ; Control only the self-driving car human_replay_control_mode = "control_sdc_only" +[safe_eval] +; If True, periodically run policy with safe/law-abiding reward conditioning and log videos + metrics +safe_eval = True +; How often to run safe eval (in training epochs). Defaults to render_interval. 
+safe_eval_interval = 250 +; Number of episodes to collect metrics over +safe_eval_num_episodes = 300 + +; Safe reward conditioning values (min=max to fix the value) +; High penalties for unsafe behavior +safe_eval_collision = -3.0 +safe_eval_offroad = -3.0 +safe_eval_overspeed = -1.0 +safe_eval_traffic_light = -1.0 +safe_eval_reverse = -0.0075 +safe_eval_comfort = -0.1 + +; Standard driving rewards +safe_eval_goal_radius = 2.0 +safe_eval_lane_align = 0.0025 +safe_eval_lane_center = -0.00075 +safe_eval_velocity = 0.005 +safe_eval_center_bias = 0.0 +safe_eval_vel_align = 1.0 +safe_eval_timestep = -0.00005 + +; Neutral scaling factors +safe_eval_throttle = 1.0 +safe_eval_steer = 1.0 +safe_eval_acc = 1.0 + [render] ; Mode to render a bunch of maps with a given policy ; Path to dataset used for rendering diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index b9183b45c1..896ebf7db8 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -193,11 +193,11 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in) { + const char *output_agent, int num_maps, int zoom_in, const char *config_path) { // Parse configuration from INI file env_init_config conf = {0}; - const char *ini_file = "pufferlib/config/ocean/drive.ini"; + const char *ini_file = config_path ? config_path : "pufferlib/config/ocean/drive.ini"; if (ini_parse(ini_file, handler, &conf) < 0) { fprintf(stderr, "Error: Could not load %s. 
Cannot determine environment configuration.\n", ini_file); return -1; @@ -432,13 +432,23 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o } int main(int argc, char *argv[]) { + // Scan for --config first so INI parsing uses the right file + const char *config_path = NULL; + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) { + config_path = argv[i + 1]; + break; + } + } + // Parse configuration from INI file - env_init_config conf = {0}; // Initialize to zero - const char *ini_file = "pufferlib/config/ocean/drive.ini"; + env_init_config conf = {0}; + const char *ini_file = config_path ? config_path : "pufferlib/config/ocean/drive.ini"; if (ini_parse(ini_file, handler, &conf) < 0) { fprintf(stderr, "Error: Could not load %s. Cannot determine environment configuration.\n", ini_file); return -1; } + int show_grid = 0; int obs_only = 0; int lasers = 0; @@ -518,10 +528,15 @@ int main(int argc, char *argv[]) { num_maps = atoi(argv[i + 1]); i++; } + } else if (strcmp(argv[i], "--config") == 0) { + // Already handled in pre-scan above + if (i + 1 < argc) { + i++; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in); + output_agent, num_maps, zoom_in, config_path); return 0; } diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 49486037d2..559f90f1f7 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -17,6 +17,7 @@ import subprocess import argparse import importlib +import json import configparser from threading import Thread from collections import defaultdict, deque @@ -519,14 +520,11 @@ def train(self): model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) if model_files: - # Take the latest checkpoint latest_cpt = max(model_files, key=os.path.getctime) bin_path = f"{model_dir}.bin" - # Export to .bin for rendering with raylib try: export_args = {"env_name": 
self.config["env"], "load_model_path": latest_cpt, **self.config} - export( args=export_args, env_name=self.config["env"], @@ -538,10 +536,21 @@ def train(self): bin_path_epoch = f"{model_dir}_epoch_{self.epoch:06d}.bin" shutil.copy2(bin_path, bin_path_epoch) - env_cfg = getattr(self.vecenv, "driver_env", None) - wandb_log = True if hasattr(self.logger, "wandb") and self.logger.wandb else False + wandb_log = bool(hasattr(self.logger, "wandb") and self.logger.wandb) wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None + + # Check if safe eval should also run at this interval + safe_eval_config = self.config.get("safe_eval", {}) + run_safe_eval = safe_eval_config.get("safe_eval", False) + safe_eval_interval = safe_eval_config.get("safe_eval_interval", self.render_interval) + should_safe_eval = run_safe_eval and self.epoch % safe_eval_interval == 0 + bin_path_safe = None + if should_safe_eval: + # Copy bin before render_videos deletes it + bin_path_safe = f"{bin_path_epoch}.safe_eval.bin" + shutil.copy2(bin_path_epoch, bin_path_safe) + if self.render_async: # Clean up finished processes self.render_processes = [p for p in self.render_processes if p.is_alive()] @@ -583,6 +592,31 @@ def train(self): wandb_run=wandb_run, ) + # Run safe eval using the copied bin (reuses the already-exported model) + if should_safe_eval: + safe_ini_path = None + try: + safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) + pufferlib.utils.render_videos( + self.config, env_cfg, self.logger.run_id, + wandb_log, self.epoch, self.global_step, + bin_path_safe, False, + wandb_run=wandb_run, + config_path=safe_ini_path, + wandb_prefix="eval", + ) + + pufferlib.utils.run_safe_eval_metrics_in_subprocess( + self.config, self.logger, self.global_step, safe_eval_config + ) + except Exception as e: + print(f"Failed to run safe eval: {e}") + finally: + if safe_ini_path and os.path.exists(safe_ini_path): + os.remove(safe_ini_path) + if bin_path_safe and 
os.path.exists(bin_path_safe): + os.remove(bin_path_safe) + except Exception as e: print(f"Failed to export model weights: {e}") @@ -606,19 +640,18 @@ def check_render_queue(self): result = self.render_queue.get_nowait() step = result["step"] videos = result["videos"] + prefix = result.get("wandb_prefix", "render") - # Log to wandb if available if hasattr(self.logger, "wandb") and self.logger.wandb: import wandb payload = {} if videos["output_topdown"]: - payload["render/world_state"] = [wandb.Video(p, format="mp4") for p in videos["output_topdown"]] + payload[f"{prefix}/world_state"] = [wandb.Video(p, format="mp4") for p in videos["output_topdown"]] if videos["output_agent"]: - payload["render/agent_view"] = [wandb.Video(p, format="mp4") for p in videos["output_agent"]] + payload[f"{prefix}/agent_view"] = [wandb.Video(p, format="mp4") for p in videos["output_agent"]] if payload: - # Custom step for render logs to prevent monotonic logic wandb errors payload["render_step"] = step self.logger.wandb.log(payload) @@ -1139,7 +1172,7 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): else: logger = None - train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {})) + train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), safe_eval=args.get("safe_eval", {})) if "vec" in args and "num_workers" in args["vec"]: train_config["num_workers"] = args["vec"]["num_workers"] pufferl = PuffeRL(train_config, vecenv, policy, logger) @@ -1323,6 +1356,65 @@ def eval(env_name, args=None, vecenv=None, policy=None): frames.append("Done") +def safe_eval(env_name, args=None, vecenv=None, policy=None): + """Evaluate policy with safe/law-abiding reward conditioning and output metrics.""" + args = args or load_config(env_name) + + args["vec"] = dict(backend="PufferEnv", num_envs=1) + args["env"]["num_agents"] = 64 + + vecenv = vecenv or load_env(env_name, args) + policy = policy or load_policy(args, vecenv, env_name) + policy.eval() 
+ + num_steps = args.get("safe_eval", {}).get("safe_eval_num_episodes", 300) + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + use_rnn = args["train"]["use_rnn"] + + ob, _ = vecenv.reset() + state = {} + dones = torch.zeros(num_agents, device=device) + prev_rewards = torch.zeros(num_agents, device=device) + if use_rnn: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + all_stats = defaultdict(list) + for _ in range(num_steps): + with torch.no_grad(): + ob_t = torch.as_tensor(ob).to(device) + if use_rnn: + state["reward"] = prev_rewards + state["done"] = dones + logits, value = policy.forward_eval(ob_t, state) + action, logprob, _ = pufferlib.pytorch.sample_logits(logits) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + + ob, rewards, terminals, truncations, infos = vecenv.step(action) + prev_rewards = torch.as_tensor(rewards).float().to(device) + dones = torch.as_tensor(np.maximum(terminals, truncations)).float().to(device) + for entry in infos: + if isinstance(entry, dict): + for k, v in entry.items(): + try: + float(v) + all_stats[k].append(v) + except (TypeError, ValueError): + pass + + metrics = {k: float(np.mean(v)) for k, v in all_stats.items() if len(v) > 0} + + print("SAFE_EVAL_METRICS_START") + print(json.dumps(metrics)) + print("SAFE_EVAL_METRICS_END") + + vecenv.close() + return metrics + + def sweep(args=None, env_name=None): args = args or load_config(env_name) if not args["wandb"] and not args["neptune"]: @@ -1756,7 +1848,7 @@ def render_task(map_path): def main(): - err = "Usage: puffer [train, eval, sweep, controlled_exp, autotune, profile, export, sanity, render] [env_name] [optional args]. --help for more info" + err = "Usage: puffer [train, eval, safe_eval, sweep, controlled_exp, autotune, profile, export, sanity, render] [env_name] [optional args]. 
--help for more info" if len(sys.argv) < 3: raise pufferlib.APIUsageError(err) @@ -1780,6 +1872,8 @@ def main(): sanity(env_name=env_name) elif mode == "render": render(env_name=env_name) + elif mode == "safe_eval": + safe_eval(env_name=env_name) else: raise pufferlib.APIUsageError(err) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 1f2ccd5142..28e5f496ee 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -4,185 +4,155 @@ import shutil import subprocess import json +import configparser +import tempfile + +# Mapping from safe_eval config keys to (reward_bound_min, reward_bound_max) pairs. +# Used by both generate_safe_eval_ini (underscore form) and the metrics subprocess (hyphen form). +SAFE_EVAL_BOUND_KEYS = [ + ("safe_eval_collision", "collision"), + ("safe_eval_offroad", "offroad"), + ("safe_eval_overspeed", "overspeed"), + ("safe_eval_traffic_light", "traffic_light"), + ("safe_eval_reverse", "reverse"), + ("safe_eval_comfort", "comfort"), + ("safe_eval_goal_radius", "goal_radius"), + ("safe_eval_lane_align", "lane_align"), + ("safe_eval_lane_center", "lane_center"), + ("safe_eval_velocity", "velocity"), + ("safe_eval_center_bias", "center_bias"), + ("safe_eval_vel_align", "vel_align"), + ("safe_eval_timestep", "timestep"), + ("safe_eval_throttle", "throttle"), + ("safe_eval_steer", "steer"), + ("safe_eval_acc", "acc"), +] + + +def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_name, wandb_keys=None): + """Run an evaluation subprocess and log metrics to wandb. - -def run_human_replay_eval_in_subprocess(config, logger, global_step): - """ - Run human replay evaluation in a subprocess and log metrics to wandb. - + Args: + config: Training config dict (must have data_dir, env) + logger: Logger with run_id and optional wandb attribute + global_step: Current global training step + mode: pufferl mode to run (e.g. 
"eval", "safe_eval") + extra_args: List of extra CLI args appended to the base command + marker_name: Marker prefix for JSON extraction (e.g. "WOSAC" looks for WOSAC_METRICS_START/END) + wandb_keys: If dict, maps metric keys to wandb keys. If None, logs all as eval/. """ + eval_name = marker_name.lower().replace("_", " ") try: run_id = logger.run_id model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) if not model_files: - print("No model files found for human replay evaluation") + print(f"No model files found for {eval_name} evaluation") return latest_cpt = max(model_files, key=os.path.getctime) - # Prepare evaluation command - eval_config = config["eval"] cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "False", - "--eval.human-replay-eval", - "True", - "--eval.human-replay-num-agents", - str(eval_config["human_replay_num_agents"]), - "--eval.human-replay-control-mode", - str(eval_config["human_replay_control_mode"]), - ] - - # Run human replay evaluation in subprocess + sys.executable, "-m", "pufferlib.pufferl", + mode, config["env"], + "--load-model-path", latest_cpt, + ] + extra_args + result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) + start_marker = f"{marker_name}_METRICS_START" + end_marker = f"{marker_name}_METRICS_END" + if result.returncode == 0: - # Extract JSON from stdout between markers stdout = result.stdout - if "HUMAN_REPLAY_METRICS_START" in stdout and "HUMAN_REPLAY_METRICS_END" in stdout: - start = stdout.find("HUMAN_REPLAY_METRICS_START") + len("HUMAN_REPLAY_METRICS_START") - end = stdout.find("HUMAN_REPLAY_METRICS_END") - json_str = stdout[start:end].strip() - human_replay_metrics = json.loads(json_str) + if start_marker in stdout and end_marker in stdout: + start = stdout.find(start_marker) + len(start_marker) + end 
= stdout.find(end_marker) + metrics = json.loads(stdout[start:end].strip()) - # Log to wandb if available if hasattr(logger, "wandb") and logger.wandb: - logger.wandb.log( - { - "eval/human_replay_collision_rate": human_replay_metrics["collision_rate"], - "eval/human_replay_offroad_rate": human_replay_metrics["offroad_rate"], - "eval/human_replay_completion_rate": human_replay_metrics["completion_rate"], - }, - step=global_step, - ) + if wandb_keys is not None: + payload = {wandb_keys[k]: metrics[k] for k in wandb_keys if k in metrics} + else: + payload = {f"eval/{k}": v for k, v in metrics.items()} + if payload: + logger.wandb.log(payload, step=global_step) else: - print(f"Human replay evaluation failed with exit code {result.returncode}: {result.stderr}") + print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[:500]}") except subprocess.TimeoutExpired: - print("Human replay evaluation timed out") + print(f"{eval_name} evaluation timed out") except Exception as e: - print(f"Failed to run human replay evaluation: {e}") + print(f"Failed to run {eval_name} evaluation: {e}") -def run_wosac_eval_in_subprocess(config, logger, global_step): - """ - Run WOSAC evaluation in a subprocess and log metrics to wandb. - - Args: - config: Configuration dictionary containing data_dir, env, and wosac settings - logger: Logger object with run_id and optional wandb attribute - epoch: Current training epoch - global_step: Current global training step - - Returns: - None. Prints error messages if evaluation fails. 
- """ - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if not model_files: - print("No model files found for WOSAC evaluation") - return - - latest_cpt = max(model_files, key=os.path.getctime) - - # Prepare evaluation command - eval_config = config.get("eval", {}) - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "True", - "--eval.wosac-num-agents", - str(eval_config.get("wosac_num_agents", 256)), - "--eval.wosac-init-mode", - str(eval_config.get("wosac_init_mode", "create_all_valid")), - "--eval.wosac-control-mode", - str(eval_config.get("wosac_control_mode", "control_wosac")), - "--eval.wosac-init-steps", - str(eval_config.get("wosac_init_steps", 10)), - "--eval.wosac-goal-behavior", - str(eval_config.get("wosac_goal_behavior", 2)), - "--eval.wosac-goal-radius", - str(eval_config.get("wosac_goal_radius", 2.0)), - "--eval.wosac-sanity-check", - str(eval_config.get("wosac_sanity_check", False)), - "--eval.wosac-aggregate-results", - str(eval_config.get("wosac_aggregate_results", True)), - ] - - # Run WOSAC evaluation in subprocess - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - - if result.returncode == 0: - # Extract JSON from stdout between markers - stdout = result.stdout - if "WOSAC_METRICS_START" in stdout and "WOSAC_METRICS_END" in stdout: - start = stdout.find("WOSAC_METRICS_START") + len("WOSAC_METRICS_START") - end = stdout.find("WOSAC_METRICS_END") - json_str = stdout[start:end].strip() - wosac_metrics = json.loads(json_str) - - # Log to wandb if available - if hasattr(logger, "wandb") and logger.wandb: - logger.wandb.log( - { - "eval/wosac_realism_meta_score": wosac_metrics["realism_meta_score"], - "eval/wosac_ade": wosac_metrics["ade"], - "eval/wosac_min_ade": 
wosac_metrics["min_ade"], - "eval/wosac_total_num_agents": wosac_metrics["total_num_agents"], - }, - step=global_step, - ) - else: - print(f"WOSAC evaluation failed with exit code {result.returncode}") - print(f"Error: {result.stderr}") +def run_human_replay_eval_in_subprocess(config, logger, global_step): + eval_config = config["eval"] + _run_eval_subprocess( + config, logger, global_step, + mode="eval", + extra_args=[ + "--eval.wosac-realism-eval", "False", + "--eval.human-replay-eval", "True", + "--eval.human-replay-num-agents", str(eval_config["human_replay_num_agents"]), + "--eval.human-replay-control-mode", str(eval_config["human_replay_control_mode"]), + ], + marker_name="HUMAN_REPLAY", + wandb_keys={ + "collision_rate": "eval/human_replay_collision_rate", + "offroad_rate": "eval/human_replay_offroad_rate", + "completion_rate": "eval/human_replay_completion_rate", + }, + ) - # Check for memory issues - stderr_lower = result.stderr.lower() - if "out of memory" in stderr_lower or "cuda out of memory" in stderr_lower: - print("GPU out of memory. Skipping this WOSAC evaluation.") - except subprocess.TimeoutExpired: - print("WOSAC evaluation timed out after 600 seconds") - except MemoryError as e: - print(f"WOSAC evaluation ran out of memory. 
Skipping this evaluation: {e}") - except Exception as e: - print(f"Failed to run WOSAC evaluation: {type(e).__name__}: {e}") +def run_wosac_eval_in_subprocess(config, logger, global_step): + eval_config = config.get("eval", {}) + _run_eval_subprocess( + config, logger, global_step, + mode="eval", + extra_args=[ + "--eval.wosac-realism-eval", "True", + "--eval.wosac-num-agents", str(eval_config.get("wosac_num_agents", 256)), + "--eval.wosac-init-mode", str(eval_config.get("wosac_init_mode", "create_all_valid")), + "--eval.wosac-control-mode", str(eval_config.get("wosac_control_mode", "control_wosac")), + "--eval.wosac-init-steps", str(eval_config.get("wosac_init_steps", 10)), + "--eval.wosac-goal-behavior", str(eval_config.get("wosac_goal_behavior", 2)), + "--eval.wosac-goal-radius", str(eval_config.get("wosac_goal_radius", 2.0)), + "--eval.wosac-sanity-check", str(eval_config.get("wosac_sanity_check", False)), + "--eval.wosac-aggregate-results", str(eval_config.get("wosac_aggregate_results", True)), + ], + marker_name="WOSAC", + wandb_keys={ + "realism_meta_score": "eval/wosac_realism_meta_score", + "ade": "eval/wosac_ade", + "min_ade": "eval/wosac_min_ade", + "total_num_agents": "eval/wosac_total_num_agents", + }, + ) def render_videos( - config, env_cfg, run_id, wandb_log, epoch, global_step, bin_path, render_async, render_queue=None, wandb_run=None + config, env_cfg, run_id, wandb_log, epoch, global_step, bin_path, render_async, + render_queue=None, wandb_run=None, config_path=None, wandb_prefix="render", ): """ Generate and log training videos using C-based rendering. Args: config: Configuration dictionary containing data_dir, env, and render settings - vecenv: Vectorized environment with driver_env attribute - logger: Logger object with run_id and optional wandb attribute + env_cfg: Environment config object (driver_env) with map_dir, num_maps, etc. 
+ run_id: Wandb/Neptune run identifier + wandb_log: Whether to log videos to wandb epoch: Current training epoch global_step: Current global training step bin_path: Path to the exported .bin model weights file - - Returns: - None. Prints error messages if rendering fails. + render_async: Whether rendering is async (uses render_queue) + render_queue: Queue for async render results + wandb_run: Wandb run object for sync logging + config_path: Optional path to alternative INI config file for the visualize binary + wandb_prefix: Prefix for wandb keys (e.g. "render" or "eval") """ if not os.path.exists(bin_path): print(f"Binary weights file does not exist: {bin_path}") @@ -190,21 +160,19 @@ def render_videos( model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - # Now call the C rendering function try: - # Create output directory for videos video_output_dir = os.path.join(model_dir, "videos") os.makedirs(video_output_dir, exist_ok=True) # TODO: Fix memory leaks so that this is not needed - # Suppress AddressSanitizer exit code (temp) env_vars = os.environ.copy() env_vars["ASAN_OPTIONS"] = "exitcode=0" - # Base command with only visualization flags (env config comes from INI) base_cmd = ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] - # Visualization config flags only + if config_path: + base_cmd.extend(["--config", config_path]) + if config.get("show_grid", False): base_cmd.append("--show-grid") if config.get("obs_only", False): @@ -216,16 +184,13 @@ def render_videos( if config.get("zoom_in", False): base_cmd.append("--zoom-in") - # Frame skip for rendering performance frame_skip = config.get("frame_skip", 1) if frame_skip > 1: base_cmd.extend(["--frame-skip", str(frame_skip)]) - # View mode view_mode = config.get("view_mode", "both") base_cmd.extend(["--view", view_mode]) - # Get num_maps if available if env_cfg is not None and getattr(env_cfg, "num_maps", None): base_cmd.extend(["--num-maps", str(env_cfg.num_maps)]) @@ -234,7 
+199,6 @@ def render_videos( # Handle single or multiple map rendering render_maps = config.get("render_map", None) if render_maps is None or render_maps == "none": - # Pick a random map from the training map_dir map_dir = None if env_cfg is not None and hasattr(env_cfg, "map_dir"): map_dir = env_cfg.map_dir @@ -253,25 +217,23 @@ def render_videos( elif isinstance(render_maps, (str, os.PathLike)): render_maps = [render_maps] else: - # Ensure list-like render_maps = list(render_maps) - # Collect videos to log as lists so W&B shows all in the same step + file_prefix = f"{wandb_prefix}_" if wandb_prefix != "render" else "" videos_to_log_world = [] videos_to_log_agent = [] generated_videos = {"output_topdown": [], "output_agent": []} - output_topdown = f"resources/drive/output_topdown_{epoch}" - output_agent = f"resources/drive/output_agent_{epoch}" + output_topdown = f"resources/drive/{file_prefix}output_topdown_{epoch}" + output_agent = f"resources/drive/{file_prefix}output_agent_{epoch}" for i, map_path in enumerate(render_maps): - cmd = list(base_cmd) # copy + cmd = list(base_cmd) if map_path is not None and os.path.exists(map_path): cmd.extend(["--map-name", str(map_path)]) output_topdown_map = output_topdown + (f"_map{i:02d}.mp4" if len(render_maps) > 1 else ".mp4") output_agent_map = output_agent + (f"_map{i:02d}.mp4" if len(render_maps) > 1 else ".mp4") - # Output paths (overwrite each iteration; then moved/renamed) cmd.extend(["--output-topdown", output_topdown_map]) cmd.extend(["--output-agent", output_agent_map]) @@ -284,12 +246,12 @@ def render_videos( ( "output_topdown", output_topdown_map, - f"epoch_{epoch:06d}_map{i:02d}_topdown.mp4" if map_path else f"epoch_{epoch:06d}_topdown.mp4", + f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_topdown.mp4" if map_path else f"{file_prefix}epoch_{epoch:06d}_topdown.mp4", ), ( "output_agent", output_agent_map, - f"epoch_{epoch:06d}_map{i:02d}_agent.mp4" if map_path else f"epoch_{epoch:06d}_agent.mp4", + 
f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_agent.mp4" if map_path else f"{file_prefix}epoch_{epoch:06d}_agent.mp4", ), ] @@ -300,7 +262,6 @@ def render_videos( generated_videos[vid_type].append(target_path) if render_async: continue - # Accumulate for a single wandb.log call if wandb_log: import wandb @@ -322,24 +283,73 @@ def render_videos( { "videos": generated_videos, "step": global_step, + "wandb_prefix": wandb_prefix, } ) - # Log all videos at once so W&B keeps all of them under the same step if wandb_log and (videos_to_log_world or videos_to_log_agent) and not render_async: payload = {} if videos_to_log_world: - payload["render/world_state"] = videos_to_log_world + payload[f"{wandb_prefix}/world_state"] = videos_to_log_world if videos_to_log_agent: - payload["render/agent_view"] = videos_to_log_agent + payload[f"{wandb_prefix}/agent_view"] = videos_to_log_agent wandb_run.log(payload, step=global_step) except subprocess.TimeoutExpired: print("C rendering timed out") except Exception as e: - print(f"Failed to generate GIF: {e}") + print(f"Failed to generate videos: {e}") finally: - # Clean up bin weights file if os.path.exists(bin_path): os.remove(bin_path) + + +def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/ocean/drive.ini"): + """Generate a temporary ini file with safe/law-abiding reward conditioning values. + + Sets reward_randomization=1 with min=max bounds so the conditioning values + are deterministically set to the safe values the policy sees in its observation. 
+ """ + config = configparser.ConfigParser() + config.read(base_ini_path) + + for safe_key, bound_name in SAFE_EVAL_BOUND_KEYS: + if safe_key in safe_eval_config: + val = str(safe_eval_config[safe_key]) + config.set("env", f"reward_bound_{bound_name}_min", val) + config.set("env", f"reward_bound_{bound_name}_max", val) + + config.set("env", "reward_randomization", "1") + config.set("env", "reward_conditioning", "1") + + fd, tmp_path = tempfile.mkstemp(suffix=".ini", prefix="safe_eval_") + with os.fdopen(fd, "w") as f: + config.write(f) + + return tmp_path + + +def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_config): + """Run policy evaluation with safe reward conditioning in a subprocess and log metrics.""" + num_episodes = safe_eval_config.get("safe_eval_num_episodes", 300) + + extra_args = [ + "--env.reward-randomization", "1", + "--env.reward-conditioning", "1", + "--safe-eval.safe-eval-num-episodes", str(num_episodes), + ] + + for safe_key, bound_name in SAFE_EVAL_BOUND_KEYS: + if safe_key in safe_eval_config: + val = str(safe_eval_config[safe_key]) + cli_name = bound_name.replace("_", "-") + extra_args.extend([f"--env.reward-bound-{cli_name}-min", val, + f"--env.reward-bound-{cli_name}-max", val]) + + _run_eval_subprocess( + config, logger, global_step, + mode="safe_eval", + extra_args=extra_args, + marker_name="SAFE_EVAL", + ) From 14e288aaccbc4413c1d460104807ac8bd8afa30b Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 8 Mar 2026 00:46:30 -0500 Subject: [PATCH 02/46] Move bin file cleanup from render_videos to caller render_videos no longer deletes its input bin file as a side effect. Cleanup of bin_path and bin_path_epoch is now handled in a finally block in pufferl.py, eliminating the need for a defensive copy (bin_path_safe). 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 16 +++++++--------- pufferlib/utils.py | 4 ---- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 559f90f1f7..d8595c2a97 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -545,11 +545,6 @@ def train(self): run_safe_eval = safe_eval_config.get("safe_eval", False) safe_eval_interval = safe_eval_config.get("safe_eval_interval", self.render_interval) should_safe_eval = run_safe_eval and self.epoch % safe_eval_interval == 0 - bin_path_safe = None - if should_safe_eval: - # Copy bin before render_videos deletes it - bin_path_safe = f"{bin_path_epoch}.safe_eval.bin" - shutil.copy2(bin_path_epoch, bin_path_safe) if self.render_async: # Clean up finished processes @@ -592,7 +587,7 @@ def train(self): wandb_run=wandb_run, ) - # Run safe eval using the copied bin (reuses the already-exported model) + # Run safe eval using the same bin (reuses the already-exported model) if should_safe_eval: safe_ini_path = None try: @@ -600,7 +595,7 @@ def train(self): pufferlib.utils.render_videos( self.config, env_cfg, self.logger.run_id, wandb_log, self.epoch, self.global_step, - bin_path_safe, False, + bin_path_epoch, False, wandb_run=wandb_run, config_path=safe_ini_path, wandb_prefix="eval", @@ -614,11 +609,14 @@ def train(self): finally: if safe_ini_path and os.path.exists(safe_ini_path): os.remove(safe_ini_path) - if bin_path_safe and os.path.exists(bin_path_safe): - os.remove(bin_path_safe) except Exception as e: print(f"Failed to export model weights: {e}") + finally: + if os.path.exists(bin_path): + os.remove(bin_path) + if os.path.exists(bin_path_epoch): + os.remove(bin_path_epoch) if self.config["eval"]["wosac_realism_eval"] and ( self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 28e5f496ee..d4be6dd78c 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ 
-300,10 +300,6 @@ def render_videos( except Exception as e: print(f"Failed to generate videos: {e}") - finally: - if os.path.exists(bin_path): - os.remove(bin_path) - def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/ocean/drive.ini"): """Generate a temporary ini file with safe/law-abiding reward conditioning values. From c65517559ca42c7f11e31e15ba310e8dee2957db Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 06:51:55 -0400 Subject: [PATCH 03/46] Fix eval subprocess: pass device and handle negative arg values Pass --train.device from parent config so eval subprocesses use the correct device (e.g. cpu). Use = syntax for reward bound args to prevent argparse from interpreting negative values like -5e-05 as flags. Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index d4be6dd78c..dae790a19c 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -57,6 +57,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n sys.executable, "-m", "pufferlib.pufferl", mode, config["env"], "--load-model-path", latest_cpt, + "--train.device", config.get("device", "cuda"), ] + extra_args result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) @@ -340,8 +341,9 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c if safe_key in safe_eval_config: val = str(safe_eval_config[safe_key]) cli_name = bound_name.replace("_", "-") - extra_args.extend([f"--env.reward-bound-{cli_name}-min", val, - f"--env.reward-bound-{cli_name}-max", val]) + # Use = syntax to avoid argparse interpreting negative values as flags + extra_args.extend([f"--env.reward-bound-{cli_name}-min={val}", + f"--env.reward-bound-{cli_name}-max={val}"]) _run_eval_subprocess( config, logger, global_step, From a674189c77c95dc872b53d9488c46e83e936ecda Mon Sep 17 00:00:00 
2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 06:54:26 -0400 Subject: [PATCH 04/46] Apply ruff formatting to utils.py Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 98 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index dae790a19c..ca99c30554 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -54,10 +54,15 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n latest_cpt = max(model_files, key=os.path.getctime) cmd = [ - sys.executable, "-m", "pufferlib.pufferl", - mode, config["env"], - "--load-model-path", latest_cpt, - "--train.device", config.get("device", "cuda"), + sys.executable, + "-m", + "pufferlib.pufferl", + mode, + config["env"], + "--load-model-path", + latest_cpt, + "--train.device", + config.get("device", "cuda"), ] + extra_args result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) @@ -91,13 +96,19 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n def run_human_replay_eval_in_subprocess(config, logger, global_step): eval_config = config["eval"] _run_eval_subprocess( - config, logger, global_step, + config, + logger, + global_step, mode="eval", extra_args=[ - "--eval.wosac-realism-eval", "False", - "--eval.human-replay-eval", "True", - "--eval.human-replay-num-agents", str(eval_config["human_replay_num_agents"]), - "--eval.human-replay-control-mode", str(eval_config["human_replay_control_mode"]), + "--eval.wosac-realism-eval", + "False", + "--eval.human-replay-eval", + "True", + "--eval.human-replay-num-agents", + str(eval_config["human_replay_num_agents"]), + "--eval.human-replay-control-mode", + str(eval_config["human_replay_control_mode"]), ], marker_name="HUMAN_REPLAY", wandb_keys={ @@ -111,18 +122,29 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): def run_wosac_eval_in_subprocess(config, logger, 
global_step): eval_config = config.get("eval", {}) _run_eval_subprocess( - config, logger, global_step, + config, + logger, + global_step, mode="eval", extra_args=[ - "--eval.wosac-realism-eval", "True", - "--eval.wosac-num-agents", str(eval_config.get("wosac_num_agents", 256)), - "--eval.wosac-init-mode", str(eval_config.get("wosac_init_mode", "create_all_valid")), - "--eval.wosac-control-mode", str(eval_config.get("wosac_control_mode", "control_wosac")), - "--eval.wosac-init-steps", str(eval_config.get("wosac_init_steps", 10)), - "--eval.wosac-goal-behavior", str(eval_config.get("wosac_goal_behavior", 2)), - "--eval.wosac-goal-radius", str(eval_config.get("wosac_goal_radius", 2.0)), - "--eval.wosac-sanity-check", str(eval_config.get("wosac_sanity_check", False)), - "--eval.wosac-aggregate-results", str(eval_config.get("wosac_aggregate_results", True)), + "--eval.wosac-realism-eval", + "True", + "--eval.wosac-num-agents", + str(eval_config.get("wosac_num_agents", 256)), + "--eval.wosac-init-mode", + str(eval_config.get("wosac_init_mode", "create_all_valid")), + "--eval.wosac-control-mode", + str(eval_config.get("wosac_control_mode", "control_wosac")), + "--eval.wosac-init-steps", + str(eval_config.get("wosac_init_steps", 10)), + "--eval.wosac-goal-behavior", + str(eval_config.get("wosac_goal_behavior", 2)), + "--eval.wosac-goal-radius", + str(eval_config.get("wosac_goal_radius", 2.0)), + "--eval.wosac-sanity-check", + str(eval_config.get("wosac_sanity_check", False)), + "--eval.wosac-aggregate-results", + str(eval_config.get("wosac_aggregate_results", True)), ], marker_name="WOSAC", wandb_keys={ @@ -135,8 +157,18 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): def render_videos( - config, env_cfg, run_id, wandb_log, epoch, global_step, bin_path, render_async, - render_queue=None, wandb_run=None, config_path=None, wandb_prefix="render", + config, + env_cfg, + run_id, + wandb_log, + epoch, + global_step, + bin_path, + render_async, + 
render_queue=None, + wandb_run=None, + config_path=None, + wandb_prefix="render", ): """ Generate and log training videos using C-based rendering. @@ -247,12 +279,16 @@ def render_videos( ( "output_topdown", output_topdown_map, - f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_topdown.mp4" if map_path else f"{file_prefix}epoch_{epoch:06d}_topdown.mp4", + f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_topdown.mp4" + if map_path + else f"{file_prefix}epoch_{epoch:06d}_topdown.mp4", ), ( "output_agent", output_agent_map, - f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_agent.mp4" if map_path else f"{file_prefix}epoch_{epoch:06d}_agent.mp4", + f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_agent.mp4" + if map_path + else f"{file_prefix}epoch_{epoch:06d}_agent.mp4", ), ] @@ -332,9 +368,12 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c num_episodes = safe_eval_config.get("safe_eval_num_episodes", 300) extra_args = [ - "--env.reward-randomization", "1", - "--env.reward-conditioning", "1", - "--safe-eval.safe-eval-num-episodes", str(num_episodes), + "--env.reward-randomization", + "1", + "--env.reward-conditioning", + "1", + "--safe-eval.safe-eval-num-episodes", + str(num_episodes), ] for safe_key, bound_name in SAFE_EVAL_BOUND_KEYS: @@ -342,11 +381,12 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c val = str(safe_eval_config[safe_key]) cli_name = bound_name.replace("_", "-") # Use = syntax to avoid argparse interpreting negative values as flags - extra_args.extend([f"--env.reward-bound-{cli_name}-min={val}", - f"--env.reward-bound-{cli_name}-max={val}"]) + extra_args.extend([f"--env.reward-bound-{cli_name}-min={val}", f"--env.reward-bound-{cli_name}-max={val}"]) _run_eval_subprocess( - config, logger, global_step, + config, + logger, + global_step, mode="safe_eval", extra_args=extra_args, marker_name="SAFE_EVAL", From 4517070cd764860596d3e0b7f3c5d09ecffc065f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: 
Mon, 9 Mar 2026 07:03:48 -0400 Subject: [PATCH 05/46] Decouple render and safe eval from checkpoint interval Render and safe eval now trigger on their own intervals independently of checkpoint_interval. They use the latest available .pt checkpoint, so they don't require a fresh one. This means safe_eval_interval no longer needs to be a multiple of checkpoint_interval. Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 134 +++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index d8595c2a97..2576642747 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -515,37 +515,40 @@ def train(self): self.save_checkpoint() self.msg = f"Checkpoint saved at update {self.epoch}" - if self.render and self.epoch % self.render_interval == 0: - model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if model_files: - latest_cpt = max(model_files, key=os.path.getctime) - bin_path = f"{model_dir}.bin" - - try: - export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} - export( - args=export_args, - env_name=self.config["env"], - vecenv=self.vecenv, - policy=self.uncompiled_policy, - path=bin_path, - silent=True, - ) - - bin_path_epoch = f"{model_dir}_epoch_{self.epoch:06d}.bin" - shutil.copy2(bin_path, bin_path_epoch) - env_cfg = getattr(self.vecenv, "driver_env", None) - wandb_log = bool(hasattr(self.logger, "wandb") and self.logger.wandb) - wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None - - # Check if safe eval should also run at this interval - safe_eval_config = self.config.get("safe_eval", {}) - run_safe_eval = safe_eval_config.get("safe_eval", False) - safe_eval_interval = safe_eval_config.get("safe_eval_interval", self.render_interval) - should_safe_eval = run_safe_eval and self.epoch % 
safe_eval_interval == 0 - + # Render and safe eval run on their own intervals, independent of checkpointing. + # They use the latest available checkpoint, so they don't need a fresh one. + should_render = self.render and self.epoch % self.render_interval == 0 + safe_eval_config = self.config.get("safe_eval", {}) + run_safe_eval = safe_eval_config.get("safe_eval", False) + safe_eval_interval = safe_eval_config.get("safe_eval_interval", self.render_interval) + should_safe_eval = run_safe_eval and self.epoch % safe_eval_interval == 0 + + if should_render or should_safe_eval: + model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + + if model_files: + latest_cpt = max(model_files, key=os.path.getctime) + bin_path = f"{model_dir}.bin" + + try: + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} + export( + args=export_args, + env_name=self.config["env"], + vecenv=self.vecenv, + policy=self.uncompiled_policy, + path=bin_path, + silent=True, + ) + + bin_path_epoch = f"{model_dir}_epoch_{self.epoch:06d}.bin" + shutil.copy2(bin_path, bin_path_epoch) + env_cfg = getattr(self.vecenv, "driver_env", None) + wandb_log = bool(hasattr(self.logger, "wandb") and self.logger.wandb) + wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None + + if should_render: if self.render_async: # Clean up finished processes self.render_processes = [p for p in self.render_processes if p.is_alive()] @@ -587,36 +590,41 @@ def train(self): wandb_run=wandb_run, ) - # Run safe eval using the same bin (reuses the already-exported model) - if should_safe_eval: - safe_ini_path = None - try: - safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) - pufferlib.utils.render_videos( - self.config, env_cfg, self.logger.run_id, - wandb_log, self.epoch, self.global_step, - bin_path_epoch, False, - wandb_run=wandb_run, - 
config_path=safe_ini_path, - wandb_prefix="eval", - ) - - pufferlib.utils.run_safe_eval_metrics_in_subprocess( - self.config, self.logger, self.global_step, safe_eval_config - ) - except Exception as e: - print(f"Failed to run safe eval: {e}") - finally: - if safe_ini_path and os.path.exists(safe_ini_path): - os.remove(safe_ini_path) - - except Exception as e: - print(f"Failed to export model weights: {e}") - finally: - if os.path.exists(bin_path): - os.remove(bin_path) - if os.path.exists(bin_path_epoch): - os.remove(bin_path_epoch) + # Run safe eval using the same bin (reuses the already-exported model) + if should_safe_eval: + safe_ini_path = None + try: + safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) + pufferlib.utils.render_videos( + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_path_epoch, + False, + wandb_run=wandb_run, + config_path=safe_ini_path, + wandb_prefix="eval", + ) + + pufferlib.utils.run_safe_eval_metrics_in_subprocess( + self.config, self.logger, self.global_step, safe_eval_config + ) + except Exception as e: + print(f"Failed to run safe eval: {e}") + finally: + if safe_ini_path and os.path.exists(safe_ini_path): + os.remove(safe_ini_path) + + except Exception as e: + print(f"Failed to export model weights: {e}") + finally: + if os.path.exists(bin_path): + os.remove(bin_path) + if os.path.exists(bin_path_epoch): + os.remove(bin_path_epoch) if self.config["eval"]["wosac_realism_eval"] and ( self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training @@ -645,7 +653,9 @@ def check_render_queue(self): payload = {} if videos["output_topdown"]: - payload[f"{prefix}/world_state"] = [wandb.Video(p, format="mp4") for p in videos["output_topdown"]] + payload[f"{prefix}/world_state"] = [ + wandb.Video(p, format="mp4") for p in videos["output_topdown"] + ] if videos["output_agent"]: payload[f"{prefix}/agent_view"] = [wandb.Video(p, format="mp4") for p in 
videos["output_agent"]] From a97ed9830da467bf8b78591042843f59d0b206f7 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 07:15:20 -0400 Subject: [PATCH 06/46] Simplify safe_eval config: remove redundant prefixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename safe_eval config keys to drop the safe_eval_ prefix since they're already in the [safe_eval] section. Replace SAFE_EVAL_BOUND_KEYS mapping with a simple SAFE_EVAL_REWARD_BOUNDS list since config keys now match bound names directly (e.g. collision instead of safe_eval_collision). Also rename safe_eval→enabled, safe_eval_interval→interval, safe_eval_num_agents→num_agents, safe_eval_num_episodes→num_episodes. Co-Authored-By: Claude Opus 4.6 --- pufferlib/config/ocean/drive.ini | 43 +++++++++++++------------ pufferlib/pufferl.py | 9 +++--- pufferlib/utils.py | 55 ++++++++++++++++---------------- 3 files changed, 56 insertions(+), 51 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 1f13bde320..b526e88b5a 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -194,34 +194,37 @@ human_replay_control_mode = "control_sdc_only" [safe_eval] ; If True, periodically run policy with safe/law-abiding reward conditioning and log videos + metrics -safe_eval = True +enabled = True ; How often to run safe eval (in training epochs). Defaults to render_interval. -safe_eval_interval = 250 +interval = 250 +; Number of agents to run in the eval environment +num_agents = 64 ; Number of episodes to collect metrics over -safe_eval_num_episodes = 300 +num_episodes = 300 -; Safe reward conditioning values (min=max to fix the value) +; Reward conditioning values (min=max to fix the value). +; Names match the env reward_bound_* keys. 
; High penalties for unsafe behavior -safe_eval_collision = -3.0 -safe_eval_offroad = -3.0 -safe_eval_overspeed = -1.0 -safe_eval_traffic_light = -1.0 -safe_eval_reverse = -0.0075 -safe_eval_comfort = -0.1 +collision = -3.0 +offroad = -3.0 +overspeed = -1.0 +traffic_light = -1.0 +reverse = -0.0075 +comfort = -0.1 ; Standard driving rewards -safe_eval_goal_radius = 2.0 -safe_eval_lane_align = 0.0025 -safe_eval_lane_center = -0.00075 -safe_eval_velocity = 0.005 -safe_eval_center_bias = 0.0 -safe_eval_vel_align = 1.0 -safe_eval_timestep = -0.00005 +goal_radius = 2.0 +lane_align = 0.0025 +lane_center = -0.00075 +velocity = 0.005 +center_bias = 0.0 +vel_align = 1.0 +timestep = -0.00005 ; Neutral scaling factors -safe_eval_throttle = 1.0 -safe_eval_steer = 1.0 -safe_eval_acc = 1.0 +throttle = 1.0 +steer = 1.0 +acc = 1.0 [render] ; Mode to render a bunch of maps with a given policy diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 2576642747..32a6e877fd 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -519,8 +519,8 @@ def train(self): # They use the latest available checkpoint, so they don't need a fresh one. 
should_render = self.render and self.epoch % self.render_interval == 0 safe_eval_config = self.config.get("safe_eval", {}) - run_safe_eval = safe_eval_config.get("safe_eval", False) - safe_eval_interval = safe_eval_config.get("safe_eval_interval", self.render_interval) + run_safe_eval = safe_eval_config.get("enabled", False) + safe_eval_interval = safe_eval_config.get("interval", self.render_interval) should_safe_eval = run_safe_eval and self.epoch % safe_eval_interval == 0 if should_render or should_safe_eval: @@ -1368,14 +1368,15 @@ def safe_eval(env_name, args=None, vecenv=None, policy=None): """Evaluate policy with safe/law-abiding reward conditioning and output metrics.""" args = args or load_config(env_name) + safe_eval_config = args.get("safe_eval", {}) args["vec"] = dict(backend="PufferEnv", num_envs=1) - args["env"]["num_agents"] = 64 + args["env"]["num_agents"] = safe_eval_config.get("num_agents", 64) vecenv = vecenv or load_env(env_name, args) policy = policy or load_policy(args, vecenv, env_name) policy.eval() - num_steps = args.get("safe_eval", {}).get("safe_eval_num_episodes", 300) + num_steps = args.get("safe_eval", {}).get("num_episodes", 300) device = args["train"]["device"] num_agents = vecenv.observation_space.shape[0] use_rnn = args["train"]["use_rnn"] diff --git a/pufferlib/utils.py b/pufferlib/utils.py index ca99c30554..e48249e6ee 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -7,25 +7,26 @@ import configparser import tempfile -# Mapping from safe_eval config keys to (reward_bound_min, reward_bound_max) pairs. -# Used by both generate_safe_eval_ini (underscore form) and the metrics subprocess (hyphen form). 
-SAFE_EVAL_BOUND_KEYS = [ - ("safe_eval_collision", "collision"), - ("safe_eval_offroad", "offroad"), - ("safe_eval_overspeed", "overspeed"), - ("safe_eval_traffic_light", "traffic_light"), - ("safe_eval_reverse", "reverse"), - ("safe_eval_comfort", "comfort"), - ("safe_eval_goal_radius", "goal_radius"), - ("safe_eval_lane_align", "lane_align"), - ("safe_eval_lane_center", "lane_center"), - ("safe_eval_velocity", "velocity"), - ("safe_eval_center_bias", "center_bias"), - ("safe_eval_vel_align", "vel_align"), - ("safe_eval_timestep", "timestep"), - ("safe_eval_throttle", "throttle"), - ("safe_eval_steer", "steer"), - ("safe_eval_acc", "acc"), +# Reward bound names used for safe eval conditioning. +# Config keys in [safe_eval] match these names directly (e.g. collision = -3.0). +# They map to env reward_bound_{name}_min/max. +SAFE_EVAL_REWARD_BOUNDS = [ + "collision", + "offroad", + "overspeed", + "traffic_light", + "reverse", + "comfort", + "goal_radius", + "lane_align", + "lane_center", + "velocity", + "center_bias", + "vel_align", + "timestep", + "throttle", + "steer", + "acc", ] @@ -347,9 +348,9 @@ def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/oce config = configparser.ConfigParser() config.read(base_ini_path) - for safe_key, bound_name in SAFE_EVAL_BOUND_KEYS: - if safe_key in safe_eval_config: - val = str(safe_eval_config[safe_key]) + for bound_name in SAFE_EVAL_REWARD_BOUNDS: + if bound_name in safe_eval_config: + val = str(safe_eval_config[bound_name]) config.set("env", f"reward_bound_{bound_name}_min", val) config.set("env", f"reward_bound_{bound_name}_max", val) @@ -365,20 +366,20 @@ def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/oce def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_config): """Run policy evaluation with safe reward conditioning in a subprocess and log metrics.""" - num_episodes = safe_eval_config.get("safe_eval_num_episodes", 300) + num_episodes = 
safe_eval_config.get("num_episodes", 300) extra_args = [ "--env.reward-randomization", "1", "--env.reward-conditioning", "1", - "--safe-eval.safe-eval-num-episodes", + "--safe-eval.num-episodes", str(num_episodes), ] - for safe_key, bound_name in SAFE_EVAL_BOUND_KEYS: - if safe_key in safe_eval_config: - val = str(safe_eval_config[safe_key]) + for bound_name in SAFE_EVAL_REWARD_BOUNDS: + if bound_name in safe_eval_config: + val = str(safe_eval_config[bound_name]) cli_name = bound_name.replace("_", "-") # Use = syntax to avoid argparse interpreting negative values as flags extra_args.extend([f"--env.reward-bound-{cli_name}-min={val}", f"--env.reward-bound-{cli_name}-max={val}"]) From 1d8f526071f430c185bd849e00598fc2c62d203d Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 07:17:40 -0400 Subject: [PATCH 07/46] Replace SAFE_EVAL_REWARD_BOUNDS list with dynamic config iteration Instead of maintaining a hardcoded list of reward bound names, treat every key in [safe_eval] that isn't a control key (enabled, interval, num_agents, num_episodes) as a reward bound. Adding a new reward bound now only requires adding it to the config file. Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 49 ++++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index e48249e6ee..ad1785362a 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -7,27 +7,10 @@ import configparser import tempfile -# Reward bound names used for safe eval conditioning. -# Config keys in [safe_eval] match these names directly (e.g. collision = -3.0). -# They map to env reward_bound_{name}_min/max. 
-SAFE_EVAL_REWARD_BOUNDS = [ - "collision", - "offroad", - "overspeed", - "traffic_light", - "reverse", - "comfort", - "goal_radius", - "lane_align", - "lane_center", - "velocity", - "center_bias", - "vel_align", - "timestep", - "throttle", - "steer", - "acc", -] +# Control keys in [safe_eval] that are not reward bounds. +# Everything else in the section is treated as a reward bound name +# mapping to env reward_bound_{name}_min/max. +SAFE_EVAL_CONTROL_KEYS = {"enabled", "interval", "num_agents", "num_episodes"} def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_name, wandb_keys=None): @@ -348,11 +331,12 @@ def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/oce config = configparser.ConfigParser() config.read(base_ini_path) - for bound_name in SAFE_EVAL_REWARD_BOUNDS: - if bound_name in safe_eval_config: - val = str(safe_eval_config[bound_name]) - config.set("env", f"reward_bound_{bound_name}_min", val) - config.set("env", f"reward_bound_{bound_name}_max", val) + for key, val in safe_eval_config.items(): + if key in SAFE_EVAL_CONTROL_KEYS: + continue + val = str(val) + config.set("env", f"reward_bound_{key}_min", val) + config.set("env", f"reward_bound_{key}_max", val) config.set("env", "reward_randomization", "1") config.set("env", "reward_conditioning", "1") @@ -377,12 +361,13 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c str(num_episodes), ] - for bound_name in SAFE_EVAL_REWARD_BOUNDS: - if bound_name in safe_eval_config: - val = str(safe_eval_config[bound_name]) - cli_name = bound_name.replace("_", "-") - # Use = syntax to avoid argparse interpreting negative values as flags - extra_args.extend([f"--env.reward-bound-{cli_name}-min={val}", f"--env.reward-bound-{cli_name}-max={val}"]) + for key, val in safe_eval_config.items(): + if key in SAFE_EVAL_CONTROL_KEYS: + continue + val = str(val) + cli_name = key.replace("_", "-") + # Use = syntax to avoid argparse interpreting 
negative values as flags + extra_args.extend([f"--env.reward-bound-{cli_name}-min={val}", f"--env.reward-bound-{cli_name}-max={val}"]) _run_eval_subprocess( config, From d98d553b4279b2f7b268a3cb855f4c9c354ce7ac Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 07:20:39 -0400 Subject: [PATCH 08/46] Discover reward bounds from env config instead of hardcoding Replace SAFE_EVAL_CONTROL_KEYS exclusion list with _get_env_reward_bound_names() that discovers valid bound names by pattern-matching reward_bound_*_min keys in the [env] config section. Safe eval now only passes keys that match known env bounds, so adding a new control key to [safe_eval] can't accidentally become a bound arg. Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index ad1785362a..101225b4f0 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -7,10 +7,19 @@ import configparser import tempfile -# Control keys in [safe_eval] that are not reward bounds. -# Everything else in the section is treated as a reward bound name -# mapping to env reward_bound_{name}_min/max. 
-SAFE_EVAL_CONTROL_KEYS = {"enabled", "interval", "num_agents", "num_episodes"} + +def _get_env_reward_bound_names(ini_path="pufferlib/config/ocean/drive.ini"): + """Discover valid reward bound names from the env config section.""" + import re + + config = configparser.ConfigParser() + config.read(ini_path) + bounds = set() + for key in config["env"]: + m = re.match(r"reward_bound_(.+)_min$", key) + if m: + bounds.add(m.group(1)) + return bounds def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_name, wandb_keys=None): @@ -331,8 +340,9 @@ def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/oce config = configparser.ConfigParser() config.read(base_ini_path) + valid_bounds = _get_env_reward_bound_names(base_ini_path) for key, val in safe_eval_config.items(): - if key in SAFE_EVAL_CONTROL_KEYS: + if key not in valid_bounds: continue val = str(val) config.set("env", f"reward_bound_{key}_min", val) @@ -361,8 +371,9 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c str(num_episodes), ] + valid_bounds = _get_env_reward_bound_names() for key, val in safe_eval_config.items(): - if key in SAFE_EVAL_CONTROL_KEYS: + if key not in valid_bounds: continue val = str(val) cli_name = key.replace("_", "-") From fa599f0c5fd417feb49a4456bda462c2f48ded7e Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 08:34:55 -0400 Subject: [PATCH 09/46] Fix wosac eval: use correct CLI flag and show more stderr on failure Fix --eval.wosac-num-agents (nonexistent) to --eval.wosac-num-rollouts (actual config key). Also increase stderr output from 500 to 1000 chars to show full tracebacks when eval subprocesses fail. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 101225b4f0..08a90c11c9 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -78,7 +78,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n if payload: logger.wandb.log(payload, step=global_step) else: - print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[:500]}") + print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") except subprocess.TimeoutExpired: print(f"{eval_name} evaluation timed out") @@ -122,8 +122,8 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): extra_args=[ "--eval.wosac-realism-eval", "True", - "--eval.wosac-num-agents", - str(eval_config.get("wosac_num_agents", 256)), + "--eval.wosac-num-rollouts", + str(eval_config.get("wosac_num_rollouts", 32)), "--eval.wosac-init-mode", str(eval_config.get("wosac_init_mode", "create_all_valid")), "--eval.wosac-control-mode", From f6f8d07932a80e35a65914cf2de18afc62f53d15 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 09:30:59 -0400 Subject: [PATCH 10/46] Pass episode_length=91 to wosac eval subprocess to match ground truth logs WOSAC ground truth trajectories are 9.1s at 10Hz = 91 steps. Without setting episode_length, the eval subprocess used the default training episode_length, causing shape mismatches between simulated and reference trajectories. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/config/ocean/drive.ini | 2 ++ pufferlib/utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index b526e88b5a..af5ed7e53c 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -187,6 +187,8 @@ wosac_goal_radius = 2.0 wosac_sanity_check = False ; Only return aggregate results across all scenes wosac_aggregate_results = True +; Episode length for WOSAC eval (ground truth logs are 9.1s at 10Hz = 91 steps) +wosac_episode_length = 91 ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) human_replay_eval = False ; Control only the self-driving car diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 08a90c11c9..c83f460353 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -138,6 +138,8 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): str(eval_config.get("wosac_sanity_check", False)), "--eval.wosac-aggregate-results", str(eval_config.get("wosac_aggregate_results", True)), + "--env.episode-length", + str(eval_config.get("wosac_episode_length", 91)), ], marker_name="WOSAC", wandb_keys={ From 1db6914fab3ddb38dd01e95afd589e648efbc566 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 09:41:05 -0400 Subject: [PATCH 11/46] Add eval_async option to run eval subprocesses in background threads Eval subprocesses (wosac, human replay, safe eval metrics) previously blocked the training loop. Now they optionally run in daemon threads, with cleanup on close(). Enabled by default via eval_async=True. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/config/ocean/drive.ini | 2 ++ pufferlib/pufferl.py | 34 ++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index af5ed7e53c..4f12ecb2a3 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -164,6 +164,8 @@ render_map = none [eval] eval_interval = 1000 +; If True, run eval subprocesses (wosac, human replay, safe eval metrics) in background threads +eval_async = True ; Path to dataset used for evaluation map_dir = "resources/drive/binaries/training" ; Evaluation will run on the first num_maps maps in the map_dir directory diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 32a6e877fd..486f2ea6f7 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -136,6 +136,8 @@ def __init__(self, config, vecenv, policy, logger=None): self.render_queue = multiprocessing.Queue() self.render_processes = [] + self.eval_threads = [] + # LSTM if config["use_rnn"]: n = vecenv.agents_per_batch @@ -609,8 +611,9 @@ def train(self): wandb_prefix="eval", ) - pufferlib.utils.run_safe_eval_metrics_in_subprocess( - self.config, self.logger, self.global_step, safe_eval_config + self._run_eval( + pufferlib.utils.run_safe_eval_metrics_in_subprocess, + self.config, self.logger, self.global_step, safe_eval_config, ) except Exception as e: print(f"Failed to run safe eval: {e}") @@ -629,12 +632,30 @@ def train(self): if self.config["eval"]["wosac_realism_eval"] and ( self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training ): - pufferlib.utils.run_wosac_eval_in_subprocess(self.config, self.logger, self.global_step) + self._run_eval( + pufferlib.utils.run_wosac_eval_in_subprocess, + self.config, self.logger, self.global_step, + ) if self.config["eval"]["human_replay_eval"] and ( self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training ): - 
pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step) + self._run_eval( + pufferlib.utils.run_human_replay_eval_in_subprocess, + self.config, self.logger, self.global_step, + ) + + def _run_eval(self, fn, *args, **kwargs): + """Run an eval function, optionally in a background thread.""" + eval_async = self.config.get("eval", {}).get("eval_async", False) + if eval_async: + # Clean up finished threads + self.eval_threads = [t for t in self.eval_threads if t.is_alive()] + t = Thread(target=fn, args=args, kwargs=kwargs, daemon=True) + t.start() + self.eval_threads.append(t) + else: + fn(*args, **kwargs) def check_render_queue(self): """Check if any async render jobs finished and log them.""" @@ -708,6 +729,11 @@ def close(self): self.vecenv.close() self.utilization.stop() + # Wait for any background eval threads to finish + for t in self.eval_threads: + t.join(timeout=660) # slightly longer than subprocess timeout (600s) + self.eval_threads = [] + if self.render_async: # Ensure all render processes are properly terminated before closing the queue if hasattr(self, "render_processes"): for p in self.render_processes: From 3b91abcd64c0923a75cf4aab9120b27a640377cd Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 9 Mar 2026 09:44:49 -0400 Subject: [PATCH 12/46] Fix ruff formatting for eval_async call sites Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 486f2ea6f7..38106b14e6 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -613,7 +613,10 @@ def train(self): self._run_eval( pufferlib.utils.run_safe_eval_metrics_in_subprocess, - self.config, self.logger, self.global_step, safe_eval_config, + self.config, + self.logger, + self.global_step, + safe_eval_config, ) except Exception as e: print(f"Failed to run safe eval: {e}") @@ -634,7 +637,9 @@ def train(self): ): 
self._run_eval( pufferlib.utils.run_wosac_eval_in_subprocess, - self.config, self.logger, self.global_step, + self.config, + self.logger, + self.global_step, ) if self.config["eval"]["human_replay_eval"] and ( @@ -642,7 +647,9 @@ def train(self): ): self._run_eval( pufferlib.utils.run_human_replay_eval_in_subprocess, - self.config, self.logger, self.global_step, + self.config, + self.logger, + self.global_step, ) def _run_eval(self, fn, *args, **kwargs): From 648bd0b8f30f3e6d8abe655a74f109f624a8f9c6 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 10:46:26 -0400 Subject: [PATCH 13/46] Adjust lane alignment value in drive.ini --- pufferlib/config/ocean/drive.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 4f12ecb2a3..c8081b9d59 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -218,7 +218,7 @@ comfort = -0.1 ; Standard driving rewards goal_radius = 2.0 -lane_align = 0.0025 +lane_align = 0.025 lane_center = -0.00075 velocity = 0.005 center_bias = 0.0 From e73380dc2adb44fe5bddafda7fbcfac0ddfe0acd Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 10:48:24 -0400 Subject: [PATCH 14/46] Update drive.ini configuration parameters Reduced the number of episodes from 300 to 100 and added episode length, min and max goal distance parameters. 
--- pufferlib/config/ocean/drive.ini | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index c8081b9d59..b3b8f8e7b0 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -204,7 +204,11 @@ interval = 250 ; Number of agents to run in the eval environment num_agents = 64 ; Number of episodes to collect metrics over -num_episodes = 300 +num_episodes = 100 +; episode length +episode_length = 1000 +min_goal_distance = 0.5 +max_goal_distance = 1000.0 ; Reward conditioning values (min=max to fix the value). ; Names match the env reward_bound_* keys. From 741401b9e8ae68cedd44617030b932456b675be1 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 11:01:25 -0400 Subject: [PATCH 15/46] Fix eval config parameters not being passed to subprocesses - safe_eval: pass episode_length, min_goal_distance, max_goal_distance to both the safe_eval() function (env overrides) and the subprocess (CLI args). Also pass num_agents to subprocess. - wosac/human_replay: pass map_dir and num_maps to subprocesses so CLI overrides are respected. - Add missing human_replay_num_agents to drive.ini config. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/config/ocean/drive.ini | 2 ++ pufferlib/pufferl.py | 6 ++++++ pufferlib/utils.py | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index b3b8f8e7b0..857e87c62f 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -193,6 +193,8 @@ wosac_aggregate_results = True wosac_episode_length = 91 ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) human_replay_eval = False +; Number of agents for human replay evaluation +human_replay_num_agents = 64 ; Control only the self-driving car human_replay_control_mode = "control_sdc_only" diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 38106b14e6..fffdb62ad6 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -1404,6 +1404,12 @@ def safe_eval(env_name, args=None, vecenv=None, policy=None): safe_eval_config = args.get("safe_eval", {}) args["vec"] = dict(backend="PufferEnv", num_envs=1) args["env"]["num_agents"] = safe_eval_config.get("num_agents", 64) + if "episode_length" in safe_eval_config: + args["env"]["episode_length"] = safe_eval_config["episode_length"] + if "min_goal_distance" in safe_eval_config: + args["env"]["min_goal_distance"] = safe_eval_config["min_goal_distance"] + if "max_goal_distance" in safe_eval_config: + args["env"]["max_goal_distance"] = safe_eval_config["max_goal_distance"] vecenv = vecenv or load_env(env_name, args) policy = policy or load_policy(args, vecenv, env_name) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index c83f460353..e288656469 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -102,6 +102,10 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): str(eval_config["human_replay_num_agents"]), "--eval.human-replay-control-mode", str(eval_config["human_replay_control_mode"]), + "--eval.map-dir", + str(eval_config.get("map_dir", 
"resources/drive/binaries/training")), + "--eval.num-maps", + str(eval_config.get("num_maps", 20)), ], marker_name="HUMAN_REPLAY", wandb_keys={ @@ -140,6 +144,10 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): str(eval_config.get("wosac_aggregate_results", True)), "--env.episode-length", str(eval_config.get("wosac_episode_length", 91)), + "--eval.map-dir", + str(eval_config.get("map_dir", "resources/drive/binaries/training")), + "--eval.num-maps", + str(eval_config.get("num_maps", 20)), ], marker_name="WOSAC", wandb_keys={ @@ -371,8 +379,20 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c "1", "--safe-eval.num-episodes", str(num_episodes), + "--safe-eval.num-agents", + str(safe_eval_config.get("num_agents", 64)), ] + # Pass env overrides from safe_eval config + env_overrides = { + "episode_length": "episode-length", + "min_goal_distance": "min-goal-distance", + "max_goal_distance": "max-goal-distance", + } + for config_key, cli_key in env_overrides.items(): + if config_key in safe_eval_config: + extra_args.extend([f"--env.{cli_key}", str(safe_eval_config[config_key])]) + valid_bounds = _get_env_reward_bound_names() for key, val in safe_eval_config.items(): if key not in valid_bounds: From ea1c7af74d8e7ad1fbd20cf3acdfe38cf3d0af52 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 11:35:25 -0400 Subject: [PATCH 16/46] Fix safe eval: count episodes not steps, fix wandb async logging - safe_eval() was using num_episodes as num_steps, running far too few steps to complete any episodes. Now runs until enough episodes are collected, counting by the 'n' field in each info dict. - Fix wandb step conflict: async eval threads logged at stale steps, causing wandb to drop metrics. Use define_metric("eval/*", step_metric="eval_step") so eval metrics have their own step counter. 
- Fix subprocess config passthrough: pass episode_length, min_goal_distance, max_goal_distance as --safe-eval.* args (not --env.*) so safe_eval() correctly applies them from safe_eval_config. Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 14 ++++++++++++-- pufferlib/utils.py | 18 ++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index fffdb62ad6..3cadd99c5b 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -208,6 +208,9 @@ def __init__(self, config, vecenv, policy, logger=None): if self.render_async and hasattr(self.logger, "wandb") and self.logger.wandb: self.logger.wandb.define_metric("render_step", hidden=True) self.logger.wandb.define_metric("render/*", step_metric="render_step") + if hasattr(self.logger, "wandb") and self.logger.wandb: + self.logger.wandb.define_metric("eval_step", hidden=True) + self.logger.wandb.define_metric("eval/*", step_metric="eval_step") # Learning rate scheduler epochs = config["total_timesteps"] // config["batch_size"] @@ -1415,7 +1418,8 @@ def safe_eval(env_name, args=None, vecenv=None, policy=None): policy = policy or load_policy(args, vecenv, env_name) policy.eval() - num_steps = args.get("safe_eval", {}).get("num_episodes", 300) + num_episodes = args.get("safe_eval", {}).get("num_episodes", 300) + episode_length = args["env"].get("episode_length", 300) device = args["train"]["device"] num_agents = vecenv.observation_space.shape[0] use_rnn = args["train"]["use_rnn"] @@ -1431,7 +1435,12 @@ def safe_eval(env_name, args=None, vecenv=None, policy=None): ) all_stats = defaultdict(list) - for _ in range(num_steps): + episodes_collected = 0 + # Run until we collect enough episode completions + max_steps = (num_episodes // max(num_agents, 1) + 2) * episode_length + for step in range(max_steps): + if episodes_collected >= num_episodes: + break with torch.no_grad(): ob_t = torch.as_tensor(ob).to(device) if use_rnn: @@ -1446,6 +1455,7 @@ 
def safe_eval(env_name, args=None, vecenv=None, policy=None): dones = torch.as_tensor(np.maximum(terminals, truncations)).float().to(device) for entry in infos: if isinstance(entry, dict): + episodes_collected += int(entry.get("n", 1)) for k, v in entry.items(): try: float(v) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index e288656469..8cc1262f0b 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -76,7 +76,8 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n else: payload = {f"eval/{k}": v for k, v in metrics.items()} if payload: - logger.wandb.log(payload, step=global_step) + payload["eval_step"] = global_step + logger.wandb.log(payload) else: print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") @@ -383,15 +384,12 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c str(safe_eval_config.get("num_agents", 64)), ] - # Pass env overrides from safe_eval config - env_overrides = { - "episode_length": "episode-length", - "min_goal_distance": "min-goal-distance", - "max_goal_distance": "max-goal-distance", - } - for config_key, cli_key in env_overrides.items(): - if config_key in safe_eval_config: - extra_args.extend([f"--env.{cli_key}", str(safe_eval_config[config_key])]) + # Pass safe_eval overrides that safe_eval() applies to env config + safe_eval_overrides = ["episode_length", "min_goal_distance", "max_goal_distance"] + for key in safe_eval_overrides: + if key in safe_eval_config: + cli_key = key.replace("_", "-") + extra_args.extend([f"--safe-eval.{cli_key}", str(safe_eval_config[key])]) valid_bounds = _get_env_reward_bound_names() for key, val in safe_eval_config.items(): From f6413c29883726aa3d95df73d45f2c61b173e792 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 22:27:11 -0400 Subject: [PATCH 17/46] Fix async render bin file race condition When render_async is enabled, the bin_path_epoch file was deleted in the 
finally block while the async render process was still reading it. Now the async render process passes bin_path through the render queue, and check_render_queue() deletes it after the process finishes. Also drains the queue in close() to clean up any remaining bin files. Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 15 ++++++++++++++- pufferlib/utils.py | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 3cadd99c5b..565a081d71 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -549,6 +549,7 @@ def train(self): bin_path_epoch = f"{model_dir}_epoch_{self.epoch:06d}.bin" shutil.copy2(bin_path, bin_path_epoch) + async_render_owns_bin = False env_cfg = getattr(self.vecenv, "driver_env", None) wandb_log = bool(hasattr(self.logger, "wandb") and self.logger.wandb) wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None @@ -582,6 +583,7 @@ def train(self): ) render_proc.start() self.render_processes.append(render_proc) + async_render_owns_bin = True else: pufferlib.utils.render_videos( self.config, @@ -632,7 +634,8 @@ def train(self): finally: if os.path.exists(bin_path): os.remove(bin_path) - if os.path.exists(bin_path_epoch): + # If async render is using bin_path_epoch, let check_render_queue clean it up + if not async_render_owns_bin and os.path.exists(bin_path_epoch): os.remove(bin_path_epoch) if self.config["eval"]["wosac_realism_eval"] and ( @@ -679,6 +682,14 @@ def check_render_queue(self): videos = result["videos"] prefix = result.get("wandb_prefix", "render") + # Clean up bin file that the async render process was using + result_bin_path = result.get("bin_path") + if result_bin_path and os.path.exists(result_bin_path): + try: + os.remove(result_bin_path) + except OSError: + pass + if hasattr(self.logger, "wandb") and self.logger.wandb: import wandb @@ -745,6 +756,8 @@ def close(self): self.eval_threads = [] if self.render_async: # Ensure all render processes 
are properly terminated before closing the queue + # Drain the queue and clean up any bin files from completed renders + self.check_render_queue() if hasattr(self, "render_processes"): for p in self.render_processes: try: diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 8cc1262f0b..4fe982b3a7 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -325,6 +325,7 @@ def render_videos( "videos": generated_videos, "step": global_step, "wandb_prefix": wandb_prefix, + "bin_path": bin_path, } ) From 0a9277a32f11d87d7e5cccea0d108ab545ac5e1f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 22:52:02 -0400 Subject: [PATCH 18/46] Use lexicographic sort for checkpoints, update cluster config for torch - Replace os.path.getctime with lexicographic max() for finding latest checkpoint (filenames are zero-padded epoch numbers) - Update SLURM account to torch_pr_355_tandon_advanced - Fix container overlay path Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 4 ++-- pufferlib/utils.py | 2 +- scripts/cluster_configs/nyu_greene.yaml | 2 +- scripts/submit_cluster.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index a41f6a410d..cebe9f5df8 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -533,7 +533,7 @@ def train(self): model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) if model_files: - latest_cpt = max(model_files, key=os.path.getctime) + latest_cpt = max(model_files) bin_path = f"{model_dir}.bin" try: @@ -1741,7 +1741,7 @@ def load_policy(args, vecenv, env_name=""): load_path = args["load_model_path"] if load_path == "latest": - load_path = max(glob.glob(f"experiments/{env_name}*.pt"), key=os.path.getctime) + load_path = max(glob.glob(f"experiments/{env_name}*.pt")) if load_path is not None: state_dict = torch.load(load_path, map_location=device) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 7208b252e8..7d06aa1b00 100644 --- 
a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -44,7 +44,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n print(f"No model files found for {eval_name} evaluation") return - latest_cpt = max(model_files, key=os.path.getctime) + latest_cpt = max(model_files) cmd = [ sys.executable, diff --git a/scripts/cluster_configs/nyu_greene.yaml b/scripts/cluster_configs/nyu_greene.yaml index 06dd3a3b46..1841023b5f 100644 --- a/scripts/cluster_configs/nyu_greene.yaml +++ b/scripts/cluster_configs/nyu_greene.yaml @@ -1,5 +1,5 @@ # NYU Greene cluster compute configuration -account: torch_pr_355_general # Set your account/allocation +account: torch_pr_355_tandon_advanced # Set your account/allocation nodes: 1 gpus: 1 cpus: 16 diff --git a/scripts/submit_cluster.py b/scripts/submit_cluster.py index 7c697c0692..53d208394c 100644 --- a/scripts/submit_cluster.py +++ b/scripts/submit_cluster.py @@ -91,7 +91,7 @@ def parse_args(): parser.add_argument( "--container_overlay", type=str, - default="/scratch/ev2237/containers/pufferdrive/overlay.ext3", + default="/scratch/ev2237/containers/pufferdrive-overlay.ext3", help="Singularity overlay path", ) From b3a03dc5e3b8f57f22a5fcda6802e41ad73dbf7f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 23:09:20 -0400 Subject: [PATCH 19/46] Switch cluster account to torch_pr_355_general Co-Authored-By: Claude Opus 4.6 --- scripts/cluster_configs/nyu_greene.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cluster_configs/nyu_greene.yaml b/scripts/cluster_configs/nyu_greene.yaml index 1841023b5f..06dd3a3b46 100644 --- a/scripts/cluster_configs/nyu_greene.yaml +++ b/scripts/cluster_configs/nyu_greene.yaml @@ -1,5 +1,5 @@ # NYU Greene cluster compute configuration -account: torch_pr_355_tandon_advanced # Set your account/allocation +account: torch_pr_355_general # Set your account/allocation nodes: 1 gpus: 1 cpus: 16 From c7563d6c00037fb3c525a12097d038928f5c3428 Mon 
Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 10 Mar 2026 23:10:06 -0400 Subject: [PATCH 20/46] Fix sbatch exclude error: don't pass empty --exclude Co-Authored-By: Claude Opus 4.6 --- scripts/cluster_configs/nyu_greene.yaml | 1 - scripts/submit_cluster.py | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/cluster_configs/nyu_greene.yaml b/scripts/cluster_configs/nyu_greene.yaml index 06dd3a3b46..a82cd2659a 100644 --- a/scripts/cluster_configs/nyu_greene.yaml +++ b/scripts/cluster_configs/nyu_greene.yaml @@ -6,5 +6,4 @@ cpus: 16 mem: 32gb time: 360 # minutes gpu_type: null # rtx8000, a100, v100 (optional, uses partition default) -exclude: "" nodelist: null diff --git a/scripts/submit_cluster.py b/scripts/submit_cluster.py index 53d208394c..597931819c 100644 --- a/scripts/submit_cluster.py +++ b/scripts/submit_cluster.py @@ -223,20 +223,25 @@ def submit(args, job_name: str, command: List[str], save_dir: str, dry: bool): if from_config.get("nodelist") is not None: additional_parameters["nodelist"] = from_config["nodelist"] - executor.update_parameters( + params = dict( slurm_account=from_config.get("account"), slurm_partition=from_config.get("partition"), cpus_per_task=from_config.get("cpus", 8) // args.task_per_node, tasks_per_node=args.task_per_node, nodes=from_config.get("nodes", 1), slurm_gres=gres, - slurm_exclude=from_config.get("exclude", ""), slurm_mem=from_config.get("mem"), slurm_time=from_config.get("time", 60), slurm_job_name=job_name, slurm_additional_parameters=additional_parameters, ) + exclude = from_config.get("exclude", "") + if exclude: + params["slurm_exclude"] = exclude + + executor.update_parameters(**params) + def launch_training(args, from_config, cmd, save_dir, project_root, container_config=None): """Runs inside the SLURM allocation.""" import os From 46d74f6765887f39b63e2e1a8e70f90a56d78150 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 00:09:28 -0400 Subject: [PATCH 21/46] 
Make all evals async by default, fix human replay --eval.num-maps arg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Enable render_async and eval_async by default in drive.ini - Fix human replay subprocess: --eval.num-maps → --env.num-maps - Fix eval_async string "False" being truthy (explicit string-to-bool) - Align WOSAC/human replay trigger to same interval as render/safe eval - Forward env config to eval subprocesses - Add eval documentation to docs/src/evaluation.md - Add TIMING prints for eval component profiling Co-Authored-By: Claude Opus 4.6 --- docs/src/evaluation.md | 77 +++++++++++++++++- pufferlib/config/ocean/drive.ini | 4 +- pufferlib/pufferl.py | 129 ++++++++++++++++--------------- pufferlib/utils.py | 28 ++++++- 4 files changed, 166 insertions(+), 72 deletions(-) diff --git a/docs/src/evaluation.md b/docs/src/evaluation.md index 0b73228aa5..4d5c249456 100644 --- a/docs/src/evaluation.md +++ b/docs/src/evaluation.md @@ -2,7 +2,78 @@ Driving is a safety-critical multi-agent application, making careful evaluation and risk assessment essential. Mistakes in the real world are costly, so simulations are used to catch errors before deployment. To support rapid iteration, evaluations should ideally run efficiently. This is why we also paid attention to optimizing the speed of the evaluations. This page contains an overview of the available benchmarks and evals. -## Sanity maps 🐛 +## Evaluation during training + +PufferDrive supports running evaluations automatically during training. 
There are five evaluation types that can run periodically: + +| Eval type | What it does | CLI flag to enable | Interval flag | +|---|---|---|---| +| **Render** | Records top-down and agent-view videos | `--train.render True` | `--train.render-interval N` | +| **Safe eval render** | Records videos with safe reward conditioning | `--safe-eval.enabled True` | `--safe-eval.interval N` | +| **Safe eval metrics** | Runs policy in subprocess, logs driving metrics | `--safe-eval.enabled True` | `--safe-eval.interval N` | +| **WOSAC realism** | Measures distributional realism (WOSAC benchmark) | `--eval.wosac-realism-eval True` | `--eval.eval-interval N` | +| **Human replay** | Tests policy alongside replayed human trajectories | `--eval.human-replay-eval True` | `--eval.eval-interval N` | + +All eval types trigger at `epoch % interval == 0`. They require a saved checkpoint, so **`checkpoint-interval` must be <= the smallest eval interval**. + +### Example: enable all evals + +```bash +puffer train puffer_drive \ + --wandb --wandb-project pufferdrive \ + --train.checkpoint-interval 250 \ + --train.render True --train.render-interval 250 \ + --safe-eval.enabled True --safe-eval.interval 250 \ + --eval.wosac-realism-eval True \ + --eval.human-replay-eval True \ + --eval.eval-interval 250 +``` + +### Safe eval + +Safe eval measures how well the policy drives when given "safe" reward conditioning values (high penalties for collisions and offroad driving, rewards for lane keeping). It runs in a **separate subprocess** that loads the latest checkpoint, creates a fresh environment, and collects metrics over multiple episodes. + +The safe eval subprocess inherits the training environment configuration (map directory, reward bounds, etc.)
but overrides a few parameters: + +- `num_agents`: Number of agents in the eval environment (default: 64) +- `episode_length`: How long each eval episode runs (default: 1000 steps) +- `num_episodes`: How many episode completions to collect before reporting (default: 100) +- `resample_frequency`: Automatically set to 0 (disabled) so episodes can run to completion + +Metrics logged to wandb under `eval/*`: + +- `eval/score`, `eval/collision_rate`, `eval/offroad_rate` +- `eval/completion_rate`, `eval/dnf_rate` +- `eval/episode_length`, `eval/episode_return` +- `eval/lane_alignment_rate`, `eval/lane_center_rate` +- And more (see `drive.h` `Log` struct for the full list) + +Configure safe eval reward conditioning in `drive.ini` under `[safe_eval]`: + +```ini +[safe_eval] +enabled = True +interval = 250 +num_agents = 64 +num_episodes = 100 +episode_length = 1000 + +; Fixed reward conditioning values (min=max pins the value) +collision = -3.0 +offroad = -3.0 +overspeed = -1.0 +traffic_light = -1.0 +lane_align = 0.025 +velocity = 0.005 +``` + +### Async vs sync evaluation + +By default (`eval_async = True` in `drive.ini`), the WOSAC, human replay, and safe eval metrics subprocesses run in background threads. Set `--eval.eval-async False` to run them synchronously, blocking training until each finishes. + +> **Note:** Video rendering is controlled separately by the `render_async` flag; `eval_async` only affects the eval subprocesses listed above. + +## Sanity maps Quickly test the training on curated, lightweight scenarios without downloading the full dataset. Each sanity map tests a specific behavior. @@ -33,7 +104,7 @@ Available maps: ![Sanity map gallery placeholder](images/maps_screenshot.png) -## Distributional realism benchmark 📊 +## Distributional realism benchmark (WOSAC) We provide a PufferDrive implementation of the Waymo Open Sim Agents Challenge (WOSAC) for fast, easy evaluation of how well your trained agent matches distributional properties of human behavior.
@@ -45,7 +116,7 @@ Add `--load-model-path .pt` to score a trained policy, inste See [the WOSAC benchmark page](wosac.md) for the metric pipeline and all the details. -## Human-compatibility benchmark 🤝 +## Human-compatibility benchmark You may be interested in how compatible your agent is with human partners. For this purpose, we support an eval where your policy only controls the self-driving car (SDC). The rest of the agents in the scene are stepped using the logs. While it is not a perfect eval since the human partners here are static, it will still give you a sense of how closely aligned your agent's behavior is to how people drive. You can run it like this: diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 06300caa67..354cb0e1a7 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -158,7 +158,7 @@ vtrace_rho_clip = 1 checkpoint_interval = 1000 ; Rendering options render = True -render_async = False # Render interval of below 50 might cause process starvation and slowness in training +render_async = True render_interval = 1000 ; If True, show exactly what the agent sees in agent observation obs_only = True @@ -176,7 +176,7 @@ render_map = none [eval] eval_interval = 1000 ; If True, run eval subprocesses (wosac, human replay, safe eval metrics) in background threads -eval_async = True +eval_async = True # Run eval subprocesses (wosac, human replay, safe eval metrics) in background threads ; Path to dataset used for evaluation map_dir = "resources/drive/binaries/training" ; Number of scenarios to process per batch diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index cebe9f5df8..15cb7cc78b 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -537,6 +537,7 @@ def train(self): bin_path = f"{model_dir}.bin" try: + t0 = time.time() export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} export( args=export_args, @@ -546,121 +547,120 @@ def 
train(self): path=bin_path, silent=True, ) + print(f"TIMING: model export: {time.time() - t0:.1f}s") - bin_path_epoch = f"{model_dir}_epoch_{self.epoch:06d}.bin" - shutil.copy2(bin_path, bin_path_epoch) - async_render_owns_bin = False env_cfg = getattr(self.vecenv, "driver_env", None) wandb_log = bool(hasattr(self.logger, "wandb") and self.logger.wandb) wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None + async_bin_paths = [] # track bin copies owned by async processes if should_render: + t1 = time.time() if self.render_async: - # Clean up finished processes self.render_processes = [p for p in self.render_processes if p.is_alive()] - - # Cap the number of processes to num_workers max_processes = self.config.get("num_workers", 1) - if len(self.render_processes) >= max_processes: - print("Waiting for render processes to finish...") while len(self.render_processes) >= max_processes: time.sleep(1) self.render_processes = [p for p in self.render_processes if p.is_alive()] + bin_copy = f"{model_dir}_epoch_{self.epoch:06d}_render.bin" + shutil.copy2(bin_path, bin_copy) + async_bin_paths.append(bin_copy) render_proc = multiprocessing.Process( target=pufferlib.utils.render_videos, args=( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_path_epoch, - self.render_async, - self.render_queue, + self.config, env_cfg, self.logger.run_id, + wandb_log, self.epoch, self.global_step, + bin_copy, True, self.render_queue, ), ) render_proc.start() self.render_processes.append(render_proc) - async_render_owns_bin = True else: pufferlib.utils.render_videos( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_path_epoch, - self.render_async, - wandb_run=wandb_run, + self.config, env_cfg, self.logger.run_id, + wandb_log, self.epoch, self.global_step, + bin_path, False, wandb_run=wandb_run, ) + print(f"TIMING: training render: {time.time() - t1:.1f}s") - # Run safe eval 
using the same bin (reuses the already-exported model) if should_safe_eval: - safe_ini_path = None try: safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) - pufferlib.utils.render_videos( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_path_epoch, - False, - wandb_run=wandb_run, - config_path=safe_ini_path, - wandb_prefix="eval", - ) + t2 = time.time() + if self.render_async: + bin_copy_eval = f"{model_dir}_epoch_{self.epoch:06d}_eval.bin" + shutil.copy2(bin_path, bin_copy_eval) + async_bin_paths.append(bin_copy_eval) + eval_render_proc = multiprocessing.Process( + target=pufferlib.utils.render_videos, + args=( + self.config, env_cfg, self.logger.run_id, + wandb_log, self.epoch, self.global_step, + bin_copy_eval, True, self.render_queue, + ), + kwargs={"config_path": safe_ini_path, "wandb_prefix": "eval"}, + ) + eval_render_proc.start() + self.render_processes.append(eval_render_proc) + else: + pufferlib.utils.render_videos( + self.config, env_cfg, self.logger.run_id, + wandb_log, self.epoch, self.global_step, + bin_path, False, wandb_run=wandb_run, + config_path=safe_ini_path, wandb_prefix="eval", + ) + if os.path.exists(safe_ini_path): + os.remove(safe_ini_path) + print(f"TIMING: safe eval render: {time.time() - t2:.1f}s") + + t3 = time.time() self._run_eval( pufferlib.utils.run_safe_eval_metrics_in_subprocess, - self.config, - self.logger, - self.global_step, + self.config, self.logger, self.global_step, safe_eval_config, ) + print(f"TIMING: safe eval metrics subprocess: {time.time() - t3:.1f}s") except Exception as e: print(f"Failed to run safe eval: {e}") - finally: - if safe_ini_path and os.path.exists(safe_ini_path): - os.remove(safe_ini_path) except Exception as e: print(f"Failed to export model weights: {e}") finally: if os.path.exists(bin_path): os.remove(bin_path) - # If async render is using bin_path_epoch, let check_render_queue clean it up - if not async_render_owns_bin and 
os.path.exists(bin_path_epoch): - os.remove(bin_path_epoch) if self.config["eval"]["wosac_realism_eval"] and ( - (self.epoch - 1) % self.config["eval"]["eval_interval"] == 0 or done_training + self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training ): + t_wosac = time.time() self._run_eval( pufferlib.utils.run_wosac_eval_in_subprocess, self.config, self.logger, self.global_step, ) + print(f"TIMING: wosac eval: {time.time() - t_wosac:.1f}s") if self.config["eval"]["human_replay_eval"] and ( - (self.epoch - 1) % self.config["eval"]["eval_interval"] == 0 or done_training + self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training ): + t_human = time.time() self._run_eval( pufferlib.utils.run_human_replay_eval_in_subprocess, self.config, self.logger, self.global_step, ) + print(f"TIMING: human replay eval: {time.time() - t_human:.1f}s") def _run_eval(self, fn, *args, **kwargs): """Run an eval function, optionally in a background thread.""" eval_async = self.config.get("eval", {}).get("eval_async", False) + # Handle string "False"/"True" from INI config + if isinstance(eval_async, str): + eval_async = eval_async.lower() not in ("false", "0", "no", "") if eval_async: # Clean up finished threads self.eval_threads = [t for t in self.eval_threads if t.is_alive()] @@ -682,13 +682,14 @@ def check_render_queue(self): videos = result["videos"] prefix = result.get("wandb_prefix", "render") - # Clean up bin file that the async render process was using - result_bin_path = result.get("bin_path") - if result_bin_path and os.path.exists(result_bin_path): - try: - os.remove(result_bin_path) - except OSError: - pass + # Clean up files that the async render process was using + for cleanup_key in ("bin_path", "config_path"): + cleanup_path = result.get(cleanup_key) + if cleanup_path and os.path.exists(cleanup_path): + try: + os.remove(cleanup_path) + except OSError: + pass if hasattr(self.logger, "wandb") and self.logger.wandb: import wandb @@ -1229,7 
+1230,7 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): else: logger = None - train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), safe_eval=args.get("safe_eval", {})) + train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), safe_eval=args.get("safe_eval", {}), env_config=args.get("env", {})) if "vec" in args and "num_workers" in args["vec"]: train_config["num_workers"] = args["vec"]["num_workers"] pufferl = PuffeRL(train_config, vecenv, policy, logger) @@ -1425,6 +1426,9 @@ def safe_eval(env_name, args=None, vecenv=None, policy=None): args["env"]["min_goal_distance"] = safe_eval_config["min_goal_distance"] if "max_goal_distance" in safe_eval_config: args["env"]["max_goal_distance"] = safe_eval_config["max_goal_distance"] + # Disable map resampling during eval — episodes must complete to generate metrics. + # resample_frequency < episode_length would destroy envs before episodes finish. + args["env"]["resample_frequency"] = 0 vecenv = vecenv or load_env(env_name, args) policy = policy or load_policy(args, vecenv, env_name) @@ -1474,7 +1478,6 @@ def safe_eval(env_name, args=None, vecenv=None, policy=None): all_stats[k].append(v) except (TypeError, ValueError): pass - metrics = {k: float(np.mean(v)) for k, v in all_stats.items() if len(v) > 0} print("SAFE_EVAL_METRICS_START") @@ -1681,7 +1684,7 @@ def ensure_drive_binary(): try: result = subprocess.run( - ["bash", "scripts/build_ocean.sh", "visualize", "local"], capture_output=True, text=True, timeout=300 + ["bash", "scripts/build_ocean.sh", "visualize", "fast"], capture_output=True, text=True, timeout=300 ) if result.returncode != 0: diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 7d06aa1b00..d148dd3e3f 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -8,6 +8,13 @@ import tempfile +def _normalize_device(device): + """Convert device to a string suitable for torch.load(map_location=...).""" + if isinstance(device, int): + 
return f"cuda:{device}" + return str(device) + + def _get_env_reward_bound_names(ini_path="pufferlib/config/ocean/drive.ini"): """Discover valid reward bound names from the env config section.""" import re @@ -55,8 +62,17 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n "--load-model-path", latest_cpt, "--train.device", - config.get("device", "cuda"), - ] + extra_args + _normalize_device(config.get("device", "cuda")), + ] + + # Forward the training env config so the subprocess inherits it + # Use = syntax to avoid argparse interpreting negative values as flags + env_config = config.get("env_config", {}) + for key, val in env_config.items(): + cli_key = key.replace("_", "-") + cmd.append(f"--env.{cli_key}={val}") + + cmd += extra_args result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) @@ -65,7 +81,8 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n if result.returncode == 0: stdout = result.stdout - if start_marker in stdout and end_marker in stdout: + has_markers = start_marker in stdout and end_marker in stdout + if has_markers: start = stdout.find(start_marker) + len(start_marker) end = stdout.find(end_marker) metrics = json.loads(stdout[start:end].strip()) @@ -84,7 +101,9 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n except subprocess.TimeoutExpired: print(f"{eval_name} evaluation timed out") except Exception as e: + import traceback print(f"Failed to run {eval_name} evaluation: {e}") + traceback.print_exc() def run_human_replay_eval_in_subprocess(config, logger, global_step): @@ -105,7 +124,7 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): str(eval_config["human_replay_control_mode"]), "--eval.map-dir", str(eval_config.get("map_dir", "resources/drive/binaries/training")), - "--eval.num-maps", + "--env.num-maps", str(eval_config.get("num_maps", 20)), ], marker_name="HUMAN_REPLAY", @@ -334,6 
+353,7 @@ def render_videos( "step": global_step, "wandb_prefix": wandb_prefix, "bin_path": bin_path, + "config_path": config_path, } ) From 5781223e6f487ec57b3ec7615ed3291f452c86e2 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 08:41:12 -0400 Subject: [PATCH 22/46] Fix async render cleanup, remove env_config forwarding, remove TIMING prints - close() now waits for render processes (120s timeout) before killing, so async renders can finish and upload videos to wandb - Track temp files per render process; clean up when process dies - Remove broken env_config forwarding from _run_eval_subprocess (key "env_config" didn't exist; WOSAC/human replay should use waymo defaults, not training env config) - Fix wandb step consistency: async renders now use step= parameter - Remove all TIMING debug prints Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 142 ++++++++++++++++++++++++++++--------------- pufferlib/utils.py | 8 +-- 2 files changed, 93 insertions(+), 57 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 15cb7cc78b..e3df72eb67 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -135,6 +135,7 @@ def __init__(self, config, vecenv, policy, logger=None): if self.render_async: self.render_queue = multiprocessing.Queue() self.render_processes = [] + self._render_proc_temp_files = {} # pid -> [temp_file_paths] self.eval_threads = [] @@ -537,7 +538,6 @@ def train(self): bin_path = f"{model_dir}.bin" try: - t0 = time.time() export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} export( args=export_args, @@ -547,81 +547,100 @@ def train(self): path=bin_path, silent=True, ) - print(f"TIMING: model export: {time.time() - t0:.1f}s") env_cfg = getattr(self.vecenv, "driver_env", None) wandb_log = bool(hasattr(self.logger, "wandb") and self.logger.wandb) wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None - async_bin_paths = [] # track bin copies owned by 
async processes if should_render: - t1 = time.time() if self.render_async: - self.render_processes = [p for p in self.render_processes if p.is_alive()] + self._cleanup_dead_render_processes() max_processes = self.config.get("num_workers", 1) while len(self.render_processes) >= max_processes: time.sleep(1) - self.render_processes = [p for p in self.render_processes if p.is_alive()] + self._cleanup_dead_render_processes() bin_copy = f"{model_dir}_epoch_{self.epoch:06d}_render.bin" shutil.copy2(bin_path, bin_copy) - async_bin_paths.append(bin_copy) render_proc = multiprocessing.Process( target=pufferlib.utils.render_videos, args=( - self.config, env_cfg, self.logger.run_id, - wandb_log, self.epoch, self.global_step, - bin_copy, True, self.render_queue, + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_copy, + True, + self.render_queue, ), ) render_proc.start() + self._render_proc_temp_files[render_proc.pid] = [bin_copy] self.render_processes.append(render_proc) else: pufferlib.utils.render_videos( - self.config, env_cfg, self.logger.run_id, - wandb_log, self.epoch, self.global_step, - bin_path, False, wandb_run=wandb_run, + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_path, + False, + wandb_run=wandb_run, ) - print(f"TIMING: training render: {time.time() - t1:.1f}s") if should_safe_eval: try: safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) - t2 = time.time() if self.render_async: bin_copy_eval = f"{model_dir}_epoch_{self.epoch:06d}_eval.bin" shutil.copy2(bin_path, bin_copy_eval) - async_bin_paths.append(bin_copy_eval) eval_render_proc = multiprocessing.Process( target=pufferlib.utils.render_videos, args=( - self.config, env_cfg, self.logger.run_id, - wandb_log, self.epoch, self.global_step, - bin_copy_eval, True, self.render_queue, + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + 
bin_copy_eval, + True, + self.render_queue, ), kwargs={"config_path": safe_ini_path, "wandb_prefix": "eval"}, ) eval_render_proc.start() + self._render_proc_temp_files[eval_render_proc.pid] = [bin_copy_eval, safe_ini_path] self.render_processes.append(eval_render_proc) else: pufferlib.utils.render_videos( - self.config, env_cfg, self.logger.run_id, - wandb_log, self.epoch, self.global_step, - bin_path, False, wandb_run=wandb_run, - config_path=safe_ini_path, wandb_prefix="eval", + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_path, + False, + wandb_run=wandb_run, + config_path=safe_ini_path, + wandb_prefix="eval", ) if os.path.exists(safe_ini_path): os.remove(safe_ini_path) - print(f"TIMING: safe eval render: {time.time() - t2:.1f}s") - t3 = time.time() self._run_eval( pufferlib.utils.run_safe_eval_metrics_in_subprocess, - self.config, self.logger, self.global_step, + self.config, + self.logger, + self.global_step, safe_eval_config, ) - print(f"TIMING: safe eval metrics subprocess: {time.time() - t3:.1f}s") except Exception as e: print(f"Failed to run safe eval: {e}") @@ -634,26 +653,22 @@ def train(self): if self.config["eval"]["wosac_realism_eval"] and ( self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training ): - t_wosac = time.time() self._run_eval( pufferlib.utils.run_wosac_eval_in_subprocess, self.config, self.logger, self.global_step, ) - print(f"TIMING: wosac eval: {time.time() - t_wosac:.1f}s") if self.config["eval"]["human_replay_eval"] and ( self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training ): - t_human = time.time() self._run_eval( pufferlib.utils.run_human_replay_eval_in_subprocess, self.config, self.logger, self.global_step, ) - print(f"TIMING: human replay eval: {time.time() - t_human:.1f}s") def _run_eval(self, fn, *args, **kwargs): """Run an eval function, optionally in a background thread.""" @@ -670,11 +685,31 @@ def _run_eval(self, fn, *args, **kwargs): 
else: fn(*args, **kwargs) + def _cleanup_dead_render_processes(self): + """Remove dead render processes and clean up their temp files.""" + if not hasattr(self, "render_processes"): + return + alive = [] + for p in self.render_processes: + if p.is_alive(): + alive.append(p) + else: + # Process finished (or crashed) — clean up its temp files + for f in self._render_proc_temp_files.pop(p.pid, []): + if os.path.exists(f): + try: + os.remove(f) + except OSError: + pass + self.render_processes = alive + def check_render_queue(self): """Check if any async render jobs finished and log them.""" if not self.render_async or not hasattr(self, "render_queue"): return + self._cleanup_dead_render_processes() + try: while not self.render_queue.empty(): result = self.render_queue.get_nowait() @@ -682,15 +717,6 @@ def check_render_queue(self): videos = result["videos"] prefix = result.get("wandb_prefix", "render") - # Clean up files that the async render process was using - for cleanup_key in ("bin_path", "config_path"): - cleanup_path = result.get(cleanup_key) - if cleanup_path and os.path.exists(cleanup_path): - try: - os.remove(cleanup_path) - except OSError: - pass - if hasattr(self.logger, "wandb") and self.logger.wandb: import wandb @@ -703,8 +729,7 @@ def check_render_queue(self): payload[f"{prefix}/agent_view"] = [wandb.Video(p, format="mp4") for p in videos["output_agent"]] if payload: - payload["render_step"] = step - self.logger.wandb.log(payload) + self.logger.wandb.log(payload, step=step) except queue.Empty: pass @@ -756,22 +781,33 @@ def close(self): t.join(timeout=660) # slightly longer than subprocess timeout (600s) self.eval_threads = [] - if self.render_async: # Ensure all render processes are properly terminated before closing the queue - # Drain the queue and clean up any bin files from completed renders - self.check_render_queue() + if self.render_async: + # Wait for in-flight render processes to finish so their results + # reach the queue and get uploaded to 
wandb. if hasattr(self, "render_processes"): for p in self.render_processes: try: + p.join(timeout=120) if p.is_alive(): + print(f"Render process {p.pid} did not finish in time, terminating") p.terminate() p.join(timeout=5) if p.is_alive(): p.kill() except Exception: - # Best-effort cleanup; avoid letting close() crash on process errors - print(f"Failed to terminate render process {p.pid}") - # Optionally clear the list to drop references to finished processes - self.render_processes = [] + print(f"Failed to clean up render process {p.pid}") + # Drain the queue — all finished processes have put their results + self.check_render_queue() + # Clean up any remaining temp files (from crashed processes) + for pid, files in self._render_proc_temp_files.items(): + for f in files: + if os.path.exists(f): + try: + os.remove(f) + except OSError: + pass + self._render_proc_temp_files.clear() + self.render_processes = [] if hasattr(self, "render_queue"): self.render_queue.close() self.render_queue.join_thread() @@ -1230,7 +1266,13 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): else: logger = None - train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), safe_eval=args.get("safe_eval", {}), env_config=args.get("env", {})) + train_config = dict( + **args["train"], + env=env_name, + eval=args.get("eval", {}), + safe_eval=args.get("safe_eval", {}), + env_config=args.get("env", {}), + ) if "vec" in args and "num_workers" in args["vec"]: train_config["num_workers"] = args["vec"]["num_workers"] pufferl = PuffeRL(train_config, vecenv, policy, logger) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index d148dd3e3f..bbd5e0e49c 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -65,13 +65,6 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n _normalize_device(config.get("device", "cuda")), ] - # Forward the training env config so the subprocess inherits it - # Use = syntax to avoid argparse 
interpreting negative values as flags - env_config = config.get("env_config", {}) - for key, val in env_config.items(): - cli_key = key.replace("_", "-") - cmd.append(f"--env.{cli_key}={val}") - cmd += extra_args result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) @@ -102,6 +95,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n print(f"{eval_name} evaluation timed out") except Exception as e: import traceback + print(f"Failed to run {eval_name} evaluation: {e}") traceback.print_exc() From 77be25e8bfc12196879767e2da320df196a4d329 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 08:57:01 -0400 Subject: [PATCH 23/46] Forward training map_dir and num_maps to safe eval subprocess The safe eval subprocess loads config from the default INI, which has map_dir=resources/drive/binaries/carla_2D. When training overrides map_dir via CLI (e.g. to resources/drive/binaries/training), the subprocess needs that forwarded explicitly. Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index bbd5e0e49c..d8b3657aaa 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -396,6 +396,9 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c """Run policy evaluation with safe reward conditioning in a subprocess and log metrics.""" num_episodes = safe_eval_config.get("num_episodes", 300) + # Forward training env's map_dir and num_maps so the subprocess uses the + # same maps as training (the default INI may point elsewhere). 
+ env_config = config.get("env", {}) extra_args = [ "--env.reward-randomization", "1", @@ -405,6 +408,8 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c str(num_episodes), "--safe-eval.num-agents", str(safe_eval_config.get("num_agents", 64)), + f"--env.map-dir={env_config.get('map_dir', 'resources/drive/binaries/training')}", + f"--env.num-maps={env_config.get('num_maps', 100)}", ] # Pass safe_eval overrides that safe_eval() applies to env config From 93ce1170291bf25044f2998fc43298d91bf461d9 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 08:59:39 -0400 Subject: [PATCH 24/46] Add human replay video rendering, forward map_dir to safe eval - Render a video with human replay settings (control_sdc_only, create_all_valid, waymo maps) alongside human replay metrics - generate_human_replay_ini() creates temp INI for the visualize binary - Restructure eval block: all render-based evals share the bin export - Videos logged to wandb under human_replay/ prefix Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 65 +++++++++++++++++++++++++++++++++++++++----- pufferlib/utils.py | 23 ++++++++++++++++ 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index e3df72eb67..2dbfe8b46e 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -528,8 +528,16 @@ def train(self): run_safe_eval = safe_eval_config.get("enabled", False) safe_eval_interval = safe_eval_config.get("interval", self.render_interval) should_safe_eval = run_safe_eval and self.epoch % safe_eval_interval == 0 + eval_interval = self.config["eval"]["eval_interval"] + should_wosac = self.config["eval"]["wosac_realism_eval"] and (self.epoch % eval_interval == 0 or done_training) + should_human_replay = self.config["eval"]["human_replay_eval"] and ( + self.epoch % eval_interval == 0 or done_training + ) + + # Any render-based eval needs a .bin export of the current policy + needs_bin = 
should_render or should_safe_eval or should_human_replay - if should_render or should_safe_eval: + if needs_bin: model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) @@ -644,15 +652,60 @@ def train(self): except Exception as e: print(f"Failed to run safe eval: {e}") + if should_human_replay: + try: + eval_config = self.config["eval"] + hr_ini_path = pufferlib.utils.generate_human_replay_ini(eval_config) + + if self.render_async: + bin_copy_hr = f"{model_dir}_epoch_{self.epoch:06d}_human_replay.bin" + shutil.copy2(bin_path, bin_copy_hr) + hr_render_proc = multiprocessing.Process( + target=pufferlib.utils.render_videos, + args=( + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_copy_hr, + True, + self.render_queue, + ), + kwargs={"config_path": hr_ini_path, "wandb_prefix": "human_replay"}, + ) + hr_render_proc.start() + self._render_proc_temp_files[hr_render_proc.pid] = [bin_copy_hr, hr_ini_path] + self.render_processes.append(hr_render_proc) + else: + pufferlib.utils.render_videos( + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_path, + False, + wandb_run=wandb_run, + config_path=hr_ini_path, + wandb_prefix="human_replay", + ) + if os.path.exists(hr_ini_path): + os.remove(hr_ini_path) + except Exception as e: + print(f"Failed to run human replay render: {e}") + except Exception as e: print(f"Failed to export model weights: {e}") finally: if os.path.exists(bin_path): os.remove(bin_path) - if self.config["eval"]["wosac_realism_eval"] and ( - self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training - ): + # WOSAC and human replay metric subprocesses (don't need bin_path, + # they load from checkpoint via _run_eval_subprocess) + if should_wosac: self._run_eval( pufferlib.utils.run_wosac_eval_in_subprocess, self.config, @@ -660,9 +713,7 @@ def 
train(self): self.global_step, ) - if self.config["eval"]["human_replay_eval"] and ( - self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training - ): + if should_human_replay: self._run_eval( pufferlib.utils.run_human_replay_eval_in_subprocess, self.config, diff --git a/pufferlib/utils.py b/pufferlib/utils.py index d8b3657aaa..63a0a57219 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -392,6 +392,29 @@ def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/oce return tmp_path +def generate_human_replay_ini(eval_config, base_ini_path="pufferlib/config/ocean/drive.ini"): + """Generate a temporary ini file for human replay rendering. + + Sets control_mode to control_sdc_only so only the SDC is policy-controlled, + with all other agents replaying logged trajectories. + """ + config = configparser.ConfigParser() + config.read(base_ini_path) + + config.set("env", "control_mode", '"control_sdc_only"') + config.set("env", "init_mode", '"create_all_valid"') + config.set("env", "init_steps", "10") + # Use eval map_dir (waymo maps), not training map_dir + map_dir = eval_config.get("map_dir", "resources/drive/binaries/training") + config.set("env", "map_dir", f'"{map_dir}"') + + fd, tmp_path = tempfile.mkstemp(suffix=".ini", prefix="human_replay_") + with os.fdopen(fd, "w") as f: + config.write(f) + + return tmp_path + + def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_config): """Run policy evaluation with safe reward conditioning in a subprocess and log metrics.""" num_episodes = safe_eval_config.get("num_episodes", 300) From d9328e602137aea8f49bfa6e47672b4659023bec Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 09:02:26 -0400 Subject: [PATCH 25/46] Extract _dispatch_render to eliminate triplicated render code - All three render types (training, safe eval, human replay) now use the same _dispatch_render() helper for async/sync dispatch - Fixes bug: async throttling was only 
applied to training render, not safe eval or human replay renders - Fix inconsistent config["eval"] vs config.get("eval", {}) access Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 195 +++++++++++++++++-------------------------- pufferlib/utils.py | 2 +- 2 files changed, 79 insertions(+), 118 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 2dbfe8b46e..f6e6d84be6 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -561,87 +561,21 @@ def train(self): wandb_run = self.logger.wandb if hasattr(self.logger, "wandb") else None if should_render: - if self.render_async: - self._cleanup_dead_render_processes() - max_processes = self.config.get("num_workers", 1) - while len(self.render_processes) >= max_processes: - time.sleep(1) - self._cleanup_dead_render_processes() - - bin_copy = f"{model_dir}_epoch_{self.epoch:06d}_render.bin" - shutil.copy2(bin_path, bin_copy) - render_proc = multiprocessing.Process( - target=pufferlib.utils.render_videos, - args=( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_copy, - True, - self.render_queue, - ), - ) - render_proc.start() - self._render_proc_temp_files[render_proc.pid] = [bin_copy] - self.render_processes.append(render_proc) - else: - pufferlib.utils.render_videos( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_path, - False, - wandb_run=wandb_run, - ) + self._dispatch_render(model_dir, bin_path, env_cfg, wandb_log, wandb_run, "render") if should_safe_eval: try: safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) - - if self.render_async: - bin_copy_eval = f"{model_dir}_epoch_{self.epoch:06d}_eval.bin" - shutil.copy2(bin_path, bin_copy_eval) - eval_render_proc = multiprocessing.Process( - target=pufferlib.utils.render_videos, - args=( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_copy_eval, - True, - 
self.render_queue, - ), - kwargs={"config_path": safe_ini_path, "wandb_prefix": "eval"}, - ) - eval_render_proc.start() - self._render_proc_temp_files[eval_render_proc.pid] = [bin_copy_eval, safe_ini_path] - self.render_processes.append(eval_render_proc) - else: - pufferlib.utils.render_videos( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_path, - False, - wandb_run=wandb_run, - config_path=safe_ini_path, - wandb_prefix="eval", - ) - if os.path.exists(safe_ini_path): - os.remove(safe_ini_path) - + self._dispatch_render( + model_dir, + bin_path, + env_cfg, + wandb_log, + wandb_run, + "eval", + config_path=safe_ini_path, + wandb_prefix="eval", + ) self._run_eval( pufferlib.utils.run_safe_eval_metrics_in_subprocess, self.config, @@ -654,46 +588,17 @@ def train(self): if should_human_replay: try: - eval_config = self.config["eval"] - hr_ini_path = pufferlib.utils.generate_human_replay_ini(eval_config) - - if self.render_async: - bin_copy_hr = f"{model_dir}_epoch_{self.epoch:06d}_human_replay.bin" - shutil.copy2(bin_path, bin_copy_hr) - hr_render_proc = multiprocessing.Process( - target=pufferlib.utils.render_videos, - args=( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_copy_hr, - True, - self.render_queue, - ), - kwargs={"config_path": hr_ini_path, "wandb_prefix": "human_replay"}, - ) - hr_render_proc.start() - self._render_proc_temp_files[hr_render_proc.pid] = [bin_copy_hr, hr_ini_path] - self.render_processes.append(hr_render_proc) - else: - pufferlib.utils.render_videos( - self.config, - env_cfg, - self.logger.run_id, - wandb_log, - self.epoch, - self.global_step, - bin_path, - False, - wandb_run=wandb_run, - config_path=hr_ini_path, - wandb_prefix="human_replay", - ) - if os.path.exists(hr_ini_path): - os.remove(hr_ini_path) + hr_ini_path = pufferlib.utils.generate_human_replay_ini(self.config["eval"]) + self._dispatch_render( + model_dir, + bin_path, + 
env_cfg, + wandb_log, + wandb_run, + "human_replay", + config_path=hr_ini_path, + wandb_prefix="human_replay", + ) except Exception as e: print(f"Failed to run human replay render: {e}") @@ -721,6 +626,62 @@ def train(self): self.global_step, ) + def _dispatch_render( + self, model_dir, bin_path, env_cfg, wandb_log, wandb_run, suffix, config_path=None, wandb_prefix=None + ): + """Dispatch a render_videos call, either async (multiprocessing) or sync.""" + extra_kwargs = {} + if config_path is not None: + extra_kwargs["config_path"] = config_path + if wandb_prefix is not None: + extra_kwargs["wandb_prefix"] = wandb_prefix + + if self.render_async: + self._cleanup_dead_render_processes() + max_processes = self.config.get("num_workers", 1) + while len(self.render_processes) >= max_processes: + time.sleep(1) + self._cleanup_dead_render_processes() + + bin_copy = f"{model_dir}_epoch_{self.epoch:06d}_{suffix}.bin" + shutil.copy2(bin_path, bin_copy) + proc = multiprocessing.Process( + target=pufferlib.utils.render_videos, + args=( + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_copy, + True, + self.render_queue, + ), + kwargs=extra_kwargs, + ) + proc.start() + temp_files = [bin_copy] + if config_path: + temp_files.append(config_path) + self._render_proc_temp_files[proc.pid] = temp_files + self.render_processes.append(proc) + else: + pufferlib.utils.render_videos( + self.config, + env_cfg, + self.logger.run_id, + wandb_log, + self.epoch, + self.global_step, + bin_path, + False, + wandb_run=wandb_run, + **extra_kwargs, + ) + if config_path and os.path.exists(config_path): + os.remove(config_path) + def _run_eval(self, fn, *args, **kwargs): """Run an eval function, optionally in a background thread.""" eval_async = self.config.get("eval", {}).get("eval_async", False) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 63a0a57219..19462506d2 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -101,7 +101,7 @@ 
def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n def run_human_replay_eval_in_subprocess(config, logger, global_step): - eval_config = config["eval"] + eval_config = config.get("eval", {}) _run_eval_subprocess( config, logger, From 58961db7c2b306929ca324e871e4619cf179ac3a Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 10:02:14 -0400 Subject: [PATCH 26/46] Organize wandb metrics into separate tabs per eval type - render/* for training renders - eval/* for safe eval (videos + metrics) - human_replay/* for human replay (videos + metrics) - wosac/* for WOSAC realism metrics - Use step=global_step consistently instead of custom step metrics Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 7 +------ pufferlib/utils.py | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index f6e6d84be6..2d48b76ded 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -206,12 +206,7 @@ def __init__(self, config, vecenv, policy, logger=None): self.logger = logger if logger is None: self.logger = NoLogger(config) - if self.render_async and hasattr(self.logger, "wandb") and self.logger.wandb: - self.logger.wandb.define_metric("render_step", hidden=True) - self.logger.wandb.define_metric("render/*", step_metric="render_step") - if hasattr(self.logger, "wandb") and self.logger.wandb: - self.logger.wandb.define_metric("eval_step", hidden=True) - self.logger.wandb.define_metric("eval/*", step_metric="eval_step") + # No custom step_metric needed — all eval types log with step=global_step # Learning rate scheduler epochs = config["total_timesteps"] // config["batch_size"] diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 19462506d2..72f9910df1 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -86,8 +86,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n else: payload = {f"eval/{k}": v 
for k, v in metrics.items()} if payload: - payload["eval_step"] = global_step - logger.wandb.log(payload) + logger.wandb.log(payload, step=global_step) else: print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") @@ -123,9 +122,9 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): ], marker_name="HUMAN_REPLAY", wandb_keys={ - "collision_rate": "eval/human_replay_collision_rate", - "offroad_rate": "eval/human_replay_offroad_rate", - "completion_rate": "eval/human_replay_completion_rate", + "collision_rate": "human_replay/collision_rate", + "offroad_rate": "human_replay/offroad_rate", + "completion_rate": "human_replay/completion_rate", }, ) @@ -169,14 +168,14 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): ], marker_name="WOSAC", wandb_keys={ - "realism_meta_score": "eval/wosac_realism_meta_score", - "realism_meta_score_std": "eval/wosac_realism_meta_score_std", - "kinematic_metrics": "eval/wosac_kinematic_metrics", - "interactive_metrics": "eval/wosac_interactive_metrics", - "map_based_metrics": "eval/wosac_map_based_metrics", - "ade": "eval/wosac_ade", - "min_ade": "eval/wosac_min_ade", - "total_num_agents": "eval/wosac_total_num_agents", + "realism_meta_score": "wosac/realism_meta_score", + "realism_meta_score_std": "wosac/realism_meta_score_std", + "kinematic_metrics": "wosac/kinematic_metrics", + "interactive_metrics": "wosac/interactive_metrics", + "map_based_metrics": "wosac/map_based_metrics", + "ade": "wosac/ade", + "min_ade": "wosac/min_ade", + "total_num_agents": "wosac/total_num_agents", }, ) From a5788008aa04f13763507a996e5d403e6014eb4e Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 14:55:57 -0400 Subject: [PATCH 27/46] Fix safe eval env_config key, fix wandb non-monotonic step errors - config["env"] is the env name string, not the env dict. The env config dict is at config["env_config"]. Fixes safe eval crash. 
- Async evals finish after training moves past their step, so wandb rejects step= parameter. Log train_step as data field instead. Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 3 ++- pufferlib/utils.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 2d48b76ded..c364b68267 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -736,7 +736,8 @@ def check_render_queue(self): payload[f"{prefix}/agent_view"] = [wandb.Video(p, format="mp4") for p in videos["output_agent"]] if payload: - self.logger.wandb.log(payload, step=step) + payload["train_step"] = step + self.logger.wandb.log(payload) except queue.Empty: pass diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 72f9910df1..46299d9946 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -86,7 +86,11 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n else: payload = {f"eval/{k}": v for k, v in metrics.items()} if payload: - logger.wandb.log(payload, step=global_step) + # Don't pass step= here — async evals finish after + # training has moved past this step, and wandb rejects + # non-monotonic steps. Include step as a data field instead. + payload["train_step"] = global_step + logger.wandb.log(payload) else: print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") @@ -420,7 +424,7 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c # Forward training env's map_dir and num_maps so the subprocess uses the # same maps as training (the default INI may point elsewhere). 
- env_config = config.get("env", {}) + env_config = config.get("env_config", {}) extra_args = [ "--env.reward-randomization", "1", From e2d758d468a33d30dd22838bc2b506a511c6811f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 18:48:04 -0400 Subject: [PATCH 28/46] Add thread-safe wandb logging, restrict evals to rank 0 - Add threading.Lock to WandbLogger, route all async eval logging through log_async() to prevent concurrent wandb.log() calls - Guard all eval types (render, safe eval, WOSAC, human replay) with is_rank0 check so they only run on the primary process in distributed training Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 29 ++++++++++++++++++++++------- pufferlib/utils.py | 8 ++++---- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index c364b68267..b6697f245f 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -19,6 +19,7 @@ import importlib import json import configparser +import threading from threading import Thread from collections import defaultdict, deque from pathlib import Path @@ -516,17 +517,24 @@ def train(self): self.save_checkpoint() self.msg = f"Checkpoint saved at update {self.epoch}" + # Evals only run on rank 0 in distributed training + is_rank0 = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 + # Render and safe eval run on their own intervals, independent of checkpointing. # They use the latest available checkpoint, so they don't need a fresh one. 
- should_render = self.render and self.epoch % self.render_interval == 0 + should_render = is_rank0 and self.render and self.epoch % self.render_interval == 0 safe_eval_config = self.config.get("safe_eval", {}) run_safe_eval = safe_eval_config.get("enabled", False) safe_eval_interval = safe_eval_config.get("interval", self.render_interval) - should_safe_eval = run_safe_eval and self.epoch % safe_eval_interval == 0 + should_safe_eval = is_rank0 and run_safe_eval and self.epoch % safe_eval_interval == 0 eval_interval = self.config["eval"]["eval_interval"] - should_wosac = self.config["eval"]["wosac_realism_eval"] and (self.epoch % eval_interval == 0 or done_training) - should_human_replay = self.config["eval"]["human_replay_eval"] and ( - self.epoch % eval_interval == 0 or done_training + should_wosac = ( + is_rank0 + and self.config["eval"]["wosac_realism_eval"] + and (self.epoch % eval_interval == 0 or done_training) + ) + should_human_replay = ( + is_rank0 and self.config["eval"]["human_replay_eval"] and (self.epoch % eval_interval == 0 or done_training) ) # Any render-based eval needs a .bin export of the current policy @@ -737,7 +745,7 @@ def check_render_queue(self): if payload: payload["train_step"] = step - self.logger.wandb.log(payload) + self.logger.log_async(payload) except queue.Empty: pass @@ -1218,9 +1226,16 @@ def __init__(self, args, load_id=None, resume="allow"): ) self.wandb = wandb self.run_id = wandb.run.id + self._log_lock = threading.Lock() def log(self, logs, step): - self.wandb.log(logs, step=step) + with self._log_lock: + self.wandb.log(logs, step=step) + + def log_async(self, payload): + """Thread-safe log without step= (for async evals that finish out of order).""" + with self._log_lock: + self.wandb.log(payload) def close(self, model_path): artifact = self.wandb.Artifact(self.run_id, type="model") diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 46299d9946..206e2010cf 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ 
-86,11 +86,11 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n else: payload = {f"eval/{k}": v for k, v in metrics.items()} if payload: - # Don't pass step= here — async evals finish after - # training has moved past this step, and wandb rejects - # non-monotonic steps. Include step as a data field instead. payload["train_step"] = global_step - logger.wandb.log(payload) + if hasattr(logger, "log_async"): + logger.log_async(payload) + else: + logger.wandb.log(payload) else: print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") From e04e0e136e9422166ad954237c28552f0a2660ec Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 19:07:25 -0400 Subject: [PATCH 29/46] Replace lock-based wandb logging with queue-based approach Eval threads now put results on _eval_results_queue instead of calling wandb directly. The main thread drains the queue in mean_and_log(), keeping all wandb.log() calls single-threaded. Also fixes sync render to not use step= parameter (avoids non-monotonic step warning). 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 42 ++++++++++++++++++++++++++++++------------ pufferlib/utils.py | 21 ++++++++++++++------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index b6697f245f..1a52d24460 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -19,7 +19,6 @@ import importlib import json import configparser -import threading from threading import Thread from collections import defaultdict, deque from pathlib import Path @@ -139,6 +138,7 @@ def __init__(self, config, vecenv, policy, logger=None): self._render_proc_temp_files = {} # pid -> [temp_file_paths] self.eval_threads = [] + self._eval_results_queue = queue.Queue() # thread-safe queue for async eval metrics # LSTM if config["use_rnn"]: @@ -686,7 +686,13 @@ def _dispatch_render( os.remove(config_path) def _run_eval(self, fn, *args, **kwargs): - """Run an eval function, optionally in a background thread.""" + """Run an eval function, optionally in a background thread. + + Injects results_queue so eval functions put metrics on the queue + instead of logging directly. The main thread drains the queue in + mean_and_log(). 
+ """ + kwargs["results_queue"] = self._eval_results_queue eval_async = self.config.get("eval", {}).get("eval_async", False) # Handle string "False"/"True" from INI config if isinstance(eval_async, str): @@ -745,7 +751,7 @@ def check_render_queue(self): if payload: payload["train_step"] = step - self.logger.log_async(payload) + self._eval_results_queue.put(payload) except queue.Empty: pass @@ -786,6 +792,16 @@ def mean_and_log(self): return None self.logger.log(logs, agent_steps) + + # Drain eval results queue (populated by async eval threads and render processes) + while not self._eval_results_queue.empty(): + try: + payload = self._eval_results_queue.get_nowait() + if hasattr(self.logger, "wandb") and self.logger.wandb: + self.logger.wandb.log(payload) + except queue.Empty: + break + return logs def close(self): @@ -812,7 +828,7 @@ def close(self): p.kill() except Exception: print(f"Failed to clean up render process {p.pid}") - # Drain the queue — all finished processes have put their results + # Drain the render queue — moves results to _eval_results_queue self.check_render_queue() # Clean up any remaining temp files (from crashed processes) for pid, files in self._render_proc_temp_files.items(): @@ -828,6 +844,15 @@ def close(self): self.render_queue.close() self.render_queue.join_thread() + # Final drain of eval results queue before finishing wandb + while not self._eval_results_queue.empty(): + try: + payload = self._eval_results_queue.get_nowait() + if hasattr(self.logger, "wandb") and self.logger.wandb: + self.logger.wandb.log(payload) + except queue.Empty: + break + model_path = self.save_checkpoint() run_id = self.logger.run_id project_name = "puffer_drive" @@ -1226,16 +1251,9 @@ def __init__(self, args, load_id=None, resume="allow"): ) self.wandb = wandb self.run_id = wandb.run.id - self._log_lock = threading.Lock() def log(self, logs, step): - with self._log_lock: - self.wandb.log(logs, step=step) - - def log_async(self, payload): - """Thread-safe log 
without step= (for async evals that finish out of order).""" - with self._log_lock: - self.wandb.log(payload) + self.wandb.log(logs, step=step) def close(self, model_path): artifact = self.wandb.Artifact(self.run_id, type="model") diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 206e2010cf..66f7d91601 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -29,7 +29,9 @@ def _get_env_reward_bound_names(ini_path="pufferlib/config/ocean/drive.ini"): return bounds -def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_name, wandb_keys=None): +def _run_eval_subprocess( + config, logger, global_step, mode, extra_args, marker_name, wandb_keys=None, results_queue=None +): """Run an evaluation subprocess and log metrics to wandb. Args: @@ -40,6 +42,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n extra_args: List of extra CLI args appended to the base command marker_name: Marker prefix for JSON extraction (e.g. "WOSAC" looks for WOSAC_METRICS_START/END) wandb_keys: If dict, maps metric keys to wandb keys. If None, logs all as eval/. + results_queue: If provided, put results on this queue instead of logging directly. 
""" eval_name = marker_name.lower().replace("_", " ") try: @@ -87,8 +90,8 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n payload = {f"eval/{k}": v for k, v in metrics.items()} if payload: payload["train_step"] = global_step - if hasattr(logger, "log_async"): - logger.log_async(payload) + if results_queue is not None: + results_queue.put(payload) else: logger.wandb.log(payload) else: @@ -103,7 +106,7 @@ def _run_eval_subprocess(config, logger, global_step, mode, extra_args, marker_n traceback.print_exc() -def run_human_replay_eval_in_subprocess(config, logger, global_step): +def run_human_replay_eval_in_subprocess(config, logger, global_step, results_queue=None): eval_config = config.get("eval", {}) _run_eval_subprocess( config, @@ -130,10 +133,11 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): "offroad_rate": "human_replay/offroad_rate", "completion_rate": "human_replay/completion_rate", }, + results_queue=results_queue, ) -def run_wosac_eval_in_subprocess(config, logger, global_step): +def run_wosac_eval_in_subprocess(config, logger, global_step, results_queue=None): eval_config = config.get("eval", {}) _run_eval_subprocess( config, @@ -181,6 +185,7 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): "min_ade": "wosac/min_ade", "total_num_agents": "wosac/total_num_agents", }, + results_queue=results_queue, ) @@ -360,7 +365,8 @@ def render_videos( payload[f"{wandb_prefix}/world_state"] = videos_to_log_world if videos_to_log_agent: payload[f"{wandb_prefix}/agent_view"] = videos_to_log_agent - wandb_run.log(payload, step=global_step) + payload["train_step"] = global_step + wandb_run.log(payload) except subprocess.TimeoutExpired: print("C rendering timed out") @@ -418,7 +424,7 @@ def generate_human_replay_ini(eval_config, base_ini_path="pufferlib/config/ocean return tmp_path -def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_config): +def 
run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_config, results_queue=None): """Run policy evaluation with safe reward conditioning in a subprocess and log metrics.""" num_episodes = safe_eval_config.get("num_episodes", 300) @@ -461,4 +467,5 @@ def run_safe_eval_metrics_in_subprocess(config, logger, global_step, safe_eval_c mode="safe_eval", extra_args=extra_args, marker_name="SAFE_EVAL", + results_queue=results_queue, ) From f2df83c48817be071c190e3e94aa5aa7fdc6b369 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 20:56:56 -0400 Subject: [PATCH 30/46] Fix memory leak in shared() map counting loop Maps with active_agent_count > 0 were never freed during the shared init loop that counts agents. Only maps with 0 agents were freed. This leaked ~20MB per map, causing 20+ GB memory usage when num_maps=1000 (e.g. WOSAC eval). Fixed by: moving the store logic into an else branch for active maps, and always freeing the temporary env after both branches. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/binding.c | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 729a112459..aa4a744134 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -256,30 +256,30 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) { PyErr_SetString(PyExc_ValueError, error_msg); return NULL; } - - // Store map_id + } else { + // Map has active agents — record it PyObject *map_id_obj = PyLong_FromLong(map_id); PyList_SetItem(map_ids, env_count, map_id_obj); - // Store agent offset PyObject *offset = PyLong_FromLong(total_agent_count); PyList_SetItem(agent_offsets, env_count, offset); total_agent_count += env->active_agent_count; env_count++; - for (int j = 0; j < env->num_objects; j++) { - free_agent(&env->agents[j]); - } - for (int j = 0; j < env->num_roads; j++) { - free_road_element(&env->road_elements[j]); - } - free(env->agents); - free(env->road_elements); - free(env->road_scenario_ids); - free(env->active_agent_indices); - free(env->static_agent_indices); - free(env->expert_static_agent_indices); - free(env); - continue; } + + // Free the temporary env (actual envs are created in init) + for (int j = 0; j < env->num_objects; j++) { + free_agent(&env->agents[j]); + } + for (int j = 0; j < env->num_roads; j++) { + free_road_element(&env->road_elements[j]); + } + free(env->agents); + free(env->road_elements); + free(env->road_scenario_ids); + free(env->active_agent_indices); + free(env->static_agent_indices); + free(env->expert_static_agent_indices); + free(env); } if (total_agent_count >= num_agents) { From 8a1c261270f8784997de5af28d5a2fc79bb9de6a Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 22:14:14 -0400 Subject: [PATCH 31/46] Update evaluation docs: async defaults, human replay eval types Co-Authored-By: Claude Opus 4.6 
--- docs/src/evaluation.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/src/evaluation.md b/docs/src/evaluation.md index 4d5c249456..653463e1f6 100644 --- a/docs/src/evaluation.md +++ b/docs/src/evaluation.md @@ -12,7 +12,8 @@ PufferDrive supports running evaluations automatically during training. There ar | **Safe eval render** | Records videos with safe reward conditioning | `--safe-eval.enabled True` | `--safe-eval.interval N` | | **Safe eval metrics** | Runs policy in subprocess, logs driving metrics | `--safe-eval.enabled True` | `--safe-eval.interval N` | | **WOSAC realism** | Measures distributional realism (WOSAC benchmark) | `--eval.wosac-realism-eval True` | `--eval.eval-interval N` | -| **Human replay** | Tests policy alongside replayed human trajectories | `--eval.human-replay-eval True` | `--eval.eval-interval N` | +| **Human replay render** | Records videos with policy-controlled SDC + replayed humans | `--eval.human-replay-eval True` | `--eval.eval-interval N` | +| **Human replay metrics** | Logs collision/offroad/completion rates vs human replays | `--eval.human-replay-eval True` | `--eval.eval-interval N` | All eval types trigger at `epoch % interval == 0`. They require a saved checkpoint, so **`checkpoint-interval` must be <= the smallest eval interval**. @@ -69,9 +70,9 @@ velocity = 0.005 ### Async vs sync evaluation -By default, WOSAC and human replay evals run synchronously, blocking training until they finish. Set `--eval.eval-async True` to run them in background threads instead. +By default, all evals run asynchronously (`--train.render-async True` and `--eval.eval-async True` in `drive.ini`). Video renders run in separate processes, while metric evals (safe eval, WOSAC, human replay) run in background threads. Results are queued and logged to wandb on the main thread during the next training epoch. -> **Note:** Render and safe eval always run synchronously in the training loop. 
The `eval_async` flag only affects WOSAC and human replay evaluations. +Set `--eval.eval-async False` to run metric evals synchronously (blocks training until they finish). Set `--train.render-async False` to run video renders synchronously. ## Sanity maps From 9f9da0f82db4bce43eef7ae687774f375e8d76d1 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 22:17:24 -0400 Subject: [PATCH 32/46] Fix bugs: C memory leaks, queue drain, stats mean, missing defaults - Free tracks_to_predict_indices in shared() loop (leaked per map) - Py_DECREF original lists after PyList_GetSlice (ref count leak) - Use get_nowait() directly instead of unreliable queue.empty() - Fix stats mean loop: del then re-insert was keeping bad values - Add defaults for human_replay config keys to prevent KeyError Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/binding.c | 6 +++++- pufferlib/pufferl.py | 11 ++++------- pufferlib/utils.py | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index aa4a744134..0239c2035a 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -245,6 +245,7 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) { free(env->agents); free(env->road_elements); free(env->road_scenario_ids); + free(env->tracks_to_predict_indices); free(env->active_agent_indices); free(env->static_agent_indices); free(env->expert_static_agent_indices); @@ -276,6 +277,7 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) { free(env->agents); free(env->road_elements); free(env->road_scenario_ids); + free(env->tracks_to_predict_indices); free(env->active_agent_indices); free(env->static_agent_indices); free(env->expert_static_agent_indices); @@ -290,9 +292,11 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) { PyList_SetItem(agent_offsets, env_count, final_total_agent_count); 
PyObject *final_env_count = PyLong_FromLong(env_count); - // resize lists + // resize lists (GetSlice returns new refs; release originals) PyObject *resized_agent_offsets = PyList_GetSlice(agent_offsets, 0, env_count + 1); PyObject *resized_map_ids = PyList_GetSlice(map_ids, 0, env_count); + Py_DECREF(agent_offsets); + Py_DECREF(map_ids); PyObject *tuple = PyTuple_New(3); PyTuple_SetItem(tuple, 0, resized_agent_offsets); PyTuple_SetItem(tuple, 1, resized_map_ids); diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 1a52d24460..5f74e0c1ac 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -764,14 +764,11 @@ def mean_and_log(self): config = self.config for k in list(self.stats.keys()): - v = self.stats[k] try: - v = np.mean(v) - except: + self.stats[k] = np.mean(self.stats[k]) + except Exception: del self.stats[k] - self.stats[k] = v - device = config["device"] agent_steps = int(dist_sum(self.global_step, device)) logs = { @@ -794,7 +791,7 @@ def mean_and_log(self): self.logger.log(logs, agent_steps) # Drain eval results queue (populated by async eval threads and render processes) - while not self._eval_results_queue.empty(): + while True: try: payload = self._eval_results_queue.get_nowait() if hasattr(self.logger, "wandb") and self.logger.wandb: @@ -845,7 +842,7 @@ def close(self): self.render_queue.join_thread() # Final drain of eval results queue before finishing wandb - while not self._eval_results_queue.empty(): + while True: try: payload = self._eval_results_queue.get_nowait() if hasattr(self.logger, "wandb") and self.logger.wandb: diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 66f7d91601..eca1e5576a 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -119,9 +119,9 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step, results_que "--eval.human-replay-eval", "True", "--eval.human-replay-num-agents", - str(eval_config["human_replay_num_agents"]), + str(eval_config.get("human_replay_num_agents", 16)), 
"--eval.human-replay-control-mode", - str(eval_config["human_replay_control_mode"]), + str(eval_config.get("human_replay_control_mode", "control_sdc_only")), "--eval.map-dir", str(eval_config.get("map_dir", "resources/drive/binaries/training")), "--env.num-maps", From 455ab5537acf5022e7f53b4bc14e8e1010e4baf7 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 22:26:58 -0400 Subject: [PATCH 33/46] Default evals to sync instead of async Co-Authored-By: Claude Opus 4.6 --- docs/src/evaluation.md | 4 +--- pufferlib/config/ocean/drive.ini | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/src/evaluation.md b/docs/src/evaluation.md index 653463e1f6..00a23ed1a2 100644 --- a/docs/src/evaluation.md +++ b/docs/src/evaluation.md @@ -70,9 +70,7 @@ velocity = 0.005 ### Async vs sync evaluation -By default, all evals run asynchronously (`--train.render-async True` and `--eval.eval-async True` in `drive.ini`). Video renders run in separate processes, while metric evals (safe eval, WOSAC, human replay) run in background threads. Results are queued and logged to wandb on the main thread during the next training epoch. - -Set `--eval.eval-async False` to run metric evals synchronously (blocks training until they finish). Set `--train.render-async False` to run video renders synchronously. +By default, all evals run synchronously (blocking training until they finish). Set `--train.render-async True` to run video renders in separate processes, and `--eval.eval-async True` to run metric evals (safe eval, WOSAC, human replay) in background threads. When async, results are queued and logged to wandb on the main thread during the next training epoch. 
## Sanity maps diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 354cb0e1a7..9e9f43045c 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -158,7 +158,7 @@ vtrace_rho_clip = 1 checkpoint_interval = 1000 ; Rendering options render = True -render_async = True +render_async = False render_interval = 1000 ; If True, show exactly what the agent sees in agent observation obs_only = True @@ -176,7 +176,7 @@ render_map = none [eval] eval_interval = 1000 ; If True, run eval subprocesses (wosac, human replay, safe eval metrics) in background threads -eval_async = True # Run eval subprocesses (wosac, human replay, safe eval metrics) in background threads +eval_async = False # Run eval subprocesses (wosac, human replay, safe eval metrics) in background threads ; Path to dataset used for evaluation map_dir = "resources/drive/binaries/training" ; Number of scenarios to process per batch From 43413b20ccd69420ba92b94d0c23ca50f16c9deb Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 22:58:39 -0400 Subject: [PATCH 34/46] =?UTF-8?q?Revert=20Py=5FDECREF=20change=20in=20shar?= =?UTF-8?q?ed()=20=E2=80=94=20pre-existing,=20not=20our=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/binding.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 0239c2035a..7dcec1ebf6 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -292,11 +292,9 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) { PyList_SetItem(agent_offsets, env_count, final_total_agent_count); PyObject *final_env_count = PyLong_FromLong(env_count); - // resize lists (GetSlice returns new refs; release originals) + // resize lists PyObject *resized_agent_offsets = 
PyList_GetSlice(agent_offsets, 0, env_count + 1); PyObject *resized_map_ids = PyList_GetSlice(map_ids, 0, env_count); - Py_DECREF(agent_offsets); - Py_DECREF(map_ids); PyObject *tuple = PyTuple_New(3); PyTuple_SetItem(tuple, 0, resized_agent_offsets); PyTuple_SetItem(tuple, 1, resized_map_ids); From 14eaa4272dd1addc4e0ee502708a7cd66287a23d Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 23:05:22 -0400 Subject: [PATCH 35/46] Refactor shared() to use continue pattern instead of if/else Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/binding.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 7dcec1ebf6..8990d8e770 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -257,17 +257,32 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) { PyErr_SetString(PyExc_ValueError, error_msg); return NULL; } - } else { - // Map has active agents — record it - PyObject *map_id_obj = PyLong_FromLong(map_id); - PyList_SetItem(map_ids, env_count, map_id_obj); - PyObject *offset = PyLong_FromLong(total_agent_count); - PyList_SetItem(agent_offsets, env_count, offset); - total_agent_count += env->active_agent_count; - env_count++; + + for (int j = 0; j < env->num_objects; j++) { + free_agent(&env->agents[j]); + } + for (int j = 0; j < env->num_roads; j++) { + free_road_element(&env->road_elements[j]); + } + free(env->agents); + free(env->road_elements); + free(env->road_scenario_ids); + free(env->tracks_to_predict_indices); + free(env->active_agent_indices); + free(env->static_agent_indices); + free(env->expert_static_agent_indices); + free(env); + continue; } - // Free the temporary env (actual envs are created in init) + // Map has active agents — record it + PyObject *map_id_obj = PyLong_FromLong(map_id); + PyList_SetItem(map_ids, env_count, map_id_obj); + PyObject *offset 
= PyLong_FromLong(total_agent_count); + PyList_SetItem(agent_offsets, env_count, offset); + total_agent_count += env->active_agent_count; + env_count++; + for (int j = 0; j < env->num_objects; j++) { free_agent(&env->agents[j]); } From afb8a33f108f74f94a5ab291ab0fa10303f47014 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 23:08:06 -0400 Subject: [PATCH 36/46] Remove bare try/except blocks from eval launching code Let errors propagate instead of silently swallowing them. Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 69 ++++---- pufferlib/utils.py | 382 +++++++++++++++++++++---------------------- 2 files changed, 214 insertions(+), 237 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 5f74e0c1ac..761c47b874 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -548,8 +548,8 @@ def train(self): latest_cpt = max(model_files) bin_path = f"{model_dir}.bin" + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} try: - export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} export( args=export_args, env_name=self.config["env"], @@ -567,46 +567,37 @@ def train(self): self._dispatch_render(model_dir, bin_path, env_cfg, wandb_log, wandb_run, "render") if should_safe_eval: - try: - safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) - self._dispatch_render( - model_dir, - bin_path, - env_cfg, - wandb_log, - wandb_run, - "eval", - config_path=safe_ini_path, - wandb_prefix="eval", - ) - self._run_eval( - pufferlib.utils.run_safe_eval_metrics_in_subprocess, - self.config, - self.logger, - self.global_step, - safe_eval_config, - ) - except Exception as e: - print(f"Failed to run safe eval: {e}") + safe_ini_path = pufferlib.utils.generate_safe_eval_ini(safe_eval_config) + self._dispatch_render( + model_dir, + bin_path, + env_cfg, + wandb_log, + wandb_run, + "eval", + config_path=safe_ini_path, + 
wandb_prefix="eval", + ) + self._run_eval( + pufferlib.utils.run_safe_eval_metrics_in_subprocess, + self.config, + self.logger, + self.global_step, + safe_eval_config, + ) if should_human_replay: - try: - hr_ini_path = pufferlib.utils.generate_human_replay_ini(self.config["eval"]) - self._dispatch_render( - model_dir, - bin_path, - env_cfg, - wandb_log, - wandb_run, - "human_replay", - config_path=hr_ini_path, - wandb_prefix="human_replay", - ) - except Exception as e: - print(f"Failed to run human replay render: {e}") - - except Exception as e: - print(f"Failed to export model weights: {e}") + hr_ini_path = pufferlib.utils.generate_human_replay_ini(self.config["eval"]) + self._dispatch_render( + model_dir, + bin_path, + env_cfg, + wandb_log, + wandb_run, + "human_replay", + config_path=hr_ini_path, + wandb_prefix="human_replay", + ) finally: if os.path.exists(bin_path): os.remove(bin_path) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index eca1e5576a..cf7f4de867 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -45,65 +45,56 @@ def _run_eval_subprocess( results_queue: If provided, put results on this queue instead of logging directly. 
""" eval_name = marker_name.lower().replace("_", " ") - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + run_id = logger.run_id + model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - if not model_files: - print(f"No model files found for {eval_name} evaluation") - return + if not model_files: + print(f"No model files found for {eval_name} evaluation") + return - latest_cpt = max(model_files) - - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - mode, - config["env"], - "--load-model-path", - latest_cpt, - "--train.device", - _normalize_device(config.get("device", "cuda")), - ] - - cmd += extra_args - - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - - start_marker = f"{marker_name}_METRICS_START" - end_marker = f"{marker_name}_METRICS_END" - - if result.returncode == 0: - stdout = result.stdout - has_markers = start_marker in stdout and end_marker in stdout - if has_markers: - start = stdout.find(start_marker) + len(start_marker) - end = stdout.find(end_marker) - metrics = json.loads(stdout[start:end].strip()) - - if hasattr(logger, "wandb") and logger.wandb: - if wandb_keys is not None: - payload = {wandb_keys[k]: metrics[k] for k in wandb_keys if k in metrics} - else: - payload = {f"eval/{k}": v for k, v in metrics.items()} - if payload: - payload["train_step"] = global_step - if results_queue is not None: - results_queue.put(payload) - else: - logger.wandb.log(payload) - else: - print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") + latest_cpt = max(model_files) + + cmd = [ + sys.executable, + "-m", + "pufferlib.pufferl", + mode, + config["env"], + "--load-model-path", + latest_cpt, + "--train.device", + _normalize_device(config.get("device", "cuda")), + 
] + + cmd += extra_args + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) + + start_marker = f"{marker_name}_METRICS_START" + end_marker = f"{marker_name}_METRICS_END" - except subprocess.TimeoutExpired: - print(f"{eval_name} evaluation timed out") - except Exception as e: - import traceback + if result.returncode == 0: + stdout = result.stdout + has_markers = start_marker in stdout and end_marker in stdout + if has_markers: + start = stdout.find(start_marker) + len(start_marker) + end = stdout.find(end_marker) + metrics = json.loads(stdout[start:end].strip()) - print(f"Failed to run {eval_name} evaluation: {e}") - traceback.print_exc() + if hasattr(logger, "wandb") and logger.wandb: + if wandb_keys is not None: + payload = {wandb_keys[k]: metrics[k] for k in wandb_keys if k in metrics} + else: + payload = {f"eval/{k}": v for k, v in metrics.items()} + if payload: + payload["train_step"] = global_step + if results_queue is not None: + results_queue.put(payload) + else: + logger.wandb.log(payload) + else: + print(f"{eval_name} evaluation failed with exit code {result.returncode}: {result.stderr[-1000:]}") def run_human_replay_eval_in_subprocess(config, logger, global_step, results_queue=None): @@ -226,152 +217,147 @@ def render_videos( model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - try: - video_output_dir = os.path.join(model_dir, "videos") - os.makedirs(video_output_dir, exist_ok=True) - - # TODO: Fix memory leaks so that this is not needed - env_vars = os.environ.copy() - env_vars["ASAN_OPTIONS"] = "exitcode=0" - - base_cmd = ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] - - if config_path: - base_cmd.extend(["--config", config_path]) - - if config.get("show_grid", False): - base_cmd.append("--show-grid") - if config.get("obs_only", False): - base_cmd.append("--obs-only") - if config.get("show_lasers", False): - base_cmd.append("--lasers") - if 
config.get("show_human_logs", False): - base_cmd.append("--show-human-logs") - if config.get("zoom_in", False): - base_cmd.append("--zoom-in") - - frame_skip = config.get("frame_skip", 1) - if frame_skip > 1: - base_cmd.extend(["--frame-skip", str(frame_skip)]) - - view_mode = config.get("view_mode", "both") - base_cmd.extend(["--view", view_mode]) - - if env_cfg is not None and getattr(env_cfg, "num_maps", None): - base_cmd.extend(["--num-maps", str(env_cfg.num_maps)]) - - base_cmd.extend(["--policy-name", bin_path]) - - # Handle single or multiple map rendering - render_maps = config.get("render_map", None) - if render_maps is None or render_maps == "none": - map_dir = None - if env_cfg is not None and hasattr(env_cfg, "map_dir"): - map_dir = env_cfg.map_dir - if map_dir and os.path.isdir(map_dir): - import random - - bin_files = [f for f in os.listdir(map_dir) if f.endswith(".bin")] - if bin_files: - render_maps = [os.path.join(map_dir, random.choice(bin_files))] - else: - print(f"Warning: No .bin files found in {map_dir}, skipping render") - return + video_output_dir = os.path.join(model_dir, "videos") + os.makedirs(video_output_dir, exist_ok=True) + + # TODO: Fix memory leaks so that this is not needed + env_vars = os.environ.copy() + env_vars["ASAN_OPTIONS"] = "exitcode=0" + + base_cmd = ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] + + if config_path: + base_cmd.extend(["--config", config_path]) + + if config.get("show_grid", False): + base_cmd.append("--show-grid") + if config.get("obs_only", False): + base_cmd.append("--obs-only") + if config.get("show_lasers", False): + base_cmd.append("--lasers") + if config.get("show_human_logs", False): + base_cmd.append("--show-human-logs") + if config.get("zoom_in", False): + base_cmd.append("--zoom-in") + + frame_skip = config.get("frame_skip", 1) + if frame_skip > 1: + base_cmd.extend(["--frame-skip", str(frame_skip)]) + + view_mode = config.get("view_mode", "both") + base_cmd.extend(["--view", 
view_mode]) + + if env_cfg is not None and getattr(env_cfg, "num_maps", None): + base_cmd.extend(["--num-maps", str(env_cfg.num_maps)]) + + base_cmd.extend(["--policy-name", bin_path]) + + # Handle single or multiple map rendering + render_maps = config.get("render_map", None) + if render_maps is None or render_maps == "none": + map_dir = None + if env_cfg is not None and hasattr(env_cfg, "map_dir"): + map_dir = env_cfg.map_dir + if map_dir and os.path.isdir(map_dir): + import random + + bin_files = [f for f in os.listdir(map_dir) if f.endswith(".bin")] + if bin_files: + render_maps = [os.path.join(map_dir, random.choice(bin_files))] else: - print(f"Warning: map_dir not found or invalid ({map_dir}), skipping render") + print(f"Warning: No .bin files found in {map_dir}, skipping render") return - elif isinstance(render_maps, (str, os.PathLike)): - render_maps = [render_maps] else: - render_maps = list(render_maps) - - file_prefix = f"{wandb_prefix}_" if wandb_prefix != "render" else "" - videos_to_log_world = [] - videos_to_log_agent = [] - generated_videos = {"output_topdown": [], "output_agent": []} - output_topdown = f"resources/drive/{file_prefix}output_topdown_{epoch}" - output_agent = f"resources/drive/{file_prefix}output_agent_{epoch}" - - for i, map_path in enumerate(render_maps): - cmd = list(base_cmd) - if map_path is not None and os.path.exists(map_path): - cmd.extend(["--map-name", str(map_path)]) - - output_topdown_map = output_topdown + (f"_map{i:02d}.mp4" if len(render_maps) > 1 else ".mp4") - output_agent_map = output_agent + (f"_map{i:02d}.mp4" if len(render_maps) > 1 else ".mp4") - - cmd.extend(["--output-topdown", output_topdown_map]) - cmd.extend(["--output-agent", output_agent_map]) - - result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=1200, env=env_vars) - - vids_exist = os.path.exists(output_topdown_map) and os.path.exists(output_agent_map) - - if result.returncode == 0 or (result.returncode == 1 and 
vids_exist): - videos = [ - ( - "output_topdown", - output_topdown_map, - f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_topdown.mp4" - if map_path - else f"{file_prefix}epoch_{epoch:06d}_topdown.mp4", - ), - ( - "output_agent", - output_agent_map, - f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_agent.mp4" - if map_path - else f"{file_prefix}epoch_{epoch:06d}_agent.mp4", - ), - ] - - for vid_type, source_vid, target_filename in videos: - if os.path.exists(source_vid): - target_path = os.path.join(video_output_dir, target_filename) - shutil.move(source_vid, target_path) - generated_videos[vid_type].append(target_path) - if render_async: - continue - if wandb_log: - import wandb - - if "topdown" in target_filename: - videos_to_log_world.append(wandb.Video(target_path, format="mp4")) - else: - videos_to_log_agent.append(wandb.Video(target_path, format="mp4")) - else: - print(f"Video generation completed but {source_vid} not found") - if result.stdout: - print(f"StdOUT: {result.stdout}") - if result.stderr: - print(f"StdERR: {result.stderr}") - else: - print(f"C rendering failed (map index {i}) with exit code {result.returncode}: {result.stdout}") - - if render_async: - render_queue.put( - { - "videos": generated_videos, - "step": global_step, - "wandb_prefix": wandb_prefix, - "bin_path": bin_path, - "config_path": config_path, - } - ) - - if wandb_log and (videos_to_log_world or videos_to_log_agent) and not render_async: - payload = {} - if videos_to_log_world: - payload[f"{wandb_prefix}/world_state"] = videos_to_log_world - if videos_to_log_agent: - payload[f"{wandb_prefix}/agent_view"] = videos_to_log_agent - payload["train_step"] = global_step - wandb_run.log(payload) - - except subprocess.TimeoutExpired: - print("C rendering timed out") - except Exception as e: - print(f"Failed to generate videos: {e}") + print(f"Warning: map_dir not found or invalid ({map_dir}), skipping render") + return + elif isinstance(render_maps, (str, os.PathLike)): + render_maps = 
[render_maps] + else: + render_maps = list(render_maps) + + file_prefix = f"{wandb_prefix}_" if wandb_prefix != "render" else "" + videos_to_log_world = [] + videos_to_log_agent = [] + generated_videos = {"output_topdown": [], "output_agent": []} + output_topdown = f"resources/drive/{file_prefix}output_topdown_{epoch}" + output_agent = f"resources/drive/{file_prefix}output_agent_{epoch}" + + for i, map_path in enumerate(render_maps): + cmd = list(base_cmd) + if map_path is not None and os.path.exists(map_path): + cmd.extend(["--map-name", str(map_path)]) + + output_topdown_map = output_topdown + (f"_map{i:02d}.mp4" if len(render_maps) > 1 else ".mp4") + output_agent_map = output_agent + (f"_map{i:02d}.mp4" if len(render_maps) > 1 else ".mp4") + + cmd.extend(["--output-topdown", output_topdown_map]) + cmd.extend(["--output-agent", output_agent_map]) + + result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=1200, env=env_vars) + + vids_exist = os.path.exists(output_topdown_map) and os.path.exists(output_agent_map) + + if result.returncode == 0 or (result.returncode == 1 and vids_exist): + videos = [ + ( + "output_topdown", + output_topdown_map, + f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_topdown.mp4" + if map_path + else f"{file_prefix}epoch_{epoch:06d}_topdown.mp4", + ), + ( + "output_agent", + output_agent_map, + f"{file_prefix}epoch_{epoch:06d}_map{i:02d}_agent.mp4" + if map_path + else f"{file_prefix}epoch_{epoch:06d}_agent.mp4", + ), + ] + + for vid_type, source_vid, target_filename in videos: + if os.path.exists(source_vid): + target_path = os.path.join(video_output_dir, target_filename) + shutil.move(source_vid, target_path) + generated_videos[vid_type].append(target_path) + if render_async: + continue + if wandb_log: + import wandb + + if "topdown" in target_filename: + videos_to_log_world.append(wandb.Video(target_path, format="mp4")) + else: + videos_to_log_agent.append(wandb.Video(target_path, format="mp4")) + else: + 
print(f"Video generation completed but {source_vid} not found") + if result.stdout: + print(f"StdOUT: {result.stdout}") + if result.stderr: + print(f"StdERR: {result.stderr}") + else: + print(f"C rendering failed (map index {i}) with exit code {result.returncode}: {result.stdout}") + + if render_async: + render_queue.put( + { + "videos": generated_videos, + "step": global_step, + "wandb_prefix": wandb_prefix, + "bin_path": bin_path, + "config_path": config_path, + } + ) + + if wandb_log and (videos_to_log_world or videos_to_log_agent) and not render_async: + payload = {} + if videos_to_log_world: + payload[f"{wandb_prefix}/world_state"] = videos_to_log_world + if videos_to_log_agent: + payload[f"{wandb_prefix}/agent_view"] = videos_to_log_agent + payload["train_step"] = global_step + wandb_run.log(payload) + def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/ocean/drive.ini"): From b00ec641e42898fd8917942b4c8b3866e2673658 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Fri, 13 Mar 2026 23:20:33 -0400 Subject: [PATCH 37/46] Reduce eval thread join timeout from 660s to 10s in close() Daemon threads die with the process anyway; no need to block shutdown for 11 minutes per stuck thread. Co-Authored-By: Claude Opus 4.6 --- pufferlib/pufferl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 761c47b874..1cc3cd8ce0 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -796,9 +796,12 @@ def close(self): self.vecenv.close() self.utilization.stop() - # Wait for any background eval threads to finish + # Wait briefly for any background eval threads to finish. + # These are daemon threads, so they'll die when the process exits. 
for t in self.eval_threads: - t.join(timeout=660) # slightly longer than subprocess timeout (600s) + t.join(timeout=10) + if t.is_alive(): + log.warning(f"Eval thread {t.name} still running after 10s, abandoning") self.eval_threads = [] if self.render_async: From 7a77642d1a42d80cacb325a52bcf51b051d4dac7 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 15 Mar 2026 00:22:05 -0400 Subject: [PATCH 38/46] Enable human replay eval by default, fix ruff formatting Co-Authored-By: Claude Opus 4.6 --- pufferlib/config/ocean/drive.ini | 2 +- pufferlib/utils.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 9e9f43045c..c92f659e0c 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -211,7 +211,7 @@ wosac_episode_length = 91 ; Evaluation mode: "policy", "ground_truth" wosac_eval_mode = "policy" ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) -human_replay_eval = False +human_replay_eval = True ; Number of agents for human replay evaluation human_replay_num_agents = 16 ; Control only the self-driving car diff --git a/pufferlib/utils.py b/pufferlib/utils.py index cf7f4de867..3c4ea8db53 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -359,7 +359,6 @@ def render_videos( wandb_run.log(payload) - def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/ocean/drive.ini"): """Generate a temporary ini file with safe/law-abiding reward conditioning values. From 2c17312db720bb060e167150165f5b85198931b2 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 15 Mar 2026 01:14:43 -0400 Subject: [PATCH 39/46] Make safe eval render match metrics subprocess setup Set episode_length, resample_frequency=0, num_agents, and goal distances in the generated ini so the render shows the same behavior the metrics subprocess measures. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 3c4ea8db53..43bc2d94ec 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -379,6 +379,13 @@ def generate_safe_eval_ini(safe_eval_config, base_ini_path="pufferlib/config/oce config.set("env", "reward_randomization", "1") config.set("env", "reward_conditioning", "1") + # Match the metrics subprocess setup so the render shows the same behavior + config.set("env", "episode_length", str(safe_eval_config.get("episode_length", 1000))) + config.set("env", "resample_frequency", "0") + config.set("env", "num_agents", str(safe_eval_config.get("num_agents", 64))) + config.set("env", "min_goal_distance", str(safe_eval_config.get("min_goal_distance", 0.5))) + config.set("env", "max_goal_distance", str(safe_eval_config.get("max_goal_distance", 1000.0))) + fd, tmp_path = tempfile.mkstemp(suffix=".ini", prefix="safe_eval_") with os.fdopen(fd, "w") as f: config.write(f) From 0a42f383a9e0ccf6c48c0bea094f2bd7c7ef47ca Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 16 Mar 2026 13:34:15 +0000 Subject: [PATCH 40/46] Add --scale CLI arg to visualize binary for controlling render resolution Default remains 6px/world-unit. Large CARLA maps (900m+) render at 5000+px which causes timeouts with software rendering on cluster. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/visualize.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 511c7454c8..a3fca755f6 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -193,7 +193,7 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in, const char *config_path) { + const char *output_agent, int num_maps, int zoom_in, const char *config_path, float render_scale) { // Parse configuration from INI file env_init_config conf = {0}; @@ -316,7 +316,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o float map_height = env.grid_map->top_left_y - env.grid_map->bottom_right_y; printf("Map size: %.1fx%.1f\n", map_width, map_height); - float scale = 6.0f; + float scale = render_scale > 0 ? 
render_scale : 6.0f; int img_width = (int)roundf(map_width * scale / 2.0f) * 2; int img_height = (int)roundf(map_height * scale / 2.0f) * 2; @@ -477,6 +477,7 @@ int main(int argc, char *argv[]) { const char *output_topdown = NULL; const char *output_agent = NULL; int num_maps = conf.num_maps; + float render_scale = 0; // Parse command line arguments for (int i = 1; i < argc; i++) { @@ -547,10 +548,15 @@ int main(int argc, char *argv[]) { if (i + 1 < argc) { i++; } + } else if (strcmp(argv[i], "--scale") == 0) { + if (i + 1 < argc) { + render_scale = atof(argv[i + 1]); + i++; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in, config_path); + output_agent, num_maps, zoom_in, config_path, render_scale); return 0; } From 8125393d8c5a4489895a8e026d3d157f093782ae Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 16 Mar 2026 13:41:08 +0000 Subject: [PATCH 41/46] Increase render subprocess timeout to 3600s Render runs async so a long timeout doesn't block training. Large CARLA maps with software rendering can take 5+ minutes at default scale. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 43bc2d94ec..40b87b2381 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -293,7 +293,7 @@ def render_videos( cmd.extend(["--output-topdown", output_topdown_map]) cmd.extend(["--output-agent", output_agent_map]) - result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=1200, env=env_vars) + result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=3600, env=env_vars) vids_exist = os.path.exists(output_topdown_map) and os.path.exists(output_agent_map) From 2c49401d83a9a9f69408ac2df372cc412871bd9b Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 16 Mar 2026 13:43:24 +0000 Subject: [PATCH 42/46] Enable async rendering by default Prevents render from blocking training, especially important for large CARLA maps that can take 5+ minutes with software rendering. Co-Authored-By: Claude Opus 4.6 --- pufferlib/config/ocean/drive.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index c92f659e0c..f3e093a5c3 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -158,7 +158,7 @@ vtrace_rho_clip = 1 checkpoint_interval = 1000 ; Rendering options render = True -render_async = False +render_async = True render_interval = 1000 ; If True, show exactly what the agent sees in agent observation obs_only = True From 47caa0c6e88a377d1d5ff9fd44013d5bd98cd804 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 16 Mar 2026 23:42:57 +0000 Subject: [PATCH 43/46] Randomize agent positions on every respawn in variable agent mode Previously agents always reset to their initial spawn position. 
Now in INIT_VARIABLE_AGENT_NUMBER mode, both mid-episode respawns and full episode resets pick a new random collision-free position on a drivable lane via length-weighted sampling. Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/drive.h | 120 +++++++++++++++++++++++++++++++--- 1 file changed, 111 insertions(+), 9 deletions(-) diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 3a5c0e468f..99ea10f1e6 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -2634,16 +2634,110 @@ void compute_observations(Drive *env) { } } +// Find a random collision-free position on a drivable lane for an existing agent. +// Returns true if a valid position was found and updates the agent's sim_x/y/z/heading. +static bool randomize_agent_position(Drive *env, int agent_idx) { + Agent *agent = &env->agents[agent_idx]; + + // Pre-compute drivable lanes + int drivable_lanes[env->num_roads]; + float lane_lengths[env->num_roads]; + int num_drivable = 0; + float total_lane_length = 0.0f; + for (int i = 0; i < env->num_roads; i++) { + if (env->road_elements[i].type == ROAD_LANE && env->road_elements[i].polyline_length > 0.0f) { + drivable_lanes[num_drivable] = i; + lane_lengths[num_drivable] = env->road_elements[i].polyline_length; + total_lane_length += lane_lengths[num_drivable]; + num_drivable++; + } + } + + if (num_drivable == 0) return false; + + for (int attempt = 0; attempt < MAX_SPAWN_ATTEMPTS; attempt++) { + // Length-weighted lane selection + float r = ((float)rand() / (float)RAND_MAX) * total_lane_length; + float cumulative = 0.0f; + int selected = num_drivable - 1; + for (int k = 0; k < num_drivable; k++) { + cumulative += lane_lengths[k]; + if (r < cumulative) { + selected = k; + break; + } + } + RoadMapElement *lane = &env->road_elements[drivable_lanes[selected]]; + + float spawn_x, spawn_y, spawn_z, spawn_heading; + get_random_point_on_lane(lane, &spawn_x, &spawn_y, &spawn_z, &spawn_heading); + spawn_z += 
agent->sim_height / 2.0f; + + // Check collision with all other active agents (excluding this one) + bool collision = false; + for (int j = 0; j < env->active_agent_count; j++) { + int other_idx = env->active_agent_indices[j]; + if (other_idx == agent_idx) continue; + Agent *other = &env->agents[other_idx]; + if (other->sim_x == INVALID_POSITION || other->removed) continue; + float dx = spawn_x - other->sim_x; + float dy = spawn_y - other->sim_y; + float dist = sqrtf(dx * dx + dy * dy); + float min_dist = (agent->sim_length + other->sim_length) / 2.0f; + if (dist < min_dist) { + collision = true; + break; + } + } + if (collision) continue; + + // Check offroad + if (check_spawn_offroad(env, spawn_x, spawn_y, spawn_z, spawn_heading, + agent->sim_length, agent->sim_width, agent->sim_height)) + continue; + + agent->sim_x = spawn_x; + agent->sim_y = spawn_y; + agent->sim_z = spawn_z; + agent->sim_heading = spawn_heading; + agent->heading_x = cosf(spawn_heading); + agent->heading_y = sinf(spawn_heading); + // Update stored initial position so future non-random resets are consistent + agent->log_trajectory_x[0] = spawn_x; + agent->log_trajectory_y[0] = spawn_y; + agent->log_trajectory_z[0] = spawn_z; + agent->log_heading[0] = spawn_heading; + return true; + } + return false; +} + void respawn_agent(Drive *env, int agent_idx) { Agent *agent = &env->agents[agent_idx]; - agent->sim_x = agent->log_trajectory_x[0]; - agent->sim_y = agent->log_trajectory_y[0]; - agent->sim_z = agent->log_trajectory_z[0]; - agent->sim_heading = agent->log_heading[0]; - agent->heading_x = cosf(agent->sim_heading); - agent->heading_y = sinf(agent->sim_heading); - agent->sim_vx = agent->log_velocity_x[0]; - agent->sim_vy = agent->log_velocity_y[0]; + + if (env->init_mode == INIT_VARIABLE_AGENT_NUMBER) { + if (!randomize_agent_position(env, agent_idx)) { + // Fallback to original position if no valid spawn found + agent->sim_x = agent->log_trajectory_x[0]; + agent->sim_y = 
agent->log_trajectory_y[0]; + agent->sim_z = agent->log_trajectory_z[0]; + agent->sim_heading = agent->log_heading[0]; + agent->heading_x = cosf(agent->sim_heading); + agent->heading_y = sinf(agent->sim_heading); + } + } else { + agent->sim_x = agent->log_trajectory_x[0]; + agent->sim_y = agent->log_trajectory_y[0]; + agent->sim_z = agent->log_trajectory_z[0]; + agent->sim_heading = agent->log_heading[0]; + agent->heading_x = cosf(agent->sim_heading); + agent->heading_y = sinf(agent->sim_heading); + } + + agent->sim_vx = 0.0f; + agent->sim_vy = 0.0f; + agent->sim_speed = 0.0f; + agent->sim_speed_signed = 0.0f; agent->metrics_array[COLLISION_IDX] = 0.0f; agent->metrics_array[OFFROAD_IDX] = 0.0f; agent->metrics_array[REACHED_GOAL_IDX] = 0.0f; @@ -2908,7 +3002,15 @@ void move_dynamics(Drive *env, int action_idx, int agent_idx) { void c_reset(Drive *env) { env->timestep = env->init_steps; - set_start_position(env); + if (env->init_mode == INIT_VARIABLE_AGENT_NUMBER) { + // Randomize all agent positions on reset + for (int x = 0; x < env->active_agent_count; x++) { + int agent_idx = env->active_agent_indices[x]; + randomize_agent_position(env, agent_idx); + } + } else { + set_start_position(env); + } reset_goal_positions(env); for (int x = 0; x < env->active_agent_count; x++) { env->logs[x] = (Log){0}; From 4c9bcdb51c5c44d26fe3ff7a682e09d9c19c71ff Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 17 Mar 2026 00:03:19 +0000 Subject: [PATCH 44/46] Fix: sample new goals after randomizing agent positions After moving an agent to a new random position, must also sample a new goal relative to that position. Previously reset_goal_positions would restore the original init goal, which could be far from the new spawn. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/drive.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 99ea10f1e6..897ada4502 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -2725,6 +2725,8 @@ void respawn_agent(Drive *env, int agent_idx) { agent->heading_x = cosf(agent->sim_heading); agent->heading_y = sinf(agent->sim_heading); } + // Sample a new goal relative to the new position + sample_new_goal(env, agent_idx); } else { agent->sim_x = agent->log_trajectory_x[0]; agent->sim_y = agent->log_trajectory_y[0]; @@ -3008,10 +3010,15 @@ void c_reset(Drive *env) { int agent_idx = env->active_agent_indices[x]; randomize_agent_position(env, agent_idx); } + // Sample new goals relative to new positions + for (int x = 0; x < env->active_agent_count; x++) { + int agent_idx = env->active_agent_indices[x]; + sample_new_goal(env, agent_idx); + } } else { set_start_position(env); + reset_goal_positions(env); } - reset_goal_positions(env); for (int x = 0; x < env->active_agent_count; x++) { env->logs[x] = (Log){0}; int agent_idx = env->active_agent_indices[x]; From 66f9ba38bed9d536b1fbdcabeda5c431178f3034 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 17 Mar 2026 00:14:52 +0000 Subject: [PATCH 45/46] Fix: don't overwrite sampled goals with stale init_goal in variable agent mode c_reset's GOAL_GENERATE_NEW block was restoring init_goal_x/y/z after sample_new_goal had already set fresh goals relative to the new position. 
Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/drive.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 897ada4502..0f9470e20d 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -3048,7 +3048,7 @@ void c_reset(Drive *env) { agent->prev_goal_z = agent->sim_z; generate_reward_coefs(env, agent); - if (env->goal_behavior == GOAL_GENERATE_NEW) { + if (env->goal_behavior == GOAL_GENERATE_NEW && env->init_mode != INIT_VARIABLE_AGENT_NUMBER) { agent->goal_position_x = agent->init_goal_x; agent->goal_position_y = agent->init_goal_y; agent->goal_position_z = agent->init_goal_z; From 7fcebc2bf527306f77855d4b184203a4d875ceda Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 17 Mar 2026 00:54:14 +0000 Subject: [PATCH 46/46] Fix collision check and velocity restoration in respawn - Use proper OBB collision check (check_spawn_collision) instead of rough distance approximation in randomize_agent_position - Restore original log_velocity on respawn for non-variable-agent modes instead of zeroing it (preserves data-driven replay behavior) Co-Authored-By: Claude Opus 4.6 --- pufferlib/ocean/drive/drive.h | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 0f9470e20d..373c32a583 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -2673,22 +2673,12 @@ static bool randomize_agent_position(Drive *env, int agent_idx) { get_random_point_on_lane(lane, &spawn_x, &spawn_y, &spawn_z, &spawn_heading); spawn_z += agent->sim_height / 2.0f; - // Check collision with all other active agents (excluding this one) - bool collision = false; - for (int j = 0; j < env->active_agent_count; j++) { - int other_idx = env->active_agent_indices[j]; - if (other_idx == agent_idx) continue; - Agent *other = &env->agents[other_idx]; - if 
(other->sim_x == INVALID_POSITION || other->removed) continue; - float dx = spawn_x - other->sim_x; - float dy = spawn_y - other->sim_y; - float dist = sqrtf(dx * dx + dy * dy); - float min_dist = (agent->sim_length + other->sim_length) / 2.0f; - if (dist < min_dist) { - collision = true; - break; - } - } + // Temporarily invalidate this agent so check_spawn_collision skips it + float saved_x = agent->sim_x; + agent->sim_x = INVALID_POSITION; + bool collision = check_spawn_collision(env, env->active_agent_count, spawn_x, spawn_y, spawn_z, + spawn_heading, agent->sim_length, agent->sim_width, agent->sim_height); + agent->sim_x = saved_x; if (collision) continue; // Check offroad @@ -2727,6 +2717,10 @@ void respawn_agent(Drive *env, int agent_idx) { } // Sample a new goal relative to the new position sample_new_goal(env, agent_idx); + agent->sim_vx = 0.0f; + agent->sim_vy = 0.0f; + agent->sim_speed = 0.0f; + agent->sim_speed_signed = 0.0f; } else { agent->sim_x = agent->log_trajectory_x[0]; agent->sim_y = agent->log_trajectory_y[0]; @@ -2734,12 +2728,9 @@ void respawn_agent(Drive *env, int agent_idx) { agent->sim_heading = agent->log_heading[0]; agent->heading_x = cosf(agent->sim_heading); agent->heading_y = sinf(agent->sim_heading); + agent->sim_vx = agent->log_velocity_x[0]; + agent->sim_vy = agent->log_velocity_y[0]; } - - agent->sim_vx = 0.0f; - agent->sim_vy = 0.0f; - agent->sim_speed = 0.0f; - agent->sim_speed_signed = 0.0f; agent->metrics_array[COLLISION_IDX] = 0.0f; agent->metrics_array[OFFROAD_IDX] = 0.0f; agent->metrics_array[REACHED_GOAL_IDX] = 0.0f;