Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4

# Local TODO tracking
TODO.md
*.mp4
60 changes: 32 additions & 28 deletions evaluate_human_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def main():
print("Beginning human evaluations using HumanReplayEvaluator")
parser = argparse.ArgumentParser()
parser.add_argument("--policy-path", type=str, required=True)
parser.add_argument("--policy-architecture", type=str, default="Recurrent")
parser.add_argument("--rnn-name", type=str, default="Recurrent")
parser.add_argument("--num-maps", type=int, default=10)
parser.add_argument("--num-rollouts", type=int, default=100)
parser.add_argument("--num-agents", type=int, default=64)
Expand All @@ -119,7 +119,7 @@ def main():

print(f"Evaluation Configuration:")
print(f" Policy: {args_parsed.policy_path}")
print(f" Policy Architecture: {args_parsed.policy_architecture}")
print(f" RNN Name: {args_parsed.rnn_name}")
print(f" Num maps: {args_parsed.num_maps}")
print(f" Total rollouts: {args_parsed.num_rollouts}")
print(f" Num agents per env: {args_parsed.num_agents}")
Expand All @@ -133,14 +133,13 @@ def main():
make_env = env_creator(env_name)

scenario_length = 91
context_length = args_parsed.k_scenarios * scenario_length
horizon = args_parsed.k_scenarios * scenario_length

args = {
"rnn_name": args_parsed.rnn_name,
"train": {
"device": args_parsed.device,
"use_rnn": args_parsed.policy_architecture == "Recurrent",
"policy_architecture": args_parsed.policy_architecture,
"context_window": context_length,
"horizon": horizon,
},
"env": {
"num_agents": args_parsed.num_agents,
Expand Down Expand Up @@ -176,10 +175,10 @@ def main():
print("Loading policy...")
temp_env = make_env(**args["env"])

if args_parsed.policy_architecture == "Recurrent":
base_policy = Drive(temp_env, input_size=64, hidden_size=256)
if args_parsed.rnn_name == "Recurrent":
base_policy = Drive(temp_env, input_size=128, hidden_size=256)
policy = Recurrent(temp_env, base_policy, input_size=256, hidden_size=256).to(args_parsed.device)
elif args_parsed.policy_architecture == "Transformer":
elif args_parsed.rnn_name == "Transformer":
base_policy = Drive(temp_env, input_size=128, hidden_size=256)
policy = Transformer(
temp_env,
Expand All @@ -188,7 +187,7 @@ def main():
hidden_size=256,
num_layers=2,
num_heads=4,
context_length=context_length,
horizon=horizon,
).to(args_parsed.device)

state_dict = torch.load(args_parsed.policy_path, map_location=args_parsed.device)
Expand Down Expand Up @@ -242,6 +241,13 @@ def main():
values = [r.get(key, 0) for r in all_results]
aggregated[key] = float(np.mean(values))

# Aggregate scenario-specific metrics (scenario_0_*, scenario_1_*, etc.)
scenario_keys = [k for k in all_keys if k.startswith("scenario_")]
for key in scenario_keys:
values = [r.get(key, 0) for r in all_results]
aggregated[key] = float(np.mean(values))

if delta_keys:
# Derive last scenario metrics from first + delta
# Extract first scenario metrics
first_scenario_keys = [k for k in metric_keys if k not in ["n"]]
Expand Down Expand Up @@ -272,24 +278,22 @@ def main():
json.dump(aggregated, f, indent=2)

# Print results
if args_parsed.adaptive_driving_agent and delta_keys:
print(f"\n0-Shot Performance (First Scenario):")
print(f" Score: {aggregated.get('first_scenario_score', float('nan')):.3f}")
print(f" Collision: {aggregated.get('first_scenario_collision_rate', float('nan')):.3f}")
print(f" Offroad: {aggregated.get('first_scenario_offroad_rate', float('nan')):.3f}")
print(f" Return: {aggregated.get('first_scenario_episode_return', float('nan')):.2f}")

print(f"\nAdapted Performance (Last Scenario):")
print(f" Score: {aggregated.get('last_scenario_score', float('nan')):.3f}")
print(f" Collision: {aggregated.get('last_scenario_collision_rate', float('nan')):.3f}")
print(f" Offroad: {aggregated.get('last_scenario_offroad_rate', float('nan')):.3f}")
print(f" Return: {aggregated.get('last_scenario_episode_return', float('nan')):.2f}")

print(f"\nAdaptive Metrics (Delta):")
print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}")
print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}")
print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}")
print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}")
if args_parsed.adaptive_driving_agent:
# Print per-scenario metrics
for i in range(args_parsed.k_scenarios):
label = "0-Shot" if i == 0 else f"Scenario {i}"
print(f"\n{label} Performance:")
print(f" Score: {aggregated.get(f'scenario_{i}_score', float('nan')):.3f}")
print(f" Collision: {aggregated.get(f'scenario_{i}_collision_rate', float('nan')):.3f}")
print(f" Offroad: {aggregated.get(f'scenario_{i}_offroad_rate', float('nan')):.3f}")
print(f" Return: {aggregated.get(f'scenario_{i}_episode_return', float('nan')):.2f}")

if delta_keys:
print(f"\nAdaptive Metrics (Delta: Last - First):")
print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}")
print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}")
print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}")
print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}")

print(f"\nSaved to {args_parsed.output}")
import sys
Expand Down
4 changes: 2 additions & 2 deletions pufferlib/config/default.ini
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ minibatch_size = 8192

# Accumulate gradients above this size
max_minibatch_size = 32768
bptt_horizon = 64
horizon = 64
compile = False
compile_mode = max-autotune-no-cudagraphs
compile_fullgraph = True
Expand Down Expand Up @@ -81,7 +81,7 @@ downsample = 10
; mean = 1e8
; scale = time

; [sweep.train.bptt_horizon]
; [sweep.train.horizon]
; distribution = uniform_pow2
; min = 16
; max = 64
Expand Down
48 changes: 29 additions & 19 deletions pufferlib/config/ocean/adaptive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@
package = ocean
env_name = puffer_adaptive_drive
policy_name = Drive
transformer_name = Transformer
; Changed from rnn_name
policy_architecture = Transformer

[vec]
num_workers = 16
num_envs = 16
batch_size = 2
batch_size = 1
; backend = Serial

[policy]
input_size = 128
; Increased from 64 for richer representations
hidden_size = 256

[rnn]
input_size = 256
hidden_size = 256

[transformer]
Expand All @@ -23,14 +25,13 @@ num_layers = 2
; Number of transformer layers
num_heads = 4
; Number of attention heads (must divide hidden_size)
; context_length = 182
; k_scenarios (2) * scenario_length (91) = maximum attention span
; Transformer uses `horizon` from [train] section for attention span
dropout = 0.0
; Dropout (keep at 0 for RL stability initially)

[env]
num_agents = 1512
num_ego_agents = 756
num_agents = 1024
num_ego_agents = 512
; Options: discrete, continuous
action_type = discrete
; Options: classic, jerk
Expand Down Expand Up @@ -60,15 +61,15 @@ k_scenarios = 2
termination_mode = 1
; 0 - terminate at episode_length, 1 - terminate after all agents have been reset
map_dir = "resources/drive/binaries/training"
num_maps = 1000
num_maps = 10000
; Determines which step of the trajectory to initialize the agents at upon reset
init_steps = 0
; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only"
control_mode = "control_vehicles"
; Options: "created_all_valid", "create_only_controlled"
init_mode = "create_all_valid"
; train with co players
co_player_enabled = False
co_player_enabled = True


[env.conditioning]
Expand All @@ -87,15 +88,24 @@ discount_weight_ub = 0.98

[env.co_player_policy]
policy_name = Drive
rnn_name = Recurrent
; Options: "Recurrent", "Transformer"
architecture = Recurrent
policy_path = "pufferlib/resources/drive/policies/varied_discount.pt"
input_size = 64
input_size = 128
hidden_size = 256

[env.co_player_policy.rnn]
input_size = 256
hidden_size = 256

[env.co_player_policy.transformer]
input_size = 256
hidden_size = 256
num_layers = 2
num_heads = 4
horizon = 91
dropout = 0.0

[env.co_player_policy.conditioning]
; Options: "none", "reward", "entropy", "discount", "all"
type = "all"
Expand All @@ -114,24 +124,22 @@ discount_weight_ub = 0.98
seed=42
total_timesteps = 2_000_000_000
anneal_lr = True
; Needs to be: num_agents * num_workers * context_window
; Needs to be: num_agents * num_workers * horizon
batch_size = auto
minibatch_size = 36400
; 400 * 91
; 200 * 182 (must be divisible by horizon = k_scenarios * scenario_length)
max_minibatch_size = 36400
minibatch_multiplier = 400
policy_architecture = Transformer
; Matches scenario_length for buffer organization
bptt_horizon = 32
; Keep for backward compatibility
; Sequence length - overridden to k_scenarios * scenario_length for adaptive
horizon = 91
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_eps = 1e-8
clip_coef = 0.2
ent_coef = 0.005
gae_lambda = 0.95
gamma = 0.98
learning_rate = 0.0003
learning_rate = 0.003
; Increased from 0.0003 (NOTE: stale comment previously claimed a reduction)
max_grad_norm = 1.0
prio_alpha = 0.85
Expand Down Expand Up @@ -193,6 +201,8 @@ human_replay_num_agents = 32
human_replay_num_rollouts = 100
; Number of maps to use for human replay evaluation
human_replay_num_maps = 100
; Number of maps to render for human replay (subset of eval maps)
human_replay_render_num_maps = 2

[sweep.train.learning_rate]
distribution = log_normal
Expand Down
59 changes: 32 additions & 27 deletions pufferlib/config/ocean/drive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
package = ocean
env_name = puffer_drive
policy_name = Drive
rnn_name = Transformer
policy_architecture = Transformer

[vec]
num_workers = 16
Expand All @@ -11,12 +11,12 @@ batch_size = 2
; backend = Serial

[policy]
input_size = 64
input_size = 128
hidden_size = 256

; [rnn]
; input_size = 256
; hidden_size = 256
[rnn]
input_size = 256
hidden_size = 256

[transformer]
input_size = 256
Expand All @@ -25,13 +25,12 @@ num_layers = 2
; Number of transformer layers
num_heads = 4
; Number of attention heads (must divide hidden_size)
context_window = 32
; k_scenarios (2) * scenario_length (91) = maximum attention span
dropout = 0.0
; Dropout (keep at 0 for RL stability initially)

[env]
num_agents = 512
num_agents = 1024
num_ego_agents = 512
; Options: discrete, continuous
action_type = discrete
Expand All @@ -47,7 +46,7 @@ goal_radius = 2.0
; Max target speed in m/s for the agent to maintain towards the goal
goal_speed = 100.0
; What to do when the goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop"
goal_behavior = 1
goal_behavior = 0
; Determines the target distance to the new goal in the case of goal_behavior = generate_new_goals.
; Large numbers will select a goal point further away from the agent's current position.
goal_target_distance = 30.0
Expand Down Expand Up @@ -112,16 +111,15 @@ discount_weight_ub = 0.80
[train]
seed=42
total_timesteps = 2_000_000_000
# learning_rate = 0.02
# gamma = 0.985
anneal_lr = True
; Needs to be: num_agents * num_workers * BPTT horizon
; Needs to be: num_agents * num_workers * horizon
batch_size = auto
minibatch_size = 32768
max_minibatch_size = 32768
; minibatch_size = 256
; max_minibatch_size = 256
bptt_horizon = 32
minibatch_size = 32760
; 360 * 91
max_minibatch_size = 32760
minibatch_multiplier = 360
; Sequence length for training - matches scenario_length for full episode context
horizon = 91
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_eps = 1e-8
Expand All @@ -130,17 +128,16 @@ ent_coef = 0.005
gae_lambda = 0.95
gamma = 0.98
learning_rate = 0.003
max_grad_norm = 1
prio_alpha = 0.8499999999999999
prio_beta0 = 0.8499999999999999
max_grad_norm = 1.0
prio_alpha = 0.85
prio_beta0 = 0.85
update_epochs = 1
vf_clip_coef = 0.1999999999999999
vf_coef = 2
vf_clip_coef = 0.2
vf_coef = 2.0
vtrace_c_clip = 1
vtrace_rho_clip = 1
checkpoint_interval = 100
use_transformer = True
context_window = 32
context_length = 32
# Rendering options
render = True
render_interval = 100
Expand All @@ -158,7 +155,7 @@ zoom_in = True
render_map = none

[eval]
eval_interval = 1000
eval_interval = 100
; Path to dataset used for evaluation
map_dir = "resources/drive/binaries/training"
; Evaluation will run on the first num_maps maps in the map_dir directory
Expand All @@ -183,9 +180,17 @@ wosac_sanity_check = False
; Only return aggregate results across all scenes
wosac_aggregate_results = True
; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
human_replay_eval = False
; Control only the self-driving car
human_replay_control_mode = "control_sdc_only"
human_replay_eval = True
; Control mode for human replay (control_vehicles with max_controlled_agents=1 controls one agent)
human_replay_control_mode = "control_vehicles"
; Number of agents in human replay evaluation environment
human_replay_num_agents = 32
; Number of rollouts for human replay evaluation
human_replay_num_rollouts = 100
; Number of maps to use for human replay evaluation
human_replay_num_maps = 100
; Number of maps to render for human replay (subset of eval maps)
human_replay_render_num_maps = 2

[sweep.train.learning_rate]
distribution = log_normal
Expand Down
Loading
Loading