diff --git a/.gitignore b/.gitignore index 66e4b066be..71497ea416 100644 --- a/.gitignore +++ b/.gitignore @@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4 # Local TODO tracking TODO.md +*.mp4 diff --git a/evaluate_human_logs.py b/evaluate_human_logs.py index a91e8bd2ce..fb743398c3 100644 --- a/evaluate_human_logs.py +++ b/evaluate_human_logs.py @@ -104,7 +104,7 @@ def main(): print("Beginning human evaluations using HumanReplayEvaluator") parser = argparse.ArgumentParser() parser.add_argument("--policy-path", type=str, required=True) - parser.add_argument("--policy-architecture", type=str, default="Recurrent") + parser.add_argument("--rnn-name", type=str, default="Recurrent") parser.add_argument("--num-maps", type=int, default=10) parser.add_argument("--num-rollouts", type=int, default=100) parser.add_argument("--num-agents", type=int, default=64) @@ -119,7 +119,7 @@ def main(): print(f"Evaluation Configuration:") print(f" Policy: {args_parsed.policy_path}") - print(f" Policy Architecture: {args_parsed.policy_architecture}") + print(f" RNN Name: {args_parsed.rnn_name}") print(f" Num maps: {args_parsed.num_maps}") print(f" Total rollouts: {args_parsed.num_rollouts}") print(f" Num agents per env: {args_parsed.num_agents}") @@ -133,14 +133,13 @@ def main(): make_env = env_creator(env_name) scenario_length = 91 - context_length = args_parsed.k_scenarios * scenario_length + horizon = args_parsed.k_scenarios * scenario_length args = { + "rnn_name": args_parsed.rnn_name, "train": { "device": args_parsed.device, - "use_rnn": args_parsed.policy_architecture == "Recurrent", - "policy_architecture": args_parsed.policy_architecture, - "context_window": context_length, + "horizon": horizon, }, "env": { "num_agents": args_parsed.num_agents, @@ -176,10 +175,10 @@ def main(): print("Loading policy...") temp_env = make_env(**args["env"]) - if args_parsed.policy_architecture == "Recurrent": - base_policy = Drive(temp_env, input_size=64, hidden_size=256) + if 
args_parsed.rnn_name == "Recurrent": + base_policy = Drive(temp_env, input_size=128, hidden_size=256) policy = Recurrent(temp_env, base_policy, input_size=256, hidden_size=256).to(args_parsed.device) - elif args_parsed.policy_architecture == "Transformer": + elif args_parsed.rnn_name == "Transformer": base_policy = Drive(temp_env, input_size=128, hidden_size=256) policy = Transformer( temp_env, @@ -188,7 +187,7 @@ def main(): hidden_size=256, num_layers=2, num_heads=4, - context_length=context_length, + horizon=horizon, ).to(args_parsed.device) state_dict = torch.load(args_parsed.policy_path, map_location=args_parsed.device) @@ -242,6 +241,13 @@ def main(): values = [r.get(key, 0) for r in all_results] aggregated[key] = float(np.mean(values)) + # Aggregate scenario-specific metrics (scenario_0_*, scenario_1_*, etc.) + scenario_keys = [k for k in all_keys if k.startswith("scenario_")] + for key in scenario_keys: + values = [r.get(key, 0) for r in all_results] + aggregated[key] = float(np.mean(values)) + + if delta_keys: # Derive last scenario metrics from first + delta # Extract first scenario metrics first_scenario_keys = [k for k in metric_keys if k not in ["n"]] @@ -272,24 +278,22 @@ def main(): json.dump(aggregated, f, indent=2) # Print results - if args_parsed.adaptive_driving_agent and delta_keys: - print(f"\n0-Shot Performance (First Scenario):") - print(f" Score: {aggregated.get('first_scenario_score', float('nan')):.3f}") - print(f" Collision: {aggregated.get('first_scenario_collision_rate', float('nan')):.3f}") - print(f" Offroad: {aggregated.get('first_scenario_offroad_rate', float('nan')):.3f}") - print(f" Return: {aggregated.get('first_scenario_episode_return', float('nan')):.2f}") - - print(f"\nAdapted Performance (Last Scenario):") - print(f" Score: {aggregated.get('last_scenario_score', float('nan')):.3f}") - print(f" Collision: {aggregated.get('last_scenario_collision_rate', float('nan')):.3f}") - print(f" Offroad: 
{aggregated.get('last_scenario_offroad_rate', float('nan')):.3f}") - print(f" Return: {aggregated.get('last_scenario_episode_return', float('nan')):.2f}") - - print(f"\nAdaptive Metrics (Delta):") - print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}") - print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}") - print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}") - print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}") + if args_parsed.adaptive_driving_agent: + # Print per-scenario metrics + for i in range(args_parsed.k_scenarios): + label = "0-Shot" if i == 0 else f"Scenario {i}" + print(f"\n{label} Performance:") + print(f" Score: {aggregated.get(f'scenario_{i}_score', float('nan')):.3f}") + print(f" Collision: {aggregated.get(f'scenario_{i}_collision_rate', float('nan')):.3f}") + print(f" Offroad: {aggregated.get(f'scenario_{i}_offroad_rate', float('nan')):.3f}") + print(f" Return: {aggregated.get(f'scenario_{i}_episode_return', float('nan')):.2f}") + + if delta_keys: + print(f"\nAdaptive Metrics (Delta: Last - First):") + print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}") + print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}") + print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}") + print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}") print(f"\nSaved to {args_parsed.output}") import sys diff --git a/pufferlib/config/default.ini b/pufferlib/config/default.ini index 810de828c0..248eb8d3b3 100644 --- a/pufferlib/config/default.ini +++ b/pufferlib/config/default.ini @@ -49,7 +49,7 @@ minibatch_size = 8192 # Accumulate gradients above this size max_minibatch_size = 32768 -bptt_horizon = 64 +horizon = 64 compile = False compile_mode = max-autotune-no-cudagraphs compile_fullgraph = True @@ -81,7 +81,7 @@ downsample = 10 ; 
mean = 1e8 ; scale = time -; [sweep.train.bptt_horizon] +; [sweep.train.horizon] ; distribution = uniform_pow2 ; min = 16 ; max = 64 diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 598faf50ff..f7ab1b2b0c 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -2,18 +2,20 @@ package = ocean env_name = puffer_adaptive_drive policy_name = Drive -transformer_name = Transformer - ; Changed from rnn_name +policy_architecture = Transformer [vec] num_workers = 16 num_envs = 16 -batch_size = 2 +batch_size = 1 ; backend = Serial [policy] input_size = 128 -; Increased from 64 for richer representations +hidden_size = 256 + +[rnn] +input_size = 256 hidden_size = 256 [transformer] @@ -23,14 +25,13 @@ num_layers = 2 ; Number of transformer layers num_heads = 4 ; Number of attention heads (must divide hidden_size) -; context_length = 182 -; k_scenarios (2) * scenario_length (91) = maximum attention span +; Transformer uses `horizon` from [train] section for attention span dropout = 0.0 ; Dropout (keep at 0 for RL stability initially) [env] -num_agents = 1512 -num_ego_agents = 756 +num_agents = 1024 +num_ego_agents = 512 ; Options: discrete, continuous action_type = discrete ; Options: classic, jerk @@ -60,7 +61,7 @@ k_scenarios = 2 termination_mode = 1 ; 0 - terminate at episode_length, 1 - terminate after all agents have been reset map_dir = "resources/drive/binaries/training" -num_maps = 1000 +num_maps = 10000 ; Determines which step of the trajectory to initialize the agents at upon reset init_steps = 0 ; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only" @@ -68,7 +69,7 @@ control_mode = "control_vehicles" ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" ; train with co players -co_player_enabled = False +co_player_enabled = True [env.conditioning] @@ -87,15 +88,24 @@ discount_weight_ub = 0.98 [env.co_player_policy] 
policy_name = Drive -rnn_name = Recurrent +; Options: "Recurrent", "Transformer" +architecture = Recurrent policy_path = "pufferlib/resources/drive/policies/varied_discount.pt" -input_size = 64 +input_size = 128 hidden_size = 256 [env.co_player_policy.rnn] input_size = 256 hidden_size = 256 +[env.co_player_policy.transformer] +input_size = 256 +hidden_size = 256 +num_layers = 2 +num_heads = 4 +horizon = 91 +dropout = 0.0 + [env.co_player_policy.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" type = "all" @@ -114,16 +124,14 @@ discount_weight_ub = 0.98 seed=42 total_timesteps = 2_000_000_000 anneal_lr = True -; Needs to be: num_agents * num_workers * context_window +; Needs to be: num_agents * num_workers * horizon batch_size = auto minibatch_size = 36400 -; 400 * 91 +; 200 * 182 (must be divisible by horizon = k_scenarios * scenario_length) max_minibatch_size = 36400 minibatch_multiplier = 400 -policy_architecture = Transformer -; Matches scenario_length for buffer organization -bptt_horizon = 32 -; Keep for backward compatibility +; Sequence length - overridden to k_scenarios * scenario_length for adaptive +horizon = 91 adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 @@ -131,7 +139,7 @@ clip_coef = 0.2 ent_coef = 0.005 gae_lambda = 0.95 gamma = 0.98 -learning_rate = 0.0003 -; Reduced from 0.003 (transformers often need lower LR) +learning_rate = 0.003 +; Increased from 0.0003 max_grad_norm = 1.0 prio_alpha = 0.85 @@ -193,6 +201,8 @@ human_replay_num_agents = 32 human_replay_num_rollouts = 100 ; Number of maps to use for human replay evaluation human_replay_num_maps = 100 +; Number of maps to render for human replay (subset of eval maps) +human_replay_render_num_maps = 2 [sweep.train.learning_rate] distribution = log_normal diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 7d5efd43be..af2dfef40c 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -2,7 +2,7 @@ package = ocean env_name =
puffer_drive policy_name = Drive -rnn_name = Transformer +policy_architecture = Transformer [vec] num_workers = 16 @@ -11,12 +11,12 @@ batch_size = 2 ; backend = Serial [policy] -input_size = 64 +input_size = 128 hidden_size = 256 -; [rnn] -; input_size = 256 -; hidden_size = 256 +[rnn] +input_size = 256 +hidden_size = 256 [transformer] input_size = 256 @@ -25,13 +25,12 @@ num_layers = 2 ; Number of transformer layers num_heads = 4 ; Number of attention heads (must divide hidden_size) -context_window = 32 ; k_scenarios (2) * scenario_length (91) = maximum attention span dropout = 0.0 ; Dropout (keep at 0 for RL stability initially) [env] -num_agents = 512 +num_agents = 1024 num_ego_agents = 512 ; Options: discrete, continuous action_type = discrete @@ -47,7 +46,7 @@ goal_radius = 2.0 ; Max target speed in m/s for the agent to maintain towards the goal goal_speed = 100.0 ; What to do when the goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop" -goal_behavior = 1 +goal_behavior = 0 ; Determines the target distance to the new goal in the case of goal_behavior = generate_new_goals. ; Large numbers will select a goal point further away from the agent's current position. 
goal_target_distance = 30.0 @@ -112,16 +111,15 @@ discount_weight_ub = 0.80 [train] seed=42 total_timesteps = 2_000_000_000 -# learning_rate = 0.02 -# gamma = 0.985 anneal_lr = True -; Needs to be: num_agents * num_workers * BPTT horizon +; Needs to be: num_agents * num_workers * horizon batch_size = auto -minibatch_size = 32768 -max_minibatch_size = 32768 -; minibatch_size = 256 -; max_minibatch_size = 256 -bptt_horizon = 32 +minibatch_size = 32760 +; 360 * 91 +max_minibatch_size = 32760 +minibatch_multiplier = 360 +; Sequence length for training - matches scenario_length for full episode context +horizon = 91 adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 @@ -130,17 +128,16 @@ ent_coef = 0.005 gae_lambda = 0.95 gamma = 0.98 learning_rate = 0.003 -max_grad_norm = 1 -prio_alpha = 0.8499999999999999 -prio_beta0 = 0.8499999999999999 +max_grad_norm = 1.0 +prio_alpha = 0.85 +prio_beta0 = 0.85 update_epochs = 1 -vf_clip_coef = 0.1999999999999999 -vf_coef = 2 +vf_clip_coef = 0.2 +vf_coef = 2.0 vtrace_c_clip = 1 vtrace_rho_clip = 1 checkpoint_interval = 100 -use_transformer = True -context_window = 32 +context_length = 32 # Rendering options render = True render_interval = 100 @@ -158,7 +155,7 @@ zoom_in = True render_map = none [eval] -eval_interval = 1000 +eval_interval = 100 ; Path to dataset used for evaluation map_dir = "resources/drive/binaries/training" ; Evaluation will run on the first num_maps maps in the map_dir directory @@ -183,9 +180,17 @@ wosac_sanity_check = False ; Only return aggregate results across all scenes wosac_aggregate_results = True ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) -human_replay_eval = False -; Control only the self-driving car -human_replay_control_mode = "control_sdc_only" +human_replay_eval = True +; Control mode for human replay (control_vehicles with max_controlled_agents=1 controls one agent) +human_replay_control_mode = "control_vehicles" +; Number of agents in human replay 
evaluation environment +human_replay_num_agents = 32 +; Number of rollouts for human replay evaluation +human_replay_num_rollouts = 100 +; Number of maps to use for human replay evaluation +human_replay_num_maps = 100 +; Number of maps to render for human replay (subset of eval maps) +human_replay_render_num_maps = 2 [sweep.train.learning_rate] distribution = log_normal diff --git a/pufferlib/models.py b/pufferlib/models.py index d0b438f9a3..9c6228468b 100644 --- a/pufferlib/models.py +++ b/pufferlib/models.py @@ -208,7 +208,7 @@ def __init__( hidden_size=128, num_layers=4, num_heads=8, - context_length=512, + horizon=512, dropout=0.0, ): """Wraps your policy with a Transformer for temporal modeling. @@ -220,7 +220,7 @@ def __init__( hidden_size: Transformer hidden dimension num_layers: Number of transformer layers num_heads: Number of attention heads - context_length: Maximum sequence length to attend over + horizon: Maximum sequence length to attend over dropout: Dropout probability """ super().__init__() @@ -228,7 +228,7 @@ def __init__( self.policy = policy self.input_size = input_size self.hidden_size = hidden_size - self.context_length = context_length + self.horizon = horizon self.num_layers = num_layers self.num_heads = num_heads self.is_continuous = self.policy.is_continuous @@ -240,7 +240,7 @@ def __init__( self.input_projection = nn.Identity() # Learnable positional embeddings - self.positional_embedding = nn.Parameter(torch.zeros(1, context_length, hidden_size)) + self.positional_embedding = nn.Parameter(torch.zeros(1, horizon, hidden_size)) nn.init.normal_(self.positional_embedding, std=0.02) # Transformer encoder @@ -307,33 +307,29 @@ def forward_eval(self, observations, state): hidden = self.input_projection(hidden) if "transformer_context" not in state or state["transformer_context"] is None: - context = torch.zeros(B, self.context_length, self.hidden_size, device=device) + context = torch.zeros(B, self.horizon, self.hidden_size, device=device) pos 
= torch.zeros(1, dtype=torch.long, device=device) else: context = state["transformer_context"] pos = state.get("transformer_position", torch.zeros(1, dtype=torch.long, device=device)) - if ( - context.shape[-1] != self.hidden_size - or context.shape[0] != B - or context.shape[1] != self.context_length - ): - context = torch.zeros(B, self.context_length, self.hidden_size, device=device) + if context.shape[-1] != self.hidden_size or context.shape[0] != B or context.shape[1] != self.horizon: + context = torch.zeros(B, self.horizon, self.hidden_size, device=device) pos = torch.zeros(1, dtype=torch.long, device=device) - write_idx = (pos % self.context_length).long() + write_idx = (pos % self.horizon).long() context[:, write_idx, :] = hidden.unsqueeze(1) pos = pos + 1 - pos_embed = self.positional_embedding[:, : self.context_length] + pos_embed = self.positional_embedding[:, : self.horizon] context_with_pos = context + pos_embed - causal_mask = self.get_causal_mask(self.context_length, device) + causal_mask = self.get_causal_mask(self.horizon, device) output = self.transformer(context_with_pos, mask=causal_mask, is_causal=True) output = self.output_norm(output) - read_idx = ((pos - 1) % self.context_length).long() + read_idx = ((pos - 1) % self.horizon).long() hidden_out = output[:, read_idx, :].squeeze(1) state["transformer_context"] = context @@ -361,7 +357,7 @@ def forward(self, observations, state): hidden = self.input_projection(hidden) # Remove dynamic truncation - use clamp instead of if - T_actual = min(T, self.context_length) # Python int, fine + T_actual = min(T, self.horizon) # Python int, fine if T_actual < T: hidden = hidden[:, -T_actual:] T = T_actual @@ -390,7 +386,7 @@ def forward(self, observations, state): values = values.view(B, T) # Use Python int for context_len - no sync - context_len = min(T, self.context_length) + context_len = min(T, self.horizon) state["hidden"] = hidden state["transformer_context"] = hidden[:, -context_len:].detach() 
state["transformer_position"] = torch.full((B,), context_len - 1, dtype=torch.long, device=device) diff --git a/pufferlib/ocean/benchmark/evaluator.py b/pufferlib/ocean/benchmark/evaluator.py index dbf84c1906..03a36eabe8 100644 --- a/pufferlib/ocean/benchmark/evaluator.py +++ b/pufferlib/ocean/benchmark/evaluator.py @@ -638,7 +638,7 @@ def rollout(self, args, puffer_env, policy): the policy is with (static) human partners. Args: - args: Config dict with train settings (device, use_rnn, policy_architecture, etc.) + args: Config dict with train settings (device, rnn_name, etc.) puffer_env: PufferLib environment wrapper policy: Trained policy to evaluate @@ -654,24 +654,27 @@ def rollout(self, args, puffer_env, policy): obs, info = puffer_env.reset() - policy_architecture = args["train"].get("policy_architecture", "Recurrent") k_scenarios = args["env"].get("k_scenarios", 1) - if policy_architecture == "Recurrent": + # Detect architecture from policy object + is_transformer = hasattr(policy, "horizon") and hasattr(policy, "transformer") + is_recurrent = hasattr(policy, "lstm") + + if is_recurrent: state = dict( lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), ) - elif policy_architecture == "Transformer": - context_length = args["train"].get("context_window", 182) + elif is_transformer: state = dict( - transformer_context=torch.zeros(num_agents, context_length, policy.hidden_size, device=device), + transformer_context=torch.zeros(num_agents, policy.horizon, policy.hidden_size, device=device), transformer_position=torch.zeros(1, dtype=torch.long, device=device), ) else: state = {} collected_infos = [] + scenario_metrics = {} # Store scenario-specific metrics (scenario_0_*, scenario_1_*, etc.) 
delta_metrics = None # Loop through scenarios @@ -690,7 +693,7 @@ def rollout(self, args, puffer_env, policy): obs, rewards, dones, truncs, info_list = puffer_env.step(action_np) # Reset transformer context on mid-scenario terminations (not at scenario boundaries) - if policy_architecture == "Transformer": + if is_transformer: is_last_step = time_idx == self.sim_steps - 1 if not is_last_step: done_mask = dones | truncs @@ -704,6 +707,9 @@ def rollout(self, args, puffer_env, policy): if isinstance(info_dict, dict): if "ada_delta_score" in info_dict: delta_metrics = info_dict + elif any(k.startswith("scenario_") for k in info_dict.keys()): + # Scenario-specific metrics (scenario_0_*, scenario_1_*, etc.) + scenario_metrics.update(info_dict) elif "score" in info_dict: collected_infos.append(info_dict) @@ -715,6 +721,10 @@ def rollout(self, args, puffer_env, policy): values = [info.get(key, 0) for info in collected_infos] aggregated[key] = np.mean(values) + # Merge scenario-specific metrics if they exist + if scenario_metrics: + aggregated.update(scenario_metrics) + # Merge delta metrics if they exist if delta_metrics: aggregated.update(delta_metrics) diff --git a/pufferlib/ocean/drive/drive.c b/pufferlib/ocean/drive/drive.c index 9f6337051c..bae514790c 100644 --- a/pufferlib/ocean/drive/drive.c +++ b/pufferlib/ocean/drive/drive.c @@ -34,20 +34,18 @@ void test_drivenet() { void demo() { // Note: The settings below are hardcoded for demo purposes. Since the policy was - // trained with these exact settings, that changing them may lead to - // weird behavior. + // trained with these exact settings, changing them may lead to weird behavior. 
Drive env = { .human_agent_idx = 0, - .dynamics_model = conf.dynamics_model, - .reward_vehicle_collision = conf.reward_vehicle_collision, - .reward_offroad_collision = conf.reward_offroad_collision, - .reward_ade = conf.reward_ade, - .goal_radius = conf.goal_radius, - .dt = conf.dt, + .dynamics_model = CLASSIC, + .reward_vehicle_collision = -1.0f, + .reward_offroad_collision = -1.0f, + .goal_radius = 2.0f, + .dt = 0.1f, .map_name = "resources/drive/binaries/training/map_000.bin", - .init_steps = conf.init_steps, - .collision_behavior = conf.collision_behavior, - .offroad_behavior = conf.offroad_behavior, + .init_steps = 0, + .collision_behavior = 0, + .offroad_behavior = 0, }; allocate(&env); c_reset(&env); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 009b2d6206..6b44f49398 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -549,10 +549,16 @@ Entity *load_map_binary(const char *filename, Drive *env) { return NULL; // Read sdc_track_index - fread(&env->sdc_track_index, sizeof(int), 1, file); + if (fread(&env->sdc_track_index, sizeof(int), 1, file) != 1) { + fclose(file); + return NULL; + } // Read tracks_to_predict - fread(&env->num_tracks_to_predict, sizeof(int), 1, file); + if (fread(&env->num_tracks_to_predict, sizeof(int), 1, file) != 1) { + fclose(file); + return NULL; + } if (env->num_tracks_to_predict > 0) { env->tracks_to_predict_indices = (int *)malloc(env->num_tracks_to_predict * sizeof(int)); @@ -569,10 +575,22 @@ Entity *load_map_binary(const char *filename, Drive *env) { Entity *entities = (Entity *)malloc(env->num_entities * sizeof(Entity)); for (int i = 0; i < env->num_entities; i++) { // Read base entity data - fread(&entities[i].scenario_id, sizeof(int), 1, file); - fread(&entities[i].type, sizeof(int), 1, file); - fread(&entities[i].id, sizeof(int), 1, file); - fread(&entities[i].array_size, sizeof(int), 1, file); + if (fread(&entities[i].scenario_id, sizeof(int), 1, file) != 1 || + 
fread(&entities[i].type, sizeof(int), 1, file) != 1 || fread(&entities[i].id, sizeof(int), 1, file) != 1 || + fread(&entities[i].array_size, sizeof(int), 1, file) != 1) { + // File truncated - adjust entity count and break + env->num_entities = i; + env->num_objects = (i < env->num_objects) ? i : env->num_objects; + env->num_roads = env->num_entities - env->num_objects; + break; + } + // Validate array_size is reasonable (max 1000 timesteps = 100s at 0.1s dt) + if (entities[i].array_size <= 0 || entities[i].array_size > 1000) { + env->num_entities = i; + env->num_objects = (i < env->num_objects) ? i : env->num_objects; + env->num_roads = env->num_entities - env->num_objects; + break; + } // Allocate arrays based on type int size = entities[i].array_size; entities[i].traj_x = (float *)malloc(size * sizeof(float)); @@ -773,12 +791,14 @@ void init_grid_map(Drive *env) { float x_center = (env->entities[i].traj_x[j] + env->entities[i].traj_x[j + 1]) / 2; float y_center = (env->entities[i].traj_y[j] + env->entities[i].traj_y[j + 1]) / 2; int grid_index = getGridIndex(env, x_center, y_center); - env->grid_map->cell_entities_count[grid_index]++; + if (grid_index != -1) { + env->grid_map->cell_entities_count[grid_index]++; + } } } } - int cell_entities_insert_index[grid_cell_count]; // Helper array for insertion index - memset(cell_entities_insert_index, 0, grid_cell_count * sizeof(int)); + // Use heap allocation instead of VLA to avoid stack overflow on large maps + int *cell_entities_insert_index = (int *)calloc(grid_cell_count, sizeof(int)); // Initialize grid cells for (int grid_index = 0; grid_index < grid_cell_count; grid_index++) { @@ -804,6 +824,7 @@ void init_grid_map(Drive *env) { } } } + free(cell_entities_insert_index); } void init_neighbor_offsets(Drive *env) { diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 420e726b5e..02c1fe626b 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -79,6 +79,9 
@@ def __init__( if episode_length != None: self.scenario_length = episode_length + # Only set episode_length if not already set (adaptive.py sets it before calling super()) + if not hasattr(self, "episode_length"): + self.episode_length = self.scenario_length # Adaptive driving agent setup self.adaptive_driving_agent = int(adaptive_driving_agent) @@ -223,11 +226,11 @@ def __init__( self._action_type_flag = 0 if action_type == "discrete" else 1 - # Check if resources directory exists - binary_path = f"{map_dir}/map_000.bin" + # Check if resources directory exists (check map_001 since some datasets start at 001) + binary_path = f"{map_dir}/map_001.bin" if not os.path.exists(binary_path): raise FileNotFoundError( - f"Required directory {binary_path} not found. Please ensure the Drive maps are downloaded and installed correctly per docs." + f"Required file {binary_path} not found. Please ensure the Drive maps are downloaded and installed correctly per docs." ) # Check maps availability @@ -407,9 +410,11 @@ def _set_env_variables(self): self.agent_offsets, self.map_ids, self.num_envs = my_shared_tuple self.ego_ids = [i for i in range(self.agent_offsets[-1])] if len(self.ego_ids) != self.num_agents: - raise ValueError( - f"mismatch between number of ego agents {len(self.ego_ids)} and number of agents {self.num_agents}" + print( + f"Warning: requested {self.num_agents} agents but maps contain {len(self.ego_ids)} valid agents. 
Adjusting.", + flush=True, ) + self.num_agents = len(self.ego_ids) self.local_co_player_ids = [[] for i in range(self.num_envs)] self.local_ego_ids = [[0] for i in range(self.num_envs)] @@ -593,6 +598,10 @@ def step(self, actions): scenario_log["scenario_id"] = self.current_scenario self.scenario_metrics.append(scenario_log) + # Log metrics for all scenarios with scenario-specific prefixes + prefixed_log = {f"scenario_{self.current_scenario}_{k}": v for k, v in scenario_log.items() if k != "scenario_id"} + info.append(prefixed_log) + if self.current_scenario == self.k_scenarios - 1: delta_metrics = self._compute_delta_metrics() if delta_metrics: @@ -604,6 +613,10 @@ def step(self, actions): self.current_scenario = (self.current_scenario + 1) % self.k_scenarios + # Reset coplayer LSTM state at scenario boundary so coplayer behaves consistently + if self.population_play: + self._reset_co_player_state() + if self.tick > 0 and self.resample_frequency > 0 and self.tick % self.resample_frequency == 0: self.tick = 0 will_resample = 1 @@ -872,7 +885,11 @@ def save_map_binary(map_data, output_file, unique_map_id): elif obj_type == "cyclist": obj_type = 3 f.write(struct.pack("i", obj_type)) # type - f.write(struct.pack("i", obj.get("id", 0))) # id + # Truncate large IDs to fit in int32 range + obj_id = obj.get("id", 0) + if isinstance(obj_id, int) and (obj_id > 2147483647 or obj_id < -2147483648): + obj_id = obj_id % 2147483647 + f.write(struct.pack("i", obj_id)) # id f.write(struct.pack("i", trajectory_length)) # array_size # Write position arrays positions = obj.get("position", []) @@ -952,7 +969,11 @@ def save_map_binary(map_data, output_file, unique_map_id): road_type = 10 # Write base entity data f.write(struct.pack("i", road_type)) # type - f.write(struct.pack("i", road.get("id", 0))) # id + # Truncate large IDs to fit in int32 range + road_id = road.get("id", 0) + if isinstance(road_id, int) and (road_id > 2147483647 or road_id < -2147483648): + road_id = road_id % 
2147483647 + f.write(struct.pack("i", road_id)) # id f.write(struct.pack("i", size)) # array_size # Write position arrays @@ -1084,7 +1105,8 @@ def test_performance(timeout=10, atn_cache=1024, num_agents=1024): if __name__ == "__main__": # test_performance() # Process the train dataset - process_all_maps(data_folder="/data/processed/training") + # process_all_maps(data_folder="/data/processed/training") + process_all_maps(data_folder="/data/nuplan_gpudrive/nuplan") # Process the validation/test dataset # process_all_maps(data_folder="data/processed/validation") # # Process the validation_interactive dataset diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 4820bd9be0..6f484e641d 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -65,11 +65,15 @@ void CloseVideo(VideoRecorder *recorder) { waitpid(recorder->pid, NULL, 0); } -void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int lasers, int trajectories, - int frame_count, float *path, int show_human_logs, int show_grid, int img_width, int img_height, - int zoom_in) { +void renderTopDownView(Drive *env, Client *client, float map_width, float map_height, int obs, int lasers, + int trajectories, int frame_count, float *path, int show_human_logs, int show_grid, + int img_width, int img_height, int zoom_in, int current_scenario, int total_scenarios) { BeginDrawing(); + // Calculate map center + float center_x = (env->grid_map->top_left_x + env->grid_map->bottom_right_x) / 2.0f; + float center_y = (env->grid_map->top_left_y + env->grid_map->bottom_right_y) / 2.0f; + // Top-down orthographic camera Camera3D camera = {0}; @@ -77,10 +81,11 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int camera.position = (Vector3){0.0f, 0.0f, 500.0f}; // above the scene camera.target = (Vector3){0.0f, 0.0f, 0.0f}; // look at origin camera.fovy = map_height; - } else { // Show full map - camera.position = 
(Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 500.0f}; - camera.target = (Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 0.0f}; - camera.fovy = 2 * map_height; + } else { // Show full map - center camera on map + camera.position = (Vector3){center_x, center_y, 500.0f}; + camera.target = (Vector3){center_x, center_y, 0.0f}; + // Use the larger dimension to ensure full map is visible + camera.fovy = (map_height > map_width) ? map_height * 1.1f : map_width * 1.1f; } camera.up = (Vector3){0.0f, -1.0f, 0.0f}; @@ -133,6 +138,12 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int // Draw scene draw_scene(env, client, 1, obs, lasers, show_grid); EndMode3D(); + + // Draw scenario counter overlay (2D text on top of 3D scene) + char scenario_text[64]; + snprintf(scenario_text, sizeof(scenario_text), "Scenario %d / %d", current_scenario, total_scenarios); + DrawText(scenario_text, 20, 20, 30, WHITE); + EndDrawing(); } @@ -189,18 +200,125 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett return 0; } +// Transform observations from ego format to co-player format by inserting conditioning values +// src_obs: Source observations (may include ego conditioning) +// dst_obs: Destination buffer for co-player format (with co-player conditioning) +// num_agents: Number of agents to transform +// ego_base_dim: Base ego features (7 for CLASSIC, 10 for JERK) +// co_use_rc/ec/dc: Co-player conditioning flags (determines which features to insert) +void transform_obs_for_coplayer(float *src_obs, float *dst_obs, int num_agents, int ego_obs_size, int coplayer_obs_size, + int ego_base_dim, int co_use_rc, int co_use_ec, int co_use_dc, float collision_lb, + float collision_ub, float offroad_lb, float offroad_ub, float goal_lb, float goal_ub, + float entropy_lb, float entropy_ub, float discount_lb, float discount_ub) { + // Fixed sizes for partner and road features (from drive.h constants) + 
int partner_features = (MAX_AGENTS - 1) * PARTNER_FEATURES; + int road_features = MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + int partner_road_features = partner_features + road_features; + + // Derive source conditioning size from ego_obs_size + // This handles cases where ego policy has conditioning (type != "none") + int src_conditioning = ego_obs_size - ego_base_dim - partner_road_features; + + // Calculate destination conditioning size from flags + int dst_conditioning = (co_use_rc ? 3 : 0) + (co_use_ec ? 1 : 0) + (co_use_dc ? 1 : 0); + + for (int i = 0; i < num_agents; i++) { + float *src = src_obs + i * ego_obs_size; + float *dst = dst_obs + i * coplayer_obs_size; + + // Copy ego base features (without conditioning) + memcpy(dst, src, ego_base_dim * sizeof(float)); + + // Sample and insert conditioning values based on flags + // Order must match: reward (3), entropy (1), discount (1) + int cond_idx = ego_base_dim; + if (co_use_rc) { + // Reward conditioning (3 features: collision, offroad, goal) + dst[cond_idx++] = collision_lb + (float)rand() / RAND_MAX * (collision_ub - collision_lb); + dst[cond_idx++] = offroad_lb + (float)rand() / RAND_MAX * (offroad_ub - offroad_lb); + dst[cond_idx++] = goal_lb + (float)rand() / RAND_MAX * (goal_ub - goal_lb); + } + if (co_use_ec) { + // Entropy conditioning (1 feature) + dst[cond_idx++] = entropy_lb + (float)rand() / RAND_MAX * (entropy_ub - entropy_lb); + } + if (co_use_dc) { + // Discount conditioning (1 feature) + dst[cond_idx++] = discount_lb + (float)rand() / RAND_MAX * (discount_ub - discount_lb); + } + + // Copy partner + road features, skipping over any source conditioning + memcpy(dst + ego_base_dim + dst_conditioning, src + ego_base_dim + src_conditioning, + partner_road_features * sizeof(float)); + } +} + +// Helper function for dual-policy forward pass +// Runs ego policy on first num_ego_agents, co-player policy on the rest +// Handles different observation sizes between ego and co-player policies 
+void forward_population(DriveNet *ego_net, DriveNet *co_player_net, float *observations, int *actions, + int num_ego_agents, int num_co_players, int ego_obs_size, int coplayer_obs_size, + int ego_base_dim, int co_use_rc, int co_use_ec, int co_use_dc, float co_collision_lb, + float co_collision_ub, float co_offroad_lb, float co_offroad_ub, float co_goal_lb, + float co_goal_ub, float co_entropy_lb, float co_entropy_ub, float co_discount_lb, + float co_discount_ub) { + if (co_player_net == NULL || num_co_players == 0) { + // Single policy mode - use ego net for all agents + forward(ego_net, observations, actions); + return; + } + + // Allocate temporary buffers for ego observations/actions + float *ego_obs = (float *)malloc(num_ego_agents * ego_obs_size * sizeof(float)); + int *ego_actions = (int *)malloc(num_ego_agents * sizeof(int)); + + // Allocate temporary buffers for co-player observations/actions + float *co_obs_raw = observations + num_ego_agents * ego_obs_size; + float *co_obs_transformed = (float *)malloc(num_co_players * coplayer_obs_size * sizeof(float)); + int *co_actions = (int *)malloc(num_co_players * sizeof(int)); + + // Copy ego observations (already correct format) + memcpy(ego_obs, observations, num_ego_agents * ego_obs_size * sizeof(float)); + + // Transform co-player observations (add conditioning features) + transform_obs_for_coplayer(co_obs_raw, co_obs_transformed, num_co_players, ego_obs_size, coplayer_obs_size, + ego_base_dim, co_use_rc, co_use_ec, co_use_dc, co_collision_lb, co_collision_ub, + co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, co_entropy_lb, co_entropy_ub, + co_discount_lb, co_discount_ub); + + // Run forward on each network + forward(ego_net, ego_obs, ego_actions); + forward(co_player_net, co_obs_transformed, co_actions); + + // Combine actions back + memcpy(actions, ego_actions, num_ego_agents * sizeof(int)); + memcpy(actions + num_ego_agents, co_actions, num_co_players * sizeof(int)); + + // Cleanup + free(ego_obs); + 
free(ego_actions); + free(co_obs_transformed); + free(co_actions); +} + int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in) { + const char *output_agent, int num_maps, int zoom_in, const char *ini_file, int k_scenarios_cli, + int max_controlled_agents_cli, const char *co_player_policy_name, const char *map_dir_cli) { // Parse configuration from INI file env_init_config conf = {0}; - const char *ini_file = "pufferlib/config/ocean/drive.ini"; if (ini_parse(ini_file, handler, &conf) < 0) { fprintf(stderr, "Error: Could not load %s. Cannot determine environment configuration.\n", ini_file); return -1; } + // Override map_dir if provided via CLI + if (map_dir_cli != NULL) { + strncpy(conf.map_dir, map_dir_cli, sizeof(conf.map_dir) - 1); + conf.map_dir[sizeof(conf.map_dir) - 1] = '\0'; + } + char map_buffer[100]; if (map_name == NULL) { srand(time(NULL)); @@ -269,7 +387,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o .entropy_weight_ub = (conf.conditioning != NULL) ? conf.conditioning->entropy_weight_ub : 0.0f, .discount_weight_lb = (conf.conditioning != NULL) ? conf.conditioning->discount_weight_lb : 0.0f, .discount_weight_ub = (conf.conditioning != NULL) ? conf.conditioning->discount_weight_ub : 0.0f, - .max_controlled_agents = 32, + .max_controlled_agents = (max_controlled_agents_cli > 0) ? 
max_controlled_agents_cli : 32, }; allocate(&env); @@ -317,11 +435,92 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o client->cyclist = LoadModel("resources/drive/cyclist.glb"); client->pedestrian = LoadModel("resources/drive/pedestrian.glb"); + // Determine number of ego agents vs co-players + int num_ego_agents = env.active_agent_count; + int num_co_players = 0; + DriveNet *co_player_net = NULL; + + // Co-player conditioning flags (hoisted to outer scope for later use) + int co_use_rc = 0, co_use_ec = 0, co_use_dc = 0; + + // Check if co-player policy is provided (either via CLI or INI) + const char *actual_co_player_policy = co_player_policy_name; + if (actual_co_player_policy == NULL && conf.co_player_enabled && strlen(conf.co_player_policy_path) > 0) { + actual_co_player_policy = conf.co_player_policy_path; + } + + if (actual_co_player_policy != NULL) { + // Population play mode - split agents between ego and co-player + // Use num_ego_agents from config, or default to half + if (conf.num_ego_agents > 0 && conf.num_ego_agents < env.active_agent_count) { + num_ego_agents = conf.num_ego_agents; + } else { + num_ego_agents = env.active_agent_count / 2; + } + num_co_players = env.active_agent_count - num_ego_agents; + + printf("Population play: %d ego agents, %d co-players\n", num_ego_agents, num_co_players); + + // Load co-player policy + FILE *co_policy_file = fopen(actual_co_player_policy, "rb"); + if (co_policy_file != NULL) { + fclose(co_policy_file); + Weights *co_weights = load_weights(actual_co_player_policy); + + // Determine co-player conditioning from config + if (conf.co_player_conditioning != NULL) { + co_use_rc = (strcmp(conf.co_player_conditioning->type, "reward") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + co_use_ec = (strcmp(conf.co_player_conditioning->type, "entropy") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + co_use_dc = 
(strcmp(conf.co_player_conditioning->type, "discount") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + } + + co_player_net = + init_drivenet(co_weights, num_co_players, env.dynamics_model, co_use_rc, co_use_ec, co_use_dc); + printf("Co-player policy loaded with conditioning: rc=%d, ec=%d, dc=%d\n", co_use_rc, co_use_ec, co_use_dc); + } else { + printf("Warning: Could not load co-player policy from %s. Using main policy for all agents.\n", + actual_co_player_policy); + num_ego_agents = env.active_agent_count; + num_co_players = 0; + } + } + + // Extract co-player conditioning bounds from config + float co_collision_lb = 0, co_collision_ub = 0; + float co_offroad_lb = 0, co_offroad_ub = 0; + float co_goal_lb = 0, co_goal_ub = 0; + float co_entropy_lb = 0, co_entropy_ub = 0; + float co_discount_lb = 0, co_discount_ub = 0; + + // Get conditioning dims directly from co_player_net to ensure consistency + int coplayer_num_conditioning = (co_player_net != NULL) ? co_player_net->conditioning_dims : 0; + + if (conf.co_player_conditioning != NULL) { + co_collision_lb = conf.co_player_conditioning->reward_collision_weight_lb; + co_collision_ub = conf.co_player_conditioning->reward_collision_weight_ub; + co_offroad_lb = conf.co_player_conditioning->reward_offroad_weight_lb; + co_offroad_ub = conf.co_player_conditioning->reward_offroad_weight_ub; + co_goal_lb = conf.co_player_conditioning->reward_goal_weight_lb; + co_goal_ub = conf.co_player_conditioning->reward_goal_weight_ub; + co_entropy_lb = conf.co_player_conditioning->entropy_weight_lb; + co_entropy_ub = conf.co_player_conditioning->entropy_weight_ub; + co_discount_lb = conf.co_player_conditioning->discount_weight_lb; + co_discount_ub = conf.co_player_conditioning->discount_weight_ub; + } + + // Load main (ego) policy Weights *weights = load_weights(policy_name); printf("Active agents in map: %d\n", env.active_agent_count); - DriveNet *net = init_drivenet(weights, env.active_agent_count, 
env.dynamics_model, use_rc, use_ec, use_dc); + DriveNet *net = init_drivenet(weights, num_ego_agents, env.dynamics_model, use_rc, use_ec, use_dc); - int frame_count = env.scenario_length > 0 ? env.scenario_length : TRAJECTORY_LENGTH_DEFAULT; + // Calculate frame count: k_scenarios * scenario_length for adaptive agents + int scenario_length = env.scenario_length > 0 ? env.scenario_length : TRAJECTORY_LENGTH_DEFAULT; + int k_scenarios = (k_scenarios_cli > 0) ? k_scenarios_cli : (conf.k_scenarios > 0 ? conf.k_scenarios : 1); + int frame_count = k_scenarios * scenario_length; + printf("Rendering %d scenarios x %d steps = %d total frames\n", k_scenarios, scenario_length, frame_count); char filename_topdown[256]; char filename_agent[256]; @@ -373,16 +572,39 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o } } + // Calculate observation sizes per agent + // ego_base_dim: 7 for CLASSIC dynamics, 10 for JERK dynamics + int ego_base_dim = (env.dynamics_model == 1) ? 
10 : 7; // 1 = JERK + + // Ego observation size (environment generates observations without conditioning for ego) + int ego_obs_size = + net->ego_dim + (MAX_AGENTS - 1) * PARTNER_FEATURES + MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + + // Co-player observation size (includes conditioning features) + int coplayer_obs_size = ego_obs_size; + if (co_player_net != NULL) { + coplayer_obs_size = co_player_net->ego_dim + (MAX_AGENTS - 1) * PARTNER_FEATURES + + MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + } + + printf("Observation sizes: ego=%d, coplayer=%d, ego_base_dim=%d, coplayer_conditioning=%d\n", ego_obs_size, + coplayer_obs_size, ego_base_dim, coplayer_num_conditioning); + if (render_topdown) { printf("Recording topdown view...\n"); for (int i = 0; i < frame_count; i++) { + // Calculate current scenario (1-indexed for display) + int current_scenario = (i / scenario_length) + 1; if (i % frame_skip == 0) { - renderTopDownView(&env, client, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, show_grid, - img_width, img_height, zoom_in); + renderTopDownView(&env, client, map_width, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, + show_grid, img_width, img_height, zoom_in, current_scenario, k_scenarios); WriteFrame(&topdown_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + forward_population(net, co_player_net, env.observations, (int *)env.actions, num_ego_agents, num_co_players, + ego_obs_size, coplayer_obs_size, ego_base_dim, co_use_rc, co_use_ec, co_use_dc, + co_collision_lb, co_collision_ub, co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, + co_entropy_lb, co_entropy_ub, co_discount_lb, co_discount_ub); c_step(&env); } } @@ -400,7 +622,10 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o WriteFrame(&agent_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + forward_population(net, 
co_player_net, env.observations, (int *)env.actions, num_ego_agents, num_co_players, + ego_obs_size, coplayer_obs_size, ego_base_dim, co_use_rc, co_use_ec, co_use_dc, + co_collision_lb, co_collision_ub, co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, + co_entropy_lb, co_entropy_ub, co_discount_lb, co_discount_ub); c_step(&env); } } @@ -424,6 +649,9 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o free_allocated(&env); free_drivenet(net); free(weights); + if (co_player_net != NULL) { + free_drivenet(co_player_net); + } return 0; } @@ -440,10 +668,15 @@ int main(int argc, char *argv[]) { // File paths and num_maps (not in [env] section) const char *map_name = NULL; const char *policy_name = "resources/drive/puffer_drive_weights.bin"; + const char *co_player_policy_name = NULL; const char *output_topdown = NULL; const char *output_agent = NULL; + const char *ini_file = "pufferlib/config/ocean/drive.ini"; + const char *map_dir_cli = NULL; // CLI override for map_dir int num_maps = 1; int scenario_length_cli = -1; + int k_scenarios_cli = -1; + int max_controlled_agents_cli = -1; int use_rc = 0; int use_ec = 0; int use_dc = 0; @@ -515,10 +748,51 @@ int main(int argc, char *argv[]) { num_maps = atoi(argv[i + 1]); i++; } + } else if (strcmp(argv[i], "--ini-file") == 0) { + if (i + 1 < argc) { + ini_file = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --ini-file option requires a file path\n"); + return 1; + } + } else if (strcmp(argv[i], "--k-scenarios") == 0) { + if (i + 1 < argc) { + k_scenarios_cli = atoi(argv[i + 1]); + i++; + } else { + fprintf(stderr, "Error: --k-scenarios option requires a number\n"); + return 1; + } + } else if (strcmp(argv[i], "--max-controlled-agents") == 0) { + if (i + 1 < argc) { + max_controlled_agents_cli = atoi(argv[i + 1]); + i++; + } else { + fprintf(stderr, "Error: --max-controlled-agents option requires a number\n"); + return 1; + } + } else if (strcmp(argv[i], "--co-player-policy") == 
0) { + if (i + 1 < argc) { + co_player_policy_name = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --co-player-policy option requires a file path\n"); + return 1; + } + } else if (strcmp(argv[i], "--map-dir") == 0) { + if (i + 1 < argc) { + map_dir_cli = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --map-dir option requires a directory path\n"); + return 1; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in); + output_agent, num_maps, zoom_in, ini_file, k_scenarios_cli, max_controlled_agents_cli, + co_player_policy_name, map_dir_cli); return 0; } diff --git a/pufferlib/ocean/env_binding.h b/pufferlib/ocean/env_binding.h index b0ea15458f..3262620d60 100644 --- a/pufferlib/ocean/env_binding.h +++ b/pufferlib/ocean/env_binding.h @@ -616,12 +616,18 @@ static PyObject *vec_log(PyObject *self, PyObject *args) { float n = aggregate.n; // Average across EGO agents only if (n > 0) { + // Compute completion_rate from raw totals BEFORE averaging + float total_goals_reached = aggregate.goals_reached_this_episode; + float total_goals_sampled = aggregate.goals_sampled_this_episode; + if (total_goals_sampled > 0) { + aggregate.completion_rate = total_goals_reached / total_goals_sampled; + } else { + aggregate.completion_rate = 0.0f; + } + for (int i = 0; i < num_keys; i++) { ((float *)&aggregate)[i] /= n; } - - // Compute completion_rate from aggregated counts - aggregate.completion_rate = aggregate.goals_reached_this_episode / aggregate.goals_sampled_this_episode; } // User populates dict @@ -632,15 +638,20 @@ static PyObject *vec_log(PyObject *self, PyObject *args) { if (has_co_players && co_player_aggregate.n > 0.0f) { float co_player_n = co_player_aggregate.n; + // Compute co-player completion rate from raw totals BEFORE averaging + float co_total_goals_reached = co_player_aggregate.goals_reached_this_episode; + float co_total_goals_sampled = 
co_player_aggregate.goals_sampled_this_episode; + if (co_total_goals_sampled > 0) { + co_player_aggregate.completion_rate = co_total_goals_reached / co_total_goals_sampled; + } else { + co_player_aggregate.completion_rate = 0.0f; + } + // Average co-player metrics across CO-PLAYER agents only for (int i = 0; i < num_keys; i++) { ((float *)&co_player_aggregate)[i] /= co_player_n; } - // Compute co-player completion rate - co_player_aggregate.completion_rate = - co_player_aggregate.goals_reached_this_episode / co_player_aggregate.goals_sampled_this_episode; - // Add co-player metrics to dict with co_player_ prefix assign_to_dict(dict, "ego_co_player_ratio", n / co_player_n); assign_to_dict(dict, "co_player_completion_rate", co_player_aggregate.completion_rate); diff --git a/pufferlib/ocean/env_config.h b/pufferlib/ocean/env_config.h index 4f26a8b32e..35f2806806 100644 --- a/pufferlib/ocean/env_config.h +++ b/pufferlib/ocean/env_config.h @@ -38,12 +38,18 @@ typedef struct { int goal_behavior; float goal_target_distance; int scenario_length; + int k_scenarios; int termination_mode; int init_steps; int init_mode; int control_mode; char map_dir[256]; conditioning_config *conditioning; + // Population play settings + int co_player_enabled; + int num_ego_agents; + char co_player_policy_path[256]; + conditioning_config *co_player_conditioning; } env_init_config; // INI file parser handler - parses all environment configuration from drive.ini @@ -97,6 +103,8 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->dt = atof(value); } else if (MATCH("env", "scenario_length")) { env_config->scenario_length = atoi(value); + } else if (MATCH("env", "k_scenarios")) { + env_config->k_scenarios = atoi(value); } else if (MATCH("env", "termination_mode")) { env_config->termination_mode = atoi(value); } else if (MATCH("env", "init_steps")) { @@ -175,6 +183,85 @@ static int handler(void *config, const char *section, const char *name, const ch } 
env_config->conditioning->discount_weight_ub = atof(value); } + // Population play settings + else if (MATCH("env", "co_player_enabled")) { + if (strcmp(value, "True") == 0 || strcmp(value, "true") == 0 || strcmp(value, "1") == 0) { + env_config->co_player_enabled = 1; + } else { + env_config->co_player_enabled = 0; + } + } else if (MATCH("env", "num_ego_agents")) { + env_config->num_ego_agents = atoi(value); + } + // Co-player policy settings + else if (MATCH("env.co_player_policy", "policy_path")) { + if (sscanf(value, "\"%255[^\"]\"", env_config->co_player_policy_path) != 1) { + strncpy(env_config->co_player_policy_path, value, sizeof(env_config->co_player_policy_path) - 1); + env_config->co_player_policy_path[sizeof(env_config->co_player_policy_path) - 1] = '\0'; + } + } else if (MATCH("env.co_player_policy.conditioning", "type")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + if (value[0] == '"') { + size_t len = strlen(value) - 2; + env_config->co_player_conditioning->type = (char *)malloc(len + 1); + strncpy(env_config->co_player_conditioning->type, value + 1, len); + env_config->co_player_conditioning->type[len] = '\0'; + } else { + env_config->co_player_conditioning->type = strdup(value); + } + } else if (MATCH("env.co_player_policy.conditioning", "collision_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_collision_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "collision_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_collision_weight_ub = atof(value); + } else if 
(MATCH("env.co_player_policy.conditioning", "offroad_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_offroad_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "offroad_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_offroad_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "goal_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_goal_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "goal_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_goal_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "entropy_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->entropy_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "entropy_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->entropy_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "discount_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + 
env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->discount_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "discount_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->discount_weight_ub = atof(value); + } else { return 0; // Unknown section/name, indicate failure to handle diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 270205ed70..f2affe1427 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -79,10 +79,14 @@ def __init__(self, config, vecenv, policy, logger=None): if config.get("policy_architecture", "Recurrent") == "Recurrent": config["bptt_horizon"] = vecenv.driver_env.episode_length if config.get("policy_architecture", "Recurrent") == "Transformer": - config["context_window"] = self.context_length = vecenv.driver_env.episode_length + config["context_length"] = self.context_length = vecenv.driver_env.episode_length config["bptt_horizon"] = ( vecenv.driver_env.episode_length ) ## this is used downstream so you need to define it too + else: + if config.get("policy_architecture", "Recurrent") == "Transformer": + self.context_length = config["context_length"] + config["bptt_horizon"] = config["context_length"] vecenv.async_reset(seed) obs_space = vecenv.single_observation_space @@ -95,7 +99,7 @@ def __init__(self, config, vecenv, policy, logger=None): if config.get("policy_architecture", "Recurrent") == "Recurrent": batch_size = vecenv.driver_env.num_ego_agents * config["bptt_horizon"] * vecenv.num_workers if config.get("policy_architecture", "Recurrent") == "Transformer": - batch_size = vecenv.driver_env.num_ego_agents * config["context_window"] * vecenv.num_workers + batch_size = vecenv.driver_env.num_ego_agents * config["context_length"] * vecenv.num_workers 
config["batch_size"] = batch_size ## this is dynamic and based on ego agents else: agents_for_calc = total_agents @@ -107,24 +111,24 @@ def __init__(self, config, vecenv, policy, logger=None): if ( config["batch_size"] == "auto" and config.get("bptt_horizon", "auto") == "auto" - and config.get("context_window", "auto") == "auto" + and config.get("context_length", "auto") == "auto" ): - raise pufferlib.APIUsageError("Must specify batch_size, bptt_horizon, or context_window") + raise pufferlib.APIUsageError("Must specify batch_size, bptt_horizon, or context_length") elif config["batch_size"] == "auto": if config.get("policy_architecture", "Recurrent") == "Recurrent": config["batch_size"] = agents_for_calc * config["bptt_horizon"] elif config.get("policy_architecture", "Recurrent") == "Transformer": - config["batch_size"] = agents_for_calc * config["context_window"] + config["batch_size"] = agents_for_calc * config["context_length"] elif ( config.get("bptt_horizon", "auto") == "auto" and config.get("policy_architecture", "Recurrent") == "Recurrent" ): config["bptt_horizon"] = config["batch_size"] // agents_for_calc elif ( - config.get("context_window", "auto") == "auto" + config.get("context_length", "auto") == "auto" and config.get("policy_architecture", "Recurrent") == "Transformer" ): - config["context_window"] = config["batch_size"] // agents_for_calc + config["context_length"] = config["batch_size"] // agents_for_calc batch_size = config["batch_size"] @@ -132,9 +136,9 @@ def __init__(self, config, vecenv, policy, logger=None): if config.get("policy_architecture", "Recurrent") == "Recurrent": horizon = config["bptt_horizon"] elif config.get("policy_architecture", "Recurrent") == "Transformer": - horizon = config["context_window"] + horizon = config["context_length"] else: - horizon = config.get("bptt_horizon", config.get("context_window", 1)) + horizon = config.get("bptt_horizon", config.get("context_length", 1)) config["bptt_horizon"] = horizon # For backward 
compatibility @@ -180,7 +184,7 @@ def __init__(self, config, vecenv, policy, logger=None): ensure_drive_binary() # LSTM - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": h = policy.hidden_size if self.population_play: n = vecenv.ego_agents_per_batch # Use ego agents per batch @@ -193,7 +197,7 @@ def __init__(self, config, vecenv, policy, logger=None): self.lstm_c = {i * n: torch.zeros(n, h, device=device) for i in range(total_agents // n)} # TRANSFORMER - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": h = policy.hidden_size if self.population_play: @@ -345,22 +349,23 @@ def evaluate(self): device = config["device"] # Reset hidden states for both RNN and Transformer - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": for k in self.lstm_h: self.lstm_h[k] = torch.zeros(self.lstm_h[k].shape, device=device) self.lstm_c[k] = torch.zeros(self.lstm_c[k].shape, device=device) - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": h = self.policy.hidden_size for k in self.transformer_context: n = self.transformer_context[k].shape[0] # Pre-allocate full buffer instead of empty - self.transformer_context[k] = torch.zeros(n, self.context_length, h, device=device) + self.transformer_context[k] = torch.zeros(n, self.horizon, h, device=device) self.transformer_position[k] = torch.zeros(1, dtype=torch.long, device=device) self.full_rows = 0 while self.full_rows < self.segments: profile("env", epoch) + print(".", end="", flush=True) # Workaround: visible I/O prevents multiprocessing deadlock o, r, d, t, info, env_id, mask = self.vecenv.recv() # print(f"o shape is {o.shape}", flush = True) if self.population_play: @@ -418,11 +423,11 @@ def evaluate(self): batch_size = 
self.vecenv.agents_per_batch state_key = (env_id.start // batch_size) * batch_size - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": state["lstm_h"] = self.lstm_h[state_key] state["lstm_c"] = self.lstm_c[state_key] - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": state["transformer_context"] = self.transformer_context[state_key] state["transformer_position"] = self.transformer_position[state_key] # Note: terminals not needed for eval since we're doing single-step inference @@ -434,7 +439,7 @@ def evaluate(self): profile("eval_copy", epoch) with torch.no_grad(): # Update hidden states after forward pass - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": if self.population_play: batch_size = self.vecenv.ego_agents_per_batch else: @@ -444,7 +449,7 @@ def evaluate(self): self.lstm_h[lstm_key] = state["lstm_h"] self.lstm_c[lstm_key] = state["lstm_c"] - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": if self.population_play: batch_size = self.vecenv.ego_agents_per_batch else: @@ -483,7 +488,7 @@ def evaluate(self): self.ep_lengths[env_id] += 1 # Use appropriate horizon based on model type horizon = ( - config.get("context_window") + config.get("context_length") if config.get("policy_architecture", "Recurrent") == "Transformer" else config["bptt_horizon"] ) @@ -597,8 +602,8 @@ def train(self): # Handle observation reshaping based on model type if ( - not config.get("policy_architecture", "Recurrent") == "Recurrent" - and not config.get("policy_architecture", "Recurrent") == "Transformer" + not config.get("rnn_name", "Recurrent") == "Recurrent" + and not config.get("rnn_name", "Recurrent") == "Transformer" ): # Flatten for non-recurrent models mb_obs = mb_obs.reshape(-1, 
*self.vecenv.single_observation_space.shape) @@ -608,10 +613,10 @@ def train(self): ) # Add appropriate state based on model type - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": state["lstm_h"] = None state["lstm_c"] = None - elif config.get("policy_architecture", "Recurrent") == "Transformer": + elif config.get("rnn_name", "Recurrent") == "Transformer": state["transformer_context"] = None state["transformer_position"] = None state["terminals"] = mb_terminals # For episode boundary masking @@ -620,8 +625,8 @@ def train(self): # Handle action sampling based on observation shape if ( - config.get("policy_architecture", "Recurrent") == "Recurrent" - or config.get("policy_architecture", "Recurrent") == "Transformer" + config.get("rnn_name", "Recurrent") == "Recurrent" + or config.get("rnn_name", "Recurrent") == "Transformer" ): # Add this right before calling sample_logits if isinstance(logits, tuple): @@ -789,6 +794,38 @@ def train(self): ): pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step) + # Eval rendering (ego vs human logs) + if self.config["eval"].get("human_replay_eval", False): + if self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training: + model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + + if model_files: + latest_cpt = max(model_files, key=os.path.getctime) + bin_path = f"{model_dir}.bin" + + try: + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} + export( + args=export_args, + env_name=self.config["env"], + vecenv=self.vecenv, + policy=self.uncompiled_policy, + path=bin_path, + silent=True, + ) + eval_video_dir = os.path.join(model_dir, "eval_videos") + pufferlib.utils.render_human_replay_videos( + config=self.config, + policy_bin_path=bin_path, + 
output_dir=eval_video_dir, + num_maps=self.config["eval"].get("human_replay_render_num_maps", 3), + logger=self.logger, + global_step=self.global_step, + ) + except Exception as e: + print(f"Failed to render eval videos: {e}") + def mean_and_log(self): config = self.config for k in list(self.stats.keys()): @@ -1255,7 +1292,13 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): elif args["wandb"]: logger = WandbLogger(args) - train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), env_config=args.get("env", {})) + train_config = dict( + **args["train"], + env=env_name, + eval=args.get("eval", {}), + env_config=args.get("env", {}), + policy_architecture=args.get("policy_architecture", "Recurrent"), + ) pufferl = PuffeRL(train_config, vecenv, policy, logger) all_logs = [] @@ -1670,49 +1713,57 @@ def load_policy(args, vecenv, env_name=""): env_module = importlib.import_module(module_name) device = args["train"]["device"] - policy_cls = getattr(env_module.torch, args["policy_name"]) - policy = policy_cls(vecenv.driver_env, **args["policy"]) - - # Handle both RNN and Transformer wrappers - rnn_name = args.get("rnn_name") - transformer_name = args.get("transformer_name") - - if transformer_name is not None: - # Load transformer wrapper - transformer_cls = getattr(env_module.torch, transformer_name) - args["transformer"]["context_length"] = vecenv.driver_env.episode_length - policy = transformer_cls(vecenv.driver_env, policy, **args["transformer"]) - elif rnn_name is not None: - # Load RNN wrapper - rnn_cls = getattr(env_module.torch, rnn_name) - policy = rnn_cls(vecenv.driver_env, policy, **args["rnn"]) - - policy = policy.to(device) load_id = args["load_id"] - if load_id is not None: + load_path = args.get("load_model_path") + state_dict = None + rnn_name = args.get("policy_architecture", "Recurrent") + + if load_path is not None: + state_dict = torch.load(load_path, map_location=device) + state_dict = {k.replace("module.", 
""): v for k, v in state_dict.items()} + elif load_id is not None: if args["neptune"]: path = NeptuneLogger(args, load_id, mode="read-only").download() elif args["wandb"]: path = WandbLogger(args, load_id).download() else: raise pufferlib.APIUsageError("No run id provided for eval") - state_dict = torch.load(path, map_location=device) state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} - policy.load_state_dict(state_dict) - load_path = args["load_model_path"] - if load_path == "latest": - load_path = max(glob.glob(f"experiments/{env_name}*.pt"), key=os.path.getctime) + # Auto-detect architecture from state_dict keys + if state_dict is not None: + if "positional_embedding" in state_dict: + rnn_name = "Transformer" + elif "lstm.weight_ih_l0" in state_dict: + rnn_name = "Recurrent" - if load_path is not None: - state_dict = torch.load(load_path, map_location=device) - state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + policy_cls = getattr(env_module.torch, args["policy_name"]) + policy = policy_cls(vecenv.driver_env, **args["policy"]) + + # Handle both RNN and Transformer wrappers via rnn_name + if rnn_name == "Transformer": + # Load transformer wrapper + transformer_cls = getattr(env_module.torch, rnn_name) + # For adaptive_driving_agent, use episode_length as horizon (k_scenarios * scenario_length) + # Otherwise, use config horizon with fallback to episode_length + is_adaptive = getattr(vecenv.driver_env, "env_name", None) == "adaptive_drive" + if is_adaptive: + args["transformer"]["horizon"] = vecenv.driver_env.episode_length + else: + args["transformer"]["horizon"] = args["train"].get("horizon", vecenv.driver_env.episode_length) + policy = transformer_cls(vecenv.driver_env, policy, **args["transformer"]) + elif rnn_name is not None: + # Load RNN wrapper (Recurrent) + rnn_cls = getattr(env_module.torch, rnn_name) + policy = rnn_cls(vecenv.driver_env, policy, **args["rnn"]) + + policy = policy.to(device) + + # Load the 
state dict if we have one + if state_dict is not None: policy.load_state_dict(state_dict) - # state_path = os.path.join(*load_path.split('/')[:-1], 'state.pt') - # optim_state = torch.load(state_path)['optimizer_state_dict'] - # pufferl.optimizer.load_state_dict(optim_state) return policy diff --git a/pufferlib/resources/drive/binaries/training/map_000.bin b/pufferlib/resources/drive/binaries/training/map_000.bin index 434b98c255..ef87c59af2 100644 Binary files a/pufferlib/resources/drive/binaries/training/map_000.bin and b/pufferlib/resources/drive/binaries/training/map_000.bin differ diff --git a/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin b/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin new file mode 100644 index 0000000000..bb1d3f0233 Binary files /dev/null and b/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin differ diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 8ef3cb4034..eef8c03c7f 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -38,17 +38,21 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): if is_adaptive: # Use evaluate_human_logs.py for adaptive agents with human replay + # Get architecture from config (determines Recurrent vs Transformer) + # Check both policy_architecture and rnn_name for compatibility + rnn_name = config.get("policy_architecture", config.get("rnn_name", "Recurrent")) + cmd = [ sys.executable, "evaluate_human_logs.py", "--policy-path", latest_cpt, - "--policy-architecture", - config.get("policy_architecture", "Transformer"), + "--rnn-name", + rnn_name, "--adaptive-driving-agent", "1", "--k-scenarios", - str(env_config.get("k_scenarios", 2)), + str(env_config.get("k_scenarios", 1)), "--num-agents", str(eval_config.get("human_replay_num_agents", 32)), "--num-maps", @@ -302,9 +306,37 @@ def render_videos(config, vecenv, logger, epoch, global_step, bin_path): env_vars = os.environ.copy() env_vars["ASAN_OPTIONS"] = "exitcode=0" + # Detect if this 
is an adaptive agent + env_name = config.get("env", "") + is_adaptive = "adaptive" in env_name + + # Select correct INI file based on agent type + if is_adaptive: + ini_file = "pufferlib/config/ocean/adaptive.ini" + else: + ini_file = "pufferlib/config/ocean/drive.ini" + # Base command with only visualization flags (env config comes from INI) base_cmd = ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] + # Pass the correct INI file + base_cmd.extend(["--ini-file", ini_file]) + + # Get env config for k_scenarios and co-player settings + env_config = config.get("env_config", {}) + + # Pass k_scenarios for adaptive agents (longer videos) + k_scenarios = env_config.get("k_scenarios", 1) + if k_scenarios > 1: + base_cmd.extend(["--k-scenarios", str(k_scenarios)]) + + # Pass co-player policy if population play is enabled + co_player_enabled = env_config.get("co_player_enabled", False) + if co_player_enabled: + co_player_path = f"resources/drive/{config['env']}_co_player.bin" + if os.path.exists(co_player_path): + base_cmd.extend(["--co-player-policy", co_player_path]) + # Visualization config flags only if config.get("show_grid", False): base_cmd.append("--show-grid") @@ -407,3 +439,135 @@ def render_videos(config, vecenv, logger, epoch, global_step, bin_path): # Clean up bin weights file if os.path.exists(expected_weights_path): os.remove(expected_weights_path) + + +def render_human_replay_videos(config, policy_bin_path, output_dir, num_maps=5, logger=None, global_step=0): + """ + Render videos for human replay evaluation (1 ego agent + human log trajectories). + + In this mode, only one agent is policy-controlled (the ego), while all other agents + follow their logged human trajectories (rendered in GOLD). 
+ + Args: + config: Configuration dictionary with env settings + policy_bin_path: Path to the policy weights .bin file + output_dir: Directory to save output videos + num_maps: Number of maps to render + logger: Optional logger with wandb attribute for logging + global_step: Current training step for wandb logging + + Returns: + List of output video paths + """ + if not os.path.exists(policy_bin_path): + print(f"Policy weights file does not exist: {policy_bin_path}") + return [] + + # Copy the policy weights to where ./visualize expects them. Bound before + # the try so the finally-cleanup below can always reference this name. + expected_weights_path = "resources/drive/puffer_drive_weights.bin" + try: + os.makedirs(output_dir, exist_ok=True) + os.makedirs(os.path.dirname(expected_weights_path), exist_ok=True) + shutil.copy2(policy_bin_path, expected_weights_path) + + env_vars = os.environ.copy() + env_vars["ASAN_OPTIONS"] = "exitcode=0" + + # Get env config + env_config = config.get("env_config", config.get("env", {})) + k_scenarios = env_config.get("k_scenarios", env_config.get("k-scenarios", 1)) + map_dir = env_config.get("map_dir", env_config.get("map-dir", None)) + + # Build command for human replay rendering + cmd = [ + "xvfb-run", + "-a", + "-s", + "-screen 0 1280x720x24", + "./visualize", + "--ini-file", + "pufferlib/config/ocean/adaptive.ini", + "--policy-name", + expected_weights_path, + "--max-controlled-agents", + "1", # Only 1 ego agent + "--k-scenarios", + str(k_scenarios), + "--num-maps", + str(num_maps), + "--log-trajectories", # Show human trajectory logs + "--zoom-in", + "--view", + "both", + "--output-topdown", + "resources/drive/output_topdown.mp4", + "--output-agent", + "resources/drive/output_agent.mp4", + ] + + # Add map_dir override if specified (for NuPlan or other datasets) + if map_dir: + cmd.extend(["--map-dir", map_dir]) + + output_videos = [] + videos_to_log_world = [] + videos_to_log_agent = [] + + print(f"[Human Replay Render] Starting render for {num_maps} maps, map_dir={map_dir}", flush=True) + print(f"[Human Replay Render] 
Command: {' '.join(cmd)}", flush=True) + + for map_idx in range(num_maps): + print(f"[Human Replay Render] Rendering map {map_idx}...", flush=True) + result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=600, env=env_vars) + print(f"[Human Replay Render] Return code: {result.returncode}", flush=True) + if result.stderr: + print(f"[Human Replay Render] stderr: {result.stderr[:500]}", flush=True) + + vids_exist = os.path.exists("resources/drive/output_topdown.mp4") and os.path.exists( + "resources/drive/output_agent.mp4" + ) + print(f"[Human Replay Render] Videos exist: {vids_exist}", flush=True) + + if result.returncode == 0 or (result.returncode == 1 and vids_exist): + videos = [ + ("resources/drive/output_topdown.mp4", f"human_replay_map{map_idx:02d}_topdown.mp4"), + ("resources/drive/output_agent.mp4", f"human_replay_map{map_idx:02d}_agent.mp4"), + ] + + for source_vid, target_filename in videos: + if os.path.exists(source_vid): + target_path = os.path.join(output_dir, target_filename) + shutil.move(source_vid, target_path) + output_videos.append(target_path) + + if logger and hasattr(logger, "wandb") and logger.wandb: + import wandb + + if "topdown" in target_filename: + videos_to_log_world.append(wandb.Video(target_path, format="mp4")) + else: + videos_to_log_agent.append(wandb.Video(target_path, format="mp4")) + else: + print(f"Human replay rendering failed for map {map_idx}: {result.stderr}") + + # Log to wandb + if logger and hasattr(logger, "wandb") and logger.wandb and (videos_to_log_world or videos_to_log_agent): + payload = {} + if videos_to_log_world: + payload["eval/human_replay_world_view"] = videos_to_log_world + if videos_to_log_agent: + payload["eval/human_replay_agent_view"] = videos_to_log_agent + logger.wandb.log(payload, step=global_step) + + return output_videos + + except subprocess.TimeoutExpired: + print("Human replay rendering timed out") + return [] + except Exception as e: + print(f"Failed to render human 
replay videos: {e}") + return [] + finally: + if os.path.exists(expected_weights_path): + os.remove(expected_weights_path) diff --git a/pufferlib/vector.py b/pufferlib/vector.py index c397ec7e16..73a096f7bc 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -848,6 +848,8 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer input_size = co_player_policy.get("input_size", 256) hidden_size = co_player_policy.get("hidden_size", 256) co_player_rnn = co_player_policy.get("rnn", None) + co_player_architecture = co_player_policy.get("architecture", "Recurrent") + co_player_transformer = co_player_policy.get("transformer", {}) # Get conditioning type from env_k co_player_conditioning = co_player_policy.get("conditioning") @@ -914,7 +916,18 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer base_policy = Drive(co_player_env, input_size=input_size, hidden_size=hidden_size) - if co_player_rnn: + if co_player_architecture == "Transformer": + policy = pufferlib.models.TransformerWrapper( + co_player_env, + base_policy, + input_size=co_player_transformer.get("input_size", 256), + hidden_size=co_player_transformer.get("hidden_size", 256), + num_layers=co_player_transformer.get("num_layers", 2), + num_heads=co_player_transformer.get("num_heads", 4), + horizon=co_player_transformer.get("horizon", 91), + dropout=co_player_transformer.get("dropout", 0.0), + ) + elif co_player_rnn: policy = pufferlib.models.LSTMWrapper( co_player_env, base_policy, diff --git a/scripts/adaptive/nuplan_recurrent.sh b/scripts/adaptive/nuplan_recurrent.sh new file mode 100755 index 0000000000..8bb17717e6 --- /dev/null +++ b/scripts/adaptive/nuplan_recurrent.sh @@ -0,0 +1,79 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_nuplan_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH 
--account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on NuPlan with Recurrent architecture +# Uses pre-trained NuPlan Recurrent co-players with varied conditioning +# +# PREREQUISITE: Train co-players first with scripts/coplayers/nuplan_recurrent.sh +# Then update ZIPPED_RUNS with the trained policy paths from wandb + +# Co-player policies trained with scripts/coplayers/nuplan_recurrent.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_mwiatx5g.pt 0.5 0.8" + "experiments/puffer_drive_old90mw2.pt 0.1 0.8" + "TODO_FAILED 0.01 0.8" + "TODO_FAILED 0 0.8" + + "experiments/puffer_drive_hous32qj.pt 0.5 0.6" + "experiments/puffer_drive_dhxoorxc.pt 0.1 0.6" + "experiments/puffer_drive_0h5radmw.pt 0.01 0.6" + "experiments/puffer_drive_elet1zj8.pt 0 0.6" + + "experiments/puffer_drive_a57umusk.pt 0.5 0.4" + "experiments/puffer_drive_zozl26ek.pt 0.1 0.4" + "experiments/puffer_drive_jco8adma.pt 0.01 0.4" + "experiments/puffer_drive_5hhwfmmt.pt 0 0.4" + + "experiments/puffer_drive_a56dgt1x.pt 0.5 0.2" + "experiments/puffer_drive_dqzqt7qx.pt 0.1 0.2" + "experiments/puffer_drive_qetoozcn.pt 0.01 0.2" + "experiments/puffer_drive_q30rjcbu.pt 0 0.2" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag adaptive_nuplan_recurrent_k2 \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Recurrent \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/adaptive/nuplan_transformer.sh b/scripts/adaptive/nuplan_transformer.sh new file mode 100755 index 0000000000..4c5f4806a0 --- /dev/null +++ b/scripts/adaptive/nuplan_transformer.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_nuplan_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on NuPlan with Transformer architecture +# Uses pre-trained NuPlan Transformer co-players with varied conditioning +# +# PREREQUISITE: Train co-players first with scripts/coplayers/nuplan_transformer.sh +# Then update ZIPPED_RUNS with the trained policy paths from wandb + +# Co-player policies trained with scripts/coplayers/nuplan_transformer.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( +"experiments/puffer_drive_joqbmi4s.pt 0.01 0.2" +"experiments/puffer_drive_0h81rtfi.pt 0 0.4" +"experiments/puffer_drive_medzmgum.pt 0.1 0.2" +"experiments/puffer_drive_f4a8yoi9.pt 0.5 0.2" +"experiments/puffer_drive_b1yx43w2.pt 0.01 0.4" 
+"experiments/puffer_drive_lv4x8hlt.pt 0.1 0.4" +"experiments/puffer_drive_j51yz49e.pt 0.5 0.4" +"experiments/puffer_drive_iry1wanp.pt 0 0.6" +"experiments/puffer_drive_kx8bhu3v.pt 0.01 0.6" +"experiments/puffer_drive_js4mf85k.pt 0.1 0.6" +"experiments/puffer_drive_52u5onve.pt 0.5 0.6" +"experiments/puffer_drive_nu7lkmx4.pt 0 0.8" +"experiments/puffer_drive_f8dpkpbq.pt 0.01 0.8" +"experiments/puffer_drive_zxsxu6z7.pt 0.1 0.8" +"experiments/puffer_drive_mfahi5bc.pt 0.5 0.8" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag adaptive_nuplan_transformer_1apr \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.architecture Transformer \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/adaptive/womd_recurrent.sh b/scripts/adaptive/womd_recurrent.sh new file mode 100755 index 0000000000..96a25ae967 --- /dev/null +++ b/scripts/adaptive/womd_recurrent.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_womd_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out 
+#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on WOMD with Recurrent architecture +# Uses pre-trained WOMD Recurrent co-players with varied conditioning + +# Co-player policies trained with scripts/coplayers/womd_recurrent.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_o9brlhxk.pt 0.5 0.8" + "experiments/puffer_drive_ufgi4za3.pt 0.1 0.8" + "experiments/puffer_drive_h97urhep.pt 0.01 0.8" + "experiments/puffer_drive_sa0w31jc.pt 0 0.8" + + "experiments/puffer_drive_ojounplt.pt 0.5 0.6" + "experiments/puffer_drive_5tr8vzex.pt 0.1 0.6" + "experiments/puffer_drive_wr30n6a9.pt 0.01 0.6" + "experiments/puffer_drive_use1k7kc.pt 0 0.6" + + "experiments/puffer_drive_nb2vowrz.pt 0.5 0.4" + "experiments/puffer_drive_e3e6vion.pt 0.1 0.4" + "experiments/puffer_drive_yyu6icxx.pt 0.01 0.4" + "experiments/puffer_drive_iuzoucs0.pt 0 0.4" + + "experiments/puffer_drive_u8kntkem.pt 0.5 0.2" + "TODO_FAILED 0.1 0.2" + "experiments/puffer_drive_f8ccc1f1.pt 0.01 0.2" + "experiments/puffer_drive_04p394xa.pt 0 0.2" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag adaptive_womd_recurrent_k2 \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Recurrent + + kill \$HEARTBEAT_PID + " diff --git a/scripts/adaptive/womd_transformer.sh b/scripts/adaptive/womd_transformer.sh new file mode 100755 index 0000000000..1adcda4ca9 --- /dev/null +++ b/scripts/adaptive/womd_transformer.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_womd_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on WOMD with Transformer architecture +# Uses pre-trained WOMD Transformer co-players with varied conditioning +# +# PREREQUISITE: Train co-players first with scripts/coplayers/womd_transformer.sh +# Then update ZIPPED_RUNS with the trained policy paths from wandb + +# Co-player policies trained with scripts/coplayers/womd_transformer.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_zagelrzs.pt 0.5 0.8" + "experiments/puffer_drive_d8kb6hwf.pt 0.1 0.8" + "experiments/puffer_drive_xdmwezaw.pt 0.01 0.8" + "experiments/puffer_drive_0cxi9nf8.pt 0 0.8" + + "experiments/puffer_drive_t69evoxz.pt 0.5 0.6" + "experiments/puffer_drive_yuuod9cn.pt 0.1 0.6" + "experiments/puffer_drive_436bzeu2.pt 0.01 0.6" + 
"experiments/puffer_drive_ct49w01c.pt 0 0.6" + + "experiments/puffer_drive_1e54zwgz.pt 0.5 0.4" + "experiments/puffer_drive_epupe6sw.pt 0.1 0.4" + "experiments/puffer_drive_npqu25y1.pt 0.01 0.4" + "experiments/puffer_drive_v9urng8s.pt 0 0.4" + + "experiments/puffer_drive_fugsjie2.pt 0.5 0.2" + "experiments/puffer_drive_iejlfoo7.pt 0.1 0.2" + "experiments/puffer_drive_rl7e091t.pt 0.01 0.2" + "experiments/puffer_drive_vztz9mmh.pt 0 0.2" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag adaptive_womd_transformer_new \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/nuplan_recurrent.sh b/scripts/baselines/nuplan_recurrent.sh new file mode 100755 index 0000000000..7070d9270c --- /dev/null +++ b/scripts/baselines/nuplan_recurrent.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --job-name=baseline_nuplan_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH 
--account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on NuPlan with Recurrent architecture +# No co-players - other vehicles follow recorded human trajectories +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag baseline_nuplan_recurrent \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Recurrent \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/nuplan_transformer.sh b/scripts/baselines/nuplan_transformer.sh new file mode 100755 index 0000000000..7977cb4d4f --- /dev/null +++ b/scripts/baselines/nuplan_transformer.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --job-name=baseline_nuplan_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on NuPlan with Transformer architecture +# No co-players - other vehicles follow recorded human trajectories +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay 
"$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag baseline_nuplan_transformer \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Transformer \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/womd_recurrent.sh b/scripts/baselines/womd_recurrent.sh new file mode 100755 index 0000000000..f20ad727b1 --- /dev/null +++ b/scripts/baselines/womd_recurrent.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=baseline_womd_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on WOMD with Recurrent architecture +# No co-players - other vehicles follow recorded human trajectories + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag baseline_womd_recurrent \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Recurrent + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/womd_transformer.sh b/scripts/baselines/womd_transformer.sh new file mode 100755 index 0000000000..91c028a652 --- /dev/null +++ b/scripts/baselines/womd_transformer.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=baseline_womd_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on WOMD with Transformer architecture +# No co-players - other vehicles follow recorded human trajectories + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag baseline_womd_transformer \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Transformer + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/nuplan_recurrent.sh b/scripts/coplayers/nuplan_recurrent.sh new file mode 100755 index 0000000000..8e016b201c --- /dev/null +++ b/scripts/coplayers/nuplan_recurrent.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_nuplan_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan with Recurrent architecture +# Varies entropy/discount conditioning for diverse co-player behaviors +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_nuplan_recurrent \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Recurrent \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/nuplan_transformer.sh b/scripts/coplayers/nuplan_transformer.sh new file mode 100755 index 0000000000..465f1bf9eb --- /dev/null +++ b/scripts/coplayers/nuplan_transformer.sh @@ -0,0 +1,80 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_nuplan_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan with Transformer architecture +# Varies entropy/discount conditioning for diverse co-player behaviors +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + 
"$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_nuplan_transformer_lr3e4 \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --policy-architecture Transformer \ + --train.context-length 91 \ + --train.horizon 91 \ + --train.learning-rate 0.003 \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/womd_recurrent.sh b/scripts/coplayers/womd_recurrent.sh new file mode 100755 index 0000000000..8f33fce9ff --- /dev/null +++ b/scripts/coplayers/womd_recurrent.sh @@ -0,0 +1,71 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_womd_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on WOMD with Recurrent architecture +# Varies entropy/discount conditioning for diverse co-player behaviors + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + 
+singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_womd_recurrent \ + --env.num-maps 10000 \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Recurrent \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/womd_transformer.sh b/scripts/coplayers/womd_transformer.sh new file mode 100755 index 0000000000..1260547c00 --- /dev/null +++ b/scripts/coplayers/womd_transformer.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_womd_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on WOMD with Transformer architecture +# Varies entropy/discount conditioning for diverse co-player behaviors + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set 
-e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_womd_transformer_lr3e4 \ + --env.num-maps 10000 \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Transformer \ + --train.horizon 91 \ + --train.context-length 91 \ + --train.learning-rate 0.003 \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_baseline.sh b/scripts/run_baseline.sh new file mode 100755 index 0000000000..7e2f5b5e60 --- /dev/null +++ b/scripts/run_baseline.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=puffer_baseline +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline training script +# No co-players - other vehicles follow recorded human trajectories from Waymo data +# For comparison with co-player experiments in run.sh + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_adaptive_drive --wandb --tag adaptive_baseline \ + --env.num-maps 1000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_nuplan_coplayers.sh b/scripts/run_nuplan_coplayers.sh new file mode 100755 index 0000000000..8d821ec8a2 --- /dev/null +++ b/scripts/run_nuplan_coplayers.sh @@ -0,0 +1,84 @@ +#!/bin/bash +#SBATCH --job-name=nuplan_coplayer +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan data with varying entropy/discount conditioning +# These trained policies can later be used as co-players for adaptive agent training +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" +# +# This will create: resources/drive/binaries/nuplan/map_*.bin (dataset_name derived from folder) + +# Define configurations for each array task ID +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +# Each entry: "entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +# Parse the values for this array task +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +# NuPlan has ~5235 maps +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + 
"$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_drive --wandb --tag nuplan_coplayer_training \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_transformer_adaptive.sh b/scripts/run_transformer_adaptive.sh new file mode 100755 index 0000000000..82afefb8b3 --- /dev/null +++ b/scripts/run_transformer_adaptive.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#SBATCH --job-name=puffer_transformer +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Transformer-based adaptive agent training with co-players +# Same co-player configurations as run.sh, but using Transformer architecture + +# Define configurations for each array task ID +# Each entry: "path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_8u92j3ts/model_puffer_drive_003000.pt 0.5 0.8" + "experiments/puffer_drive_x4xs711x.pt 0.1 0.8" + "experiments/puffer_drive_hhzdzhl8.pt 0.01 0.8" + "experiments/puffer_drive_3xd48djp.pt 0 0.8" + + "experiments/puffer_drive_fgglgofu.pt 0.5 0.6" + "experiments/puffer_drive_g3x9e5rn.pt 0.01 0.6" + "experiments/puffer_drive_gzuuzs0o.pt 0.1 
0.6" + "experiments/puffer_drive_6nzf7xha.pt 0 0.6" + + "experiments/puffer_drive_3iefv59j.pt 0.5 0.4" + "experiments/puffer_drive_7h07nrxy.pt 0.1 0.4" + "experiments/puffer_drive_bot2wl0m.pt 0.01 0.4" + "experiments/puffer_drive_n7mx9f4b.pt 0 0.4" + + "experiments/puffer_drive_9jv4q77m.pt 0.5 0.2" + "experiments/puffer_drive_5p8gpw84.pt 0.1 0.2" + "experiments/puffer_drive_jskw659g.pt 0.01 0.2" + "experiments/puffer_drive_eeyizdrk.pt 0 0.2" +) + +# Parse the values for this array task +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_adaptive_drive --wandb --tag adaptive_transformer \ + --env.num-maps 1000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer + + kill \$HEARTBEAT_PID + " diff --git a/scripts/womd_transformer_ablation.sh b/scripts/womd_transformer_ablation.sh new file mode 100644 index 0000000000..accd07719d --- /dev/null +++ b/scripts/womd_transformer_ablation.sh @@ -0,0 +1,82 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_womd_tfm_ablation +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-11 + +# Ablation study on Transformer architecture parameters +# Varies: context length (1, 32, 91), learning rate (0.003, 0.0003), input_size (64, 128) +# Uses default entropy/discount values with "all" conditioning + +# Grid: 3 context × 2 learning rate × 2 input_size = 12 configurations +# Format: "CTX LR INPUT_SIZE" +ZIPPED_RUNS=( + # ctx=1 + "1 0.003 64" + "1 0.003 128" + "1 0.0003 64" + "1 0.0003 128" + + # ctx=32 + "32 0.003 64" + "32 0.003 128" + "32 0.0003 64" + "32 0.0003 128" + + # ctx=91 + "91 0.003 64" + "91 0.003 128" + "91 0.0003 64" + "91 0.0003 128" +) + +read -r CTX LR INPUT_SIZE <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Calculate minibatch_size: must be <= batch_size (8192 * CTX) and divisible by CTX +# Using 
multiplier 352 (same as 32032/91) for consistency +MINIBATCH_SIZE=$((CTX * 352)) + +# Fixed conditioning values (matching drive.ini defaults) +CONDITION_TYPE="all" +ENTROPY_UB=0.001 +ENTROPY_LB=0 +DISCOUNT_UB=0.98 +DISCOUNT_LB=0.80 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_womd_transformer_ablation \ + --env.num-maps 10000 \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Transformer \ + --policy.input-size $INPUT_SIZE \ + --train.context-length $CTX \ + --train.horizon $CTX \ + --train.learning-rate $LR \ + --train.minibatch-size $MINIBATCH_SIZE \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/tests/test_drive_config.py b/tests/test_drive_config.py index 43b30916f8..486eb802cd 100644 --- a/tests/test_drive_config.py +++ b/tests/test_drive_config.py @@ -79,7 +79,7 @@ def test_drive_ini_config(self): if ASSERTION_LEVEL >= 3: self.assertEqual(args["train"]["total_timesteps"], 3_000_000_000) self.assertEqual(args["train"]["batch_size"], "auto") - self.assertEqual(args["train"]["bptt_horizon"], 91) + self.assertEqual(args["train"]["horizon"], 91) self.assertEqual(args["train"]["minibatch_size"], 11648) self.assertEqual(args["train"]["learning_rate"], 0.001) self.assertEqual(args["train"]["gamma"], 0.98) diff --git a/tests/test_drive_scenarios.py b/tests/test_drive_scenarios.py index 0d538c3c30..45f75a246b 100644 --- a/tests/test_drive_scenarios.py +++ b/tests/test_drive_scenarios.py @@ -20,7 
+20,7 @@ def run_training_test(env_name, config_overrides, target_steps=10000, test_name= "compile": False, "total_timesteps": 100000, "batch_size": 64, - "bptt_horizon": 8, + "horizon": 8, "minibatch_size": 64, "max_minibatch_size": 64, "update_epochs": 1, diff --git a/tests/test_drive_train.py b/tests/test_drive_train.py index f9f16ef648..900f792ebc 100644 --- a/tests/test_drive_train.py +++ b/tests/test_drive_train.py @@ -27,7 +27,7 @@ def test_drive_training(): "compile": False, "total_timesteps": 100000, "batch_size": 64, - "bptt_horizon": 8, + "horizon": 8, "minibatch_size": 64, "max_minibatch_size": 64, "update_epochs": 1,