Emerge-Lab · m2kulkarni · Feb 24, 2026 · Feb 24, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4
 
 # Local TODO tracking
 TODO.md
+*.mp4
diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini
@@ -2,18 +2,20 @@
 package = ocean
 env_name = puffer_adaptive_drive
 policy_name = Drive
-transformer_name = Transformer
- ; Changed from rnn_name
+rnn_name = Recurrent
 
 [vec]
 num_workers = 16
 num_envs = 16
-batch_size = 2
+batch_size = 1
 ; backend = Serial
 
 [policy]
-input_size = 128
-; Increased from 64 for richer representations
+input_size = 64
+hidden_size = 256
+
+[rnn]
+input_size = 256
 hidden_size = 256
 
 [transformer]
@@ -29,8 +31,8 @@ dropout = 0.0
 ; Dropout (keep at 0 for RL stability initially)
 
 [env]
-num_agents = 1512
-num_ego_agents = 756
+num_agents = 1024
+num_ego_agents = 512
 ; Options: discrete, continuous
 action_type = discrete
 ; Options: classic, jerk
@@ -120,7 +122,7 @@ minibatch_size = 36400
 ; 400 * 91
 max_minibatch_size = 36400
 minibatch_multiplier = 400
-policy_architecture = Transformer
+policy_architecture = Recurrent
 ; Matches scenario_length for buffer organization
 bptt_horizon = 32
 ; Keep for backward compatibility
@@ -131,7 +133,7 @@ clip_coef = 0.2
 ent_coef = 0.005
 gae_lambda = 0.95
 gamma = 0.98
-learning_rate = 0.0003
+learning_rate = 0.003
-; Reduced from 0.003 (transformers often need lower LR)
+; Restored to 0.003 for the recurrent policy (was 0.0003 with the transformer)
 max_grad_norm = 1.0
 prio_alpha = 0.85
@@ -193,6 +195,8 @@ human_replay_num_agents = 32
 human_replay_num_rollouts = 100
 ; Number of maps to use for human replay evaluation
 human_replay_num_maps = 100
+; Number of maps to render for human replay (subset of eval maps)
+human_replay_render_num_maps = 3
 
 [sweep.train.learning_rate]
 distribution = log_normal

diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
@@ -2,7 +2,7 @@
 package = ocean
 env_name = puffer_drive
 policy_name = Drive
-rnn_name = Transformer
+rnn_name = Recurrent
 
 [vec]
 num_workers = 16
@@ -14,9 +14,9 @@ batch_size = 2
 input_size = 64
 hidden_size = 256
 
-; [rnn]
-; input_size = 256
-; hidden_size = 256
+[rnn]
+input_size = 256
+hidden_size = 256
 
 [transformer]
 input_size = 256
@@ -112,15 +112,13 @@ discount_weight_ub = 0.80
 [train]
 seed=42
 total_timesteps = 2_000_000_000
-# learning_rate = 0.02
-# gamma = 0.985
 anneal_lr = True
-; Needs to be: num_agents * num_workers * BPTT horizon
+; Needs to be: num_agents * num_workers * bptt_horizon
 batch_size = auto
 minibatch_size = 32768
 max_minibatch_size = 32768
-; minibatch_size = 256
-; max_minibatch_size = 256
+minibatch_multiplier = 400
+policy_architecture = Recurrent
 bptt_horizon = 32
 adam_beta1 = 0.9
 adam_beta2 = 0.999
@@ -130,17 +128,15 @@ ent_coef = 0.005
 gae_lambda = 0.95
 gamma = 0.98
 learning_rate = 0.003
-max_grad_norm = 1
-prio_alpha = 0.8499999999999999
-prio_beta0 = 0.8499999999999999
+max_grad_norm = 1.0
+prio_alpha = 0.85
+prio_beta0 = 0.85
 update_epochs = 1
-vf_clip_coef = 0.1999999999999999
-vf_coef = 2
+vf_clip_coef = 0.2
+vf_coef = 2.0
 vtrace_c_clip = 1
 vtrace_rho_clip = 1
-checkpoint_interval = 100
-use_transformer = True
-context_window = 32
+checkpoint_interval = 10
 # Rendering options
 render = True
 render_interval = 100
@@ -184,8 +180,16 @@ wosac_sanity_check = False
 wosac_aggregate_results = True
 ; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
 human_replay_eval = False
-; Control only the self-driving car
-human_replay_control_mode = "control_sdc_only"
+; Control mode for human replay (control_vehicles with max_controlled_agents=1 controls one agent)
+human_replay_control_mode = "control_vehicles"
+; Number of agents in human replay evaluation environment
+human_replay_num_agents = 32
+; Number of rollouts for human replay evaluation
+human_replay_num_rollouts = 100
+; Number of maps to use for human replay evaluation
+human_replay_num_maps = 100
+; Number of maps to render for human replay (subset of eval maps)
+human_replay_render_num_maps = 3
 
 [sweep.train.learning_rate]
 distribution = log_normal

diff --git a/pufferlib/ocean/drive/drive.c b/pufferlib/ocean/drive/drive.c
@@ -34,20 +34,18 @@ void test_drivenet() {
 void demo() {
 
     // Note: The settings below are hardcoded for demo purposes. Since the policy was
-    // trained with these exact settings, that changing them may lead to
-    // weird behavior.
+    // trained with these exact settings, changing them may lead to weird behavior.
     Drive env = {
         .human_agent_idx = 0,
-        .dynamics_model = conf.dynamics_model,
-        .reward_vehicle_collision = conf.reward_vehicle_collision,
-        .reward_offroad_collision = conf.reward_offroad_collision,
-        .reward_ade = conf.reward_ade,
-        .goal_radius = conf.goal_radius,
-        .dt = conf.dt,
+        .dynamics_model = CLASSIC,
+        .reward_vehicle_collision = -1.0f,
+        .reward_offroad_collision = -1.0f,
+        .goal_radius = 2.0f,
+        .dt = 0.1f,
         .map_name = "resources/drive/binaries/training/map_000.bin",
-        .init_steps = conf.init_steps,
-        .collision_behavior = conf.collision_behavior,
-        .offroad_behavior = conf.offroad_behavior,
+        .init_steps = 0,
+        .collision_behavior = 0,
+        .offroad_behavior = 0,
     };
     allocate(&env);
     c_reset(&env);

diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py
@@ -1084,7 +1084,8 @@ def test_performance(timeout=10, atn_cache=1024, num_agents=1024):
 if __name__ == "__main__":
     # test_performance()
     # Process the train dataset
-    process_all_maps(data_folder="/data/processed/training")
+    # process_all_maps(data_folder="/data/processed/training")
+    process_all_maps(data_folder="/data/nuplan_gpudrive/nuplan")
     # Process the validation/test dataset
     # process_all_maps(data_folder="data/processed/validation")
     # # Process the validation_interactive dataset
Original file line number	Diff line number	Diff line change
Expand Up		@@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4

		# Local TODO tracking
		TODO.md
		*.mp4