diff --git a/.gitignore b/.gitignore index 66e4b066be..71497ea416 100644 --- a/.gitignore +++ b/.gitignore @@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4 # Local TODO tracking TODO.md +*.mp4 diff --git a/evaluate_human_logs.py b/evaluate_human_logs.py index a91e8bd2ce..fb743398c3 100644 --- a/evaluate_human_logs.py +++ b/evaluate_human_logs.py @@ -104,7 +104,7 @@ def main(): print("Beginning human evaluations using HumanReplayEvaluator") parser = argparse.ArgumentParser() parser.add_argument("--policy-path", type=str, required=True) - parser.add_argument("--policy-architecture", type=str, default="Recurrent") + parser.add_argument("--rnn-name", type=str, default="Recurrent") parser.add_argument("--num-maps", type=int, default=10) parser.add_argument("--num-rollouts", type=int, default=100) parser.add_argument("--num-agents", type=int, default=64) @@ -119,7 +119,7 @@ def main(): print(f"Evaluation Configuration:") print(f" Policy: {args_parsed.policy_path}") - print(f" Policy Architecture: {args_parsed.policy_architecture}") + print(f" RNN Name: {args_parsed.rnn_name}") print(f" Num maps: {args_parsed.num_maps}") print(f" Total rollouts: {args_parsed.num_rollouts}") print(f" Num agents per env: {args_parsed.num_agents}") @@ -133,14 +133,13 @@ def main(): make_env = env_creator(env_name) scenario_length = 91 - context_length = args_parsed.k_scenarios * scenario_length + horizon = args_parsed.k_scenarios * scenario_length args = { + "rnn_name": args_parsed.rnn_name, "train": { "device": args_parsed.device, - "use_rnn": args_parsed.policy_architecture == "Recurrent", - "policy_architecture": args_parsed.policy_architecture, - "context_window": context_length, + "horizon": horizon, }, "env": { "num_agents": args_parsed.num_agents, @@ -176,10 +175,10 @@ def main(): print("Loading policy...") temp_env = make_env(**args["env"]) - if args_parsed.policy_architecture == "Recurrent": - base_policy = Drive(temp_env, input_size=64, hidden_size=256) + if 
args_parsed.rnn_name == "Recurrent": + base_policy = Drive(temp_env, input_size=128, hidden_size=256) policy = Recurrent(temp_env, base_policy, input_size=256, hidden_size=256).to(args_parsed.device) - elif args_parsed.policy_architecture == "Transformer": + elif args_parsed.rnn_name == "Transformer": base_policy = Drive(temp_env, input_size=128, hidden_size=256) policy = Transformer( temp_env, @@ -188,7 +187,7 @@ def main(): hidden_size=256, num_layers=2, num_heads=4, - context_length=context_length, + horizon=horizon, ).to(args_parsed.device) state_dict = torch.load(args_parsed.policy_path, map_location=args_parsed.device) @@ -242,6 +241,13 @@ def main(): values = [r.get(key, 0) for r in all_results] aggregated[key] = float(np.mean(values)) + # Aggregate scenario-specific metrics (scenario_0_*, scenario_1_*, etc.) + scenario_keys = [k for k in all_keys if k.startswith("scenario_")] + for key in scenario_keys: + values = [r.get(key, 0) for r in all_results] + aggregated[key] = float(np.mean(values)) + + if delta_keys: # Derive last scenario metrics from first + delta # Extract first scenario metrics first_scenario_keys = [k for k in metric_keys if k not in ["n"]] @@ -272,24 +278,22 @@ def main(): json.dump(aggregated, f, indent=2) # Print results - if args_parsed.adaptive_driving_agent and delta_keys: - print(f"\n0-Shot Performance (First Scenario):") - print(f" Score: {aggregated.get('first_scenario_score', float('nan')):.3f}") - print(f" Collision: {aggregated.get('first_scenario_collision_rate', float('nan')):.3f}") - print(f" Offroad: {aggregated.get('first_scenario_offroad_rate', float('nan')):.3f}") - print(f" Return: {aggregated.get('first_scenario_episode_return', float('nan')):.2f}") - - print(f"\nAdapted Performance (Last Scenario):") - print(f" Score: {aggregated.get('last_scenario_score', float('nan')):.3f}") - print(f" Collision: {aggregated.get('last_scenario_collision_rate', float('nan')):.3f}") - print(f" Offroad: 
{aggregated.get('last_scenario_offroad_rate', float('nan')):.3f}") - print(f" Return: {aggregated.get('last_scenario_episode_return', float('nan')):.2f}") - - print(f"\nAdaptive Metrics (Delta):") - print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}") - print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}") - print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}") - print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}") + if args_parsed.adaptive_driving_agent: + # Print per-scenario metrics + for i in range(args_parsed.k_scenarios): + label = "0-Shot" if i == 0 else f"Scenario {i}" + print(f"\n{label} Performance:") + print(f" Score: {aggregated.get(f'scenario_{i}_score', float('nan')):.3f}") + print(f" Collision: {aggregated.get(f'scenario_{i}_collision_rate', float('nan')):.3f}") + print(f" Offroad: {aggregated.get(f'scenario_{i}_offroad_rate', float('nan')):.3f}") + print(f" Return: {aggregated.get(f'scenario_{i}_episode_return', float('nan')):.2f}") + + if delta_keys: + print(f"\nAdaptive Metrics (Delta: Last - First):") + print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}") + print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}") + print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}") + print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}") print(f"\nSaved to {args_parsed.output}") import sys diff --git a/pufferlib/config/default.ini b/pufferlib/config/default.ini index 810de828c0..248eb8d3b3 100644 --- a/pufferlib/config/default.ini +++ b/pufferlib/config/default.ini @@ -49,7 +49,7 @@ minibatch_size = 8192 # Accumulate gradients above this size max_minibatch_size = 32768 -bptt_horizon = 64 +horizon = 64 compile = False compile_mode = max-autotune-no-cudagraphs compile_fullgraph = True @@ -81,7 +81,7 @@ downsample = 10 ; 
mean = 1e8 ; scale = time -; [sweep.train.bptt_horizon] +; [sweep.train.horizon] ; distribution = uniform_pow2 ; min = 16 ; max = 64 diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 598faf50ff..f7ab1b2b0c 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -2,18 +2,20 @@ package = ocean env_name = puffer_adaptive_drive policy_name = Drive -transformer_name = Transformer - ; Changed from rnn_name +policy_architecture = Transformer [vec] num_workers = 16 num_envs = 16 -batch_size = 2 +batch_size = 1 ; backend = Serial [policy] input_size = 128 -; Increased from 64 for richer representations +hidden_size = 256 + +[rnn] +input_size = 256 hidden_size = 256 [transformer] @@ -23,14 +25,13 @@ num_layers = 2 ; Number of transformer layers num_heads = 4 ; Number of attention heads (must divide hidden_size) -; context_length = 182 -; k_scenarios (2) * scenario_length (91) = maximum attention span +; Transformer uses `horizon` from [train] section for attention span dropout = 0.0 ; Dropout (keep at 0 for RL stability initially) [env] -num_agents = 1512 -num_ego_agents = 756 +num_agents = 1024 +num_ego_agents = 512 ; Options: discrete, continuous action_type = discrete ; Options: classic, jerk @@ -60,7 +61,7 @@ k_scenarios = 2 termination_mode = 1 ; 0 - terminate at episode_length, 1 - terminate after all agents have been reset map_dir = "resources/drive/binaries/training" -num_maps = 1000 +num_maps = 10000 ; Determines which step of the trajectory to initialize the agents at upon reset init_steps = 0 ; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only" @@ -68,7 +69,7 @@ control_mode = "control_vehicles" ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" ; train with co players -co_player_enabled = False +co_player_enabled = True [env.conditioning] @@ -87,15 +88,24 @@ discount_weight_ub = 0.98 [env.co_player_policy] 
policy_name = Drive -rnn_name = Recurrent +; Options: "Recurrent", "Transformer" +architecture = Recurrent policy_path = "pufferlib/resources/drive/policies/varied_discount.pt" -input_size = 64 +input_size = 128 hidden_size = 256 [env.co_player_policy.rnn] input_size = 256 hidden_size = 256 +[env.co_player_policy.transformer] +input_size = 256 +hidden_size = 256 +num_layers = 2 +num_heads = 4 +horizon = 91 +dropout = 0.0 + [env.co_player_policy.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" type = "all" @@ -114,16 +124,14 @@ discount_weight_ub = 0.98 seed=42 total_timesteps = 2_000_000_000 anneal_lr = True -; Needs to be: num_agents * num_workers * context_window +; Needs to be: num_agents * num_workers * horizon batch_size = auto minibatch_size = 36400 -; 400 * 91 +; 200 * 182 (must be divisible by horizon = k_scenarios * scenario_length) max_minibatch_size = 36400 minibatch_multiplier = 400 -policy_architecture = Transformer -; Matches scenario_length for buffer organization -bptt_horizon = 32 -; Keep for backward compatibility +; Sequence length - overridden to k_scenarios * scenario_length for adaptive +horizon = 91 adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 @@ -131,7 +139,7 @@ clip_coef = 0.2 ent_coef = 0.005 gae_lambda = 0.95 gamma = 0.98 -learning_rate = 0.0003 -; Reduced from 0.003 (transformers often need lower LR) +learning_rate = 0.003 +; Increased from 0.0003 max_grad_norm = 1.0 prio_alpha = 0.85 @@ -193,6 +201,8 @@ human_replay_num_agents = 32 human_replay_num_rollouts = 100 ; Number of maps to use for human replay evaluation human_replay_num_maps = 100 +; Number of maps to render for human replay (subset of eval maps) +human_replay_render_num_maps = 2 [sweep.train.learning_rate] distribution = log_normal diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 7d5efd43be..af2dfef40c 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -2,7 +2,7 @@ package = ocean env_name =
puffer_drive policy_name = Drive -rnn_name = Transformer +policy_architecture = Transformer [vec] num_workers = 16 @@ -11,12 +11,12 @@ batch_size = 2 ; backend = Serial [policy] -input_size = 64 +input_size = 128 hidden_size = 256 -; [rnn] -; input_size = 256 -; hidden_size = 256 +[rnn] +input_size = 256 +hidden_size = 256 [transformer] input_size = 256 @@ -25,13 +25,12 @@ num_layers = 2 ; Number of transformer layers num_heads = 4 ; Number of attention heads (must divide hidden_size) -context_window = 32 ; k_scenarios (2) * scenario_length (91) = maximum attention span dropout = 0.0 ; Dropout (keep at 0 for RL stability initially) [env] -num_agents = 512 +num_agents = 1024 num_ego_agents = 512 ; Options: discrete, continuous action_type = discrete @@ -47,7 +46,7 @@ goal_radius = 2.0 ; Max target speed in m/s for the agent to maintain towards the goal goal_speed = 100.0 ; What to do when the goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop" -goal_behavior = 1 +goal_behavior = 0 ; Determines the target distance to the new goal in the case of goal_behavior = generate_new_goals. ; Large numbers will select a goal point further away from the agent's current position. 
goal_target_distance = 30.0 @@ -112,16 +111,15 @@ discount_weight_ub = 0.80 [train] seed=42 total_timesteps = 2_000_000_000 -# learning_rate = 0.02 -# gamma = 0.985 anneal_lr = True -; Needs to be: num_agents * num_workers * BPTT horizon +; Needs to be: num_agents * num_workers * horizon batch_size = auto -minibatch_size = 32768 -max_minibatch_size = 32768 -; minibatch_size = 256 -; max_minibatch_size = 256 -bptt_horizon = 32 +minibatch_size = 32760 +; 360 * 91 +max_minibatch_size = 32760 +minibatch_multiplier = 360 +; Sequence length for training - matches scenario_length for full episode context +horizon = 91 adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 @@ -130,17 +128,16 @@ ent_coef = 0.005 gae_lambda = 0.95 gamma = 0.98 learning_rate = 0.003 -max_grad_norm = 1 -prio_alpha = 0.8499999999999999 -prio_beta0 = 0.8499999999999999 +max_grad_norm = 1.0 +prio_alpha = 0.85 +prio_beta0 = 0.85 update_epochs = 1 -vf_clip_coef = 0.1999999999999999 -vf_coef = 2 +vf_clip_coef = 0.2 +vf_coef = 2.0 vtrace_c_clip = 1 vtrace_rho_clip = 1 checkpoint_interval = 100 -use_transformer = True -context_window = 32 +context_length = 32 # Rendering options render = True render_interval = 100 @@ -158,7 +155,7 @@ zoom_in = True render_map = none [eval] -eval_interval = 1000 +eval_interval = 100 ; Path to dataset used for evaluation map_dir = "resources/drive/binaries/training" ; Evaluation will run on the first num_maps maps in the map_dir directory @@ -183,9 +180,17 @@ wosac_sanity_check = False ; Only return aggregate results across all scenes wosac_aggregate_results = True ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) -human_replay_eval = False -; Control only the self-driving car -human_replay_control_mode = "control_sdc_only" +human_replay_eval = True +; Control mode for human replay (control_vehicles with max_controlled_agents=1 controls one agent) +human_replay_control_mode = "control_vehicles" +; Number of agents in human replay 
evaluation environment +human_replay_num_agents = 32 +; Number of rollouts for human replay evaluation +human_replay_num_rollouts = 100 +; Number of maps to use for human replay evaluation +human_replay_num_maps = 100 +; Number of maps to render for human replay (subset of eval maps) +human_replay_render_num_maps = 2 [sweep.train.learning_rate] distribution = log_normal diff --git a/pufferlib/models.py b/pufferlib/models.py index d0b438f9a3..9c6228468b 100644 --- a/pufferlib/models.py +++ b/pufferlib/models.py @@ -208,7 +208,7 @@ def __init__( hidden_size=128, num_layers=4, num_heads=8, - context_length=512, + horizon=512, dropout=0.0, ): """Wraps your policy with a Transformer for temporal modeling. @@ -220,7 +220,7 @@ def __init__( hidden_size: Transformer hidden dimension num_layers: Number of transformer layers num_heads: Number of attention heads - context_length: Maximum sequence length to attend over + horizon: Maximum sequence length to attend over dropout: Dropout probability """ super().__init__() @@ -228,7 +228,7 @@ def __init__( self.policy = policy self.input_size = input_size self.hidden_size = hidden_size - self.context_length = context_length + self.horizon = horizon self.num_layers = num_layers self.num_heads = num_heads self.is_continuous = self.policy.is_continuous @@ -240,7 +240,7 @@ def __init__( self.input_projection = nn.Identity() # Learnable positional embeddings - self.positional_embedding = nn.Parameter(torch.zeros(1, context_length, hidden_size)) + self.positional_embedding = nn.Parameter(torch.zeros(1, horizon, hidden_size)) nn.init.normal_(self.positional_embedding, std=0.02) # Transformer encoder @@ -307,33 +307,29 @@ def forward_eval(self, observations, state): hidden = self.input_projection(hidden) if "transformer_context" not in state or state["transformer_context"] is None: - context = torch.zeros(B, self.context_length, self.hidden_size, device=device) + context = torch.zeros(B, self.horizon, self.hidden_size, device=device) pos 
= torch.zeros(1, dtype=torch.long, device=device) else: context = state["transformer_context"] pos = state.get("transformer_position", torch.zeros(1, dtype=torch.long, device=device)) - if ( - context.shape[-1] != self.hidden_size - or context.shape[0] != B - or context.shape[1] != self.context_length - ): - context = torch.zeros(B, self.context_length, self.hidden_size, device=device) + if context.shape[-1] != self.hidden_size or context.shape[0] != B or context.shape[1] != self.horizon: + context = torch.zeros(B, self.horizon, self.hidden_size, device=device) pos = torch.zeros(1, dtype=torch.long, device=device) - write_idx = (pos % self.context_length).long() + write_idx = (pos % self.horizon).long() context[:, write_idx, :] = hidden.unsqueeze(1) pos = pos + 1 - pos_embed = self.positional_embedding[:, : self.context_length] + pos_embed = self.positional_embedding[:, : self.horizon] context_with_pos = context + pos_embed - causal_mask = self.get_causal_mask(self.context_length, device) + causal_mask = self.get_causal_mask(self.horizon, device) output = self.transformer(context_with_pos, mask=causal_mask, is_causal=True) output = self.output_norm(output) - read_idx = ((pos - 1) % self.context_length).long() + read_idx = ((pos - 1) % self.horizon).long() hidden_out = output[:, read_idx, :].squeeze(1) state["transformer_context"] = context @@ -361,7 +357,7 @@ def forward(self, observations, state): hidden = self.input_projection(hidden) # Remove dynamic truncation - use clamp instead of if - T_actual = min(T, self.context_length) # Python int, fine + T_actual = min(T, self.horizon) # Python int, fine if T_actual < T: hidden = hidden[:, -T_actual:] T = T_actual @@ -390,7 +386,7 @@ def forward(self, observations, state): values = values.view(B, T) # Use Python int for context_len - no sync - context_len = min(T, self.context_length) + context_len = min(T, self.horizon) state["hidden"] = hidden state["transformer_context"] = hidden[:, -context_len:].detach() 
state["transformer_position"] = torch.full((B,), context_len - 1, dtype=torch.long, device=device) diff --git a/pufferlib/ocean/benchmark/evaluator.py b/pufferlib/ocean/benchmark/evaluator.py index dbf84c1906..03a36eabe8 100644 --- a/pufferlib/ocean/benchmark/evaluator.py +++ b/pufferlib/ocean/benchmark/evaluator.py @@ -638,7 +638,7 @@ def rollout(self, args, puffer_env, policy): the policy is with (static) human partners. Args: - args: Config dict with train settings (device, use_rnn, policy_architecture, etc.) + args: Config dict with train settings (device, rnn_name, etc.) puffer_env: PufferLib environment wrapper policy: Trained policy to evaluate @@ -654,24 +654,27 @@ def rollout(self, args, puffer_env, policy): obs, info = puffer_env.reset() - policy_architecture = args["train"].get("policy_architecture", "Recurrent") k_scenarios = args["env"].get("k_scenarios", 1) - if policy_architecture == "Recurrent": + # Detect architecture from policy object + is_transformer = hasattr(policy, "horizon") and hasattr(policy, "transformer") + is_recurrent = hasattr(policy, "lstm") + + if is_recurrent: state = dict( lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), ) - elif policy_architecture == "Transformer": - context_length = args["train"].get("context_window", 182) + elif is_transformer: state = dict( - transformer_context=torch.zeros(num_agents, context_length, policy.hidden_size, device=device), + transformer_context=torch.zeros(num_agents, policy.horizon, policy.hidden_size, device=device), transformer_position=torch.zeros(1, dtype=torch.long, device=device), ) else: state = {} collected_infos = [] + scenario_metrics = {} # Store scenario-specific metrics (scenario_0_*, scenario_1_*, etc.) 
delta_metrics = None # Loop through scenarios @@ -690,7 +693,7 @@ def rollout(self, args, puffer_env, policy): obs, rewards, dones, truncs, info_list = puffer_env.step(action_np) # Reset transformer context on mid-scenario terminations (not at scenario boundaries) - if policy_architecture == "Transformer": + if is_transformer: is_last_step = time_idx == self.sim_steps - 1 if not is_last_step: done_mask = dones | truncs @@ -704,6 +707,9 @@ def rollout(self, args, puffer_env, policy): if isinstance(info_dict, dict): if "ada_delta_score" in info_dict: delta_metrics = info_dict + elif any(k.startswith("scenario_") for k in info_dict.keys()): + # Scenario-specific metrics (scenario_0_*, scenario_1_*, etc.) + scenario_metrics.update(info_dict) elif "score" in info_dict: collected_infos.append(info_dict) @@ -715,6 +721,10 @@ def rollout(self, args, puffer_env, policy): values = [info.get(key, 0) for info in collected_infos] aggregated[key] = np.mean(values) + # Merge scenario-specific metrics if they exist + if scenario_metrics: + aggregated.update(scenario_metrics) + # Merge delta metrics if they exist if delta_metrics: aggregated.update(delta_metrics) diff --git a/pufferlib/ocean/drive/drive.c b/pufferlib/ocean/drive/drive.c index 9f6337051c..bae514790c 100644 --- a/pufferlib/ocean/drive/drive.c +++ b/pufferlib/ocean/drive/drive.c @@ -34,20 +34,18 @@ void test_drivenet() { void demo() { // Note: The settings below are hardcoded for demo purposes. Since the policy was - // trained with these exact settings, that changing them may lead to - // weird behavior. + // trained with these exact settings, changing them may lead to weird behavior. 
Drive env = { .human_agent_idx = 0, - .dynamics_model = conf.dynamics_model, - .reward_vehicle_collision = conf.reward_vehicle_collision, - .reward_offroad_collision = conf.reward_offroad_collision, - .reward_ade = conf.reward_ade, - .goal_radius = conf.goal_radius, - .dt = conf.dt, + .dynamics_model = CLASSIC, + .reward_vehicle_collision = -1.0f, + .reward_offroad_collision = -1.0f, + .goal_radius = 2.0f, + .dt = 0.1f, .map_name = "resources/drive/binaries/training/map_000.bin", - .init_steps = conf.init_steps, - .collision_behavior = conf.collision_behavior, - .offroad_behavior = conf.offroad_behavior, + .init_steps = 0, + .collision_behavior = 0, + .offroad_behavior = 0, }; allocate(&env); c_reset(&env); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 009b2d6206..6b44f49398 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -549,10 +549,16 @@ Entity *load_map_binary(const char *filename, Drive *env) { return NULL; // Read sdc_track_index - fread(&env->sdc_track_index, sizeof(int), 1, file); + if (fread(&env->sdc_track_index, sizeof(int), 1, file) != 1) { + fclose(file); + return NULL; + } // Read tracks_to_predict - fread(&env->num_tracks_to_predict, sizeof(int), 1, file); + if (fread(&env->num_tracks_to_predict, sizeof(int), 1, file) != 1) { + fclose(file); + return NULL; + } if (env->num_tracks_to_predict > 0) { env->tracks_to_predict_indices = (int *)malloc(env->num_tracks_to_predict * sizeof(int)); @@ -569,10 +575,22 @@ Entity *load_map_binary(const char *filename, Drive *env) { Entity *entities = (Entity *)malloc(env->num_entities * sizeof(Entity)); for (int i = 0; i < env->num_entities; i++) { // Read base entity data - fread(&entities[i].scenario_id, sizeof(int), 1, file); - fread(&entities[i].type, sizeof(int), 1, file); - fread(&entities[i].id, sizeof(int), 1, file); - fread(&entities[i].array_size, sizeof(int), 1, file); + if (fread(&entities[i].scenario_id, sizeof(int), 1, file) != 1 || + 
fread(&entities[i].type, sizeof(int), 1, file) != 1 || fread(&entities[i].id, sizeof(int), 1, file) != 1 || + fread(&entities[i].array_size, sizeof(int), 1, file) != 1) { + // File truncated - adjust entity count and break + env->num_entities = i; + env->num_objects = (i < env->num_objects) ? i : env->num_objects; + env->num_roads = env->num_entities - env->num_objects; + break; + } + // Validate array_size is reasonable (max 1000 timesteps = 100s at 0.1s dt) + if (entities[i].array_size <= 0 || entities[i].array_size > 1000) { + env->num_entities = i; + env->num_objects = (i < env->num_objects) ? i : env->num_objects; + env->num_roads = env->num_entities - env->num_objects; + break; + } // Allocate arrays based on type int size = entities[i].array_size; entities[i].traj_x = (float *)malloc(size * sizeof(float)); @@ -773,12 +791,14 @@ void init_grid_map(Drive *env) { float x_center = (env->entities[i].traj_x[j] + env->entities[i].traj_x[j + 1]) / 2; float y_center = (env->entities[i].traj_y[j] + env->entities[i].traj_y[j + 1]) / 2; int grid_index = getGridIndex(env, x_center, y_center); - env->grid_map->cell_entities_count[grid_index]++; + if (grid_index != -1) { + env->grid_map->cell_entities_count[grid_index]++; + } } } } - int cell_entities_insert_index[grid_cell_count]; // Helper array for insertion index - memset(cell_entities_insert_index, 0, grid_cell_count * sizeof(int)); + // Use heap allocation instead of VLA to avoid stack overflow on large maps + int *cell_entities_insert_index = (int *)calloc(grid_cell_count, sizeof(int)); // Initialize grid cells for (int grid_index = 0; grid_index < grid_cell_count; grid_index++) { @@ -804,6 +824,7 @@ void init_grid_map(Drive *env) { } } } + free(cell_entities_insert_index); } void init_neighbor_offsets(Drive *env) { diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 420e726b5e..02c1fe626b 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -79,6 +79,9 
@@ def __init__( if episode_length != None: self.scenario_length = episode_length + # Only set episode_length if not already set (adaptive.py sets it before calling super()) + if not hasattr(self, "episode_length"): + self.episode_length = self.scenario_length # Adaptive driving agent setup self.adaptive_driving_agent = int(adaptive_driving_agent) @@ -223,11 +226,11 @@ def __init__( self._action_type_flag = 0 if action_type == "discrete" else 1 - # Check if resources directory exists - binary_path = f"{map_dir}/map_000.bin" + # Check if resources directory exists (check map_001 since some datasets start at 001) + binary_path = f"{map_dir}/map_001.bin" if not os.path.exists(binary_path): raise FileNotFoundError( - f"Required directory {binary_path} not found. Please ensure the Drive maps are downloaded and installed correctly per docs." + f"Required file {binary_path} not found. Please ensure the Drive maps are downloaded and installed correctly per docs." ) # Check maps availability @@ -407,9 +410,11 @@ def _set_env_variables(self): self.agent_offsets, self.map_ids, self.num_envs = my_shared_tuple self.ego_ids = [i for i in range(self.agent_offsets[-1])] if len(self.ego_ids) != self.num_agents: - raise ValueError( - f"mismatch between number of ego agents {len(self.ego_ids)} and number of agents {self.num_agents}" + print( + f"Warning: requested {self.num_agents} agents but maps contain {len(self.ego_ids)} valid agents. 
Adjusting.", + flush=True, ) + self.num_agents = len(self.ego_ids) self.local_co_player_ids = [[] for i in range(self.num_envs)] self.local_ego_ids = [[0] for i in range(self.num_envs)] @@ -593,6 +598,10 @@ def step(self, actions): scenario_log["scenario_id"] = self.current_scenario self.scenario_metrics.append(scenario_log) + # Log metrics for all scenarios with scenario-specific prefixes + prefixed_log = {f"scenario_{self.current_scenario}_{k}": v for k, v in scenario_log.items() if k != "scenario_id"} + info.append(prefixed_log) + if self.current_scenario == self.k_scenarios - 1: delta_metrics = self._compute_delta_metrics() if delta_metrics: @@ -604,6 +613,10 @@ def step(self, actions): self.current_scenario = (self.current_scenario + 1) % self.k_scenarios + # Reset coplayer LSTM state at scenario boundary so coplayer behaves consistently + if self.population_play: + self._reset_co_player_state() + if self.tick > 0 and self.resample_frequency > 0 and self.tick % self.resample_frequency == 0: self.tick = 0 will_resample = 1 @@ -872,7 +885,11 @@ def save_map_binary(map_data, output_file, unique_map_id): elif obj_type == "cyclist": obj_type = 3 f.write(struct.pack("i", obj_type)) # type - f.write(struct.pack("i", obj.get("id", 0))) # id + # Truncate large IDs to fit in int32 range + obj_id = obj.get("id", 0) + if isinstance(obj_id, int) and (obj_id > 2147483647 or obj_id < -2147483648): + obj_id = obj_id % 2147483647 + f.write(struct.pack("i", obj_id)) # id f.write(struct.pack("i", trajectory_length)) # array_size # Write position arrays positions = obj.get("position", []) @@ -952,7 +969,11 @@ def save_map_binary(map_data, output_file, unique_map_id): road_type = 10 # Write base entity data f.write(struct.pack("i", road_type)) # type - f.write(struct.pack("i", road.get("id", 0))) # id + # Truncate large IDs to fit in int32 range + road_id = road.get("id", 0) + if isinstance(road_id, int) and (road_id > 2147483647 or road_id < -2147483648): + road_id = road_id % 
2147483647 + f.write(struct.pack("i", road_id)) # id f.write(struct.pack("i", size)) # array_size # Write position arrays @@ -1084,7 +1105,8 @@ def test_performance(timeout=10, atn_cache=1024, num_agents=1024): if __name__ == "__main__": # test_performance() # Process the train dataset - process_all_maps(data_folder="/data/processed/training") + # process_all_maps(data_folder="/data/processed/training") + process_all_maps(data_folder="/data/nuplan_gpudrive/nuplan") # Process the validation/test dataset # process_all_maps(data_folder="data/processed/validation") # # Process the validation_interactive dataset diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 4820bd9be0..6f484e641d 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -65,11 +65,15 @@ void CloseVideo(VideoRecorder *recorder) { waitpid(recorder->pid, NULL, 0); } -void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int lasers, int trajectories, - int frame_count, float *path, int show_human_logs, int show_grid, int img_width, int img_height, - int zoom_in) { +void renderTopDownView(Drive *env, Client *client, float map_width, float map_height, int obs, int lasers, + int trajectories, int frame_count, float *path, int show_human_logs, int show_grid, + int img_width, int img_height, int zoom_in, int current_scenario, int total_scenarios) { BeginDrawing(); + // Calculate map center + float center_x = (env->grid_map->top_left_x + env->grid_map->bottom_right_x) / 2.0f; + float center_y = (env->grid_map->top_left_y + env->grid_map->bottom_right_y) / 2.0f; + // Top-down orthographic camera Camera3D camera = {0}; @@ -77,10 +81,11 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int camera.position = (Vector3){0.0f, 0.0f, 500.0f}; // above the scene camera.target = (Vector3){0.0f, 0.0f, 0.0f}; // look at origin camera.fovy = map_height; - } else { // Show full map - camera.position = 
(Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 500.0f}; - camera.target = (Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 0.0f}; - camera.fovy = 2 * map_height; + } else { // Show full map - center camera on map + camera.position = (Vector3){center_x, center_y, 500.0f}; + camera.target = (Vector3){center_x, center_y, 0.0f}; + // Use the larger dimension to ensure full map is visible + camera.fovy = (map_height > map_width) ? map_height * 1.1f : map_width * 1.1f; } camera.up = (Vector3){0.0f, -1.0f, 0.0f}; @@ -133,6 +138,12 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int // Draw scene draw_scene(env, client, 1, obs, lasers, show_grid); EndMode3D(); + + // Draw scenario counter overlay (2D text on top of 3D scene) + char scenario_text[64]; + snprintf(scenario_text, sizeof(scenario_text), "Scenario %d / %d", current_scenario, total_scenarios); + DrawText(scenario_text, 20, 20, 30, WHITE); + EndDrawing(); } @@ -189,18 +200,125 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett return 0; } +// Transform observations from ego format to co-player format by inserting conditioning values +// src_obs: Source observations (may include ego conditioning) +// dst_obs: Destination buffer for co-player format (with co-player conditioning) +// num_agents: Number of agents to transform +// ego_base_dim: Base ego features (7 for CLASSIC, 10 for JERK) +// co_use_rc/ec/dc: Co-player conditioning flags (determines which features to insert) +void transform_obs_for_coplayer(float *src_obs, float *dst_obs, int num_agents, int ego_obs_size, int coplayer_obs_size, + int ego_base_dim, int co_use_rc, int co_use_ec, int co_use_dc, float collision_lb, + float collision_ub, float offroad_lb, float offroad_ub, float goal_lb, float goal_ub, + float entropy_lb, float entropy_ub, float discount_lb, float discount_ub) { + // Fixed sizes for partner and road features (from drive.h constants) + 
int partner_features = (MAX_AGENTS - 1) * PARTNER_FEATURES; + int road_features = MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + int partner_road_features = partner_features + road_features; + + // Derive source conditioning size from ego_obs_size + // This handles cases where ego policy has conditioning (type != "none") + int src_conditioning = ego_obs_size - ego_base_dim - partner_road_features; + + // Calculate destination conditioning size from flags + int dst_conditioning = (co_use_rc ? 3 : 0) + (co_use_ec ? 1 : 0) + (co_use_dc ? 1 : 0); + + for (int i = 0; i < num_agents; i++) { + float *src = src_obs + i * ego_obs_size; + float *dst = dst_obs + i * coplayer_obs_size; + + // Copy ego base features (without conditioning) + memcpy(dst, src, ego_base_dim * sizeof(float)); + + // Sample and insert conditioning values based on flags + // Order must match: reward (3), entropy (1), discount (1) + int cond_idx = ego_base_dim; + if (co_use_rc) { + // Reward conditioning (3 features: collision, offroad, goal) + dst[cond_idx++] = collision_lb + (float)rand() / RAND_MAX * (collision_ub - collision_lb); + dst[cond_idx++] = offroad_lb + (float)rand() / RAND_MAX * (offroad_ub - offroad_lb); + dst[cond_idx++] = goal_lb + (float)rand() / RAND_MAX * (goal_ub - goal_lb); + } + if (co_use_ec) { + // Entropy conditioning (1 feature) + dst[cond_idx++] = entropy_lb + (float)rand() / RAND_MAX * (entropy_ub - entropy_lb); + } + if (co_use_dc) { + // Discount conditioning (1 feature) + dst[cond_idx++] = discount_lb + (float)rand() / RAND_MAX * (discount_ub - discount_lb); + } + + // Copy partner + road features, skipping over any source conditioning + memcpy(dst + ego_base_dim + dst_conditioning, src + ego_base_dim + src_conditioning, + partner_road_features * sizeof(float)); + } +} + +// Helper function for dual-policy forward pass +// Runs ego policy on first num_ego_agents, co-player policy on the rest +// Handles different observation sizes between ego and co-player policies 
+void forward_population(DriveNet *ego_net, DriveNet *co_player_net, float *observations, int *actions, + int num_ego_agents, int num_co_players, int ego_obs_size, int coplayer_obs_size, + int ego_base_dim, int co_use_rc, int co_use_ec, int co_use_dc, float co_collision_lb, + float co_collision_ub, float co_offroad_lb, float co_offroad_ub, float co_goal_lb, + float co_goal_ub, float co_entropy_lb, float co_entropy_ub, float co_discount_lb, + float co_discount_ub) { + if (co_player_net == NULL || num_co_players == 0) { + // Single policy mode - use ego net for all agents + forward(ego_net, observations, actions); + return; + } + + // Allocate temporary buffers for ego observations/actions + float *ego_obs = (float *)malloc(num_ego_agents * ego_obs_size * sizeof(float)); + int *ego_actions = (int *)malloc(num_ego_agents * sizeof(int)); + + // Allocate temporary buffers for co-player observations/actions + float *co_obs_raw = observations + num_ego_agents * ego_obs_size; + float *co_obs_transformed = (float *)malloc(num_co_players * coplayer_obs_size * sizeof(float)); + int *co_actions = (int *)malloc(num_co_players * sizeof(int)); + + // Copy ego observations (already correct format) + memcpy(ego_obs, observations, num_ego_agents * ego_obs_size * sizeof(float)); + + // Transform co-player observations (add conditioning features) + transform_obs_for_coplayer(co_obs_raw, co_obs_transformed, num_co_players, ego_obs_size, coplayer_obs_size, + ego_base_dim, co_use_rc, co_use_ec, co_use_dc, co_collision_lb, co_collision_ub, + co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, co_entropy_lb, co_entropy_ub, + co_discount_lb, co_discount_ub); + + // Run forward on each network + forward(ego_net, ego_obs, ego_actions); + forward(co_player_net, co_obs_transformed, co_actions); + + // Combine actions back + memcpy(actions, ego_actions, num_ego_agents * sizeof(int)); + memcpy(actions + num_ego_agents, co_actions, num_co_players * sizeof(int)); + + // Cleanup + free(ego_obs); + 
free(ego_actions); + free(co_obs_transformed); + free(co_actions); +} + int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in) { + const char *output_agent, int num_maps, int zoom_in, const char *ini_file, int k_scenarios_cli, + int max_controlled_agents_cli, const char *co_player_policy_name, const char *map_dir_cli) { // Parse configuration from INI file env_init_config conf = {0}; - const char *ini_file = "pufferlib/config/ocean/drive.ini"; if (ini_parse(ini_file, handler, &conf) < 0) { fprintf(stderr, "Error: Could not load %s. Cannot determine environment configuration.\n", ini_file); return -1; } + // Override map_dir if provided via CLI + if (map_dir_cli != NULL) { + strncpy(conf.map_dir, map_dir_cli, sizeof(conf.map_dir) - 1); + conf.map_dir[sizeof(conf.map_dir) - 1] = '\0'; + } + char map_buffer[100]; if (map_name == NULL) { srand(time(NULL)); @@ -269,7 +387,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o .entropy_weight_ub = (conf.conditioning != NULL) ? conf.conditioning->entropy_weight_ub : 0.0f, .discount_weight_lb = (conf.conditioning != NULL) ? conf.conditioning->discount_weight_lb : 0.0f, .discount_weight_ub = (conf.conditioning != NULL) ? conf.conditioning->discount_weight_ub : 0.0f, - .max_controlled_agents = 32, + .max_controlled_agents = (max_controlled_agents_cli > 0) ? 
max_controlled_agents_cli : 32, }; allocate(&env); @@ -317,11 +435,92 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o client->cyclist = LoadModel("resources/drive/cyclist.glb"); client->pedestrian = LoadModel("resources/drive/pedestrian.glb"); + // Determine number of ego agents vs co-players + int num_ego_agents = env.active_agent_count; + int num_co_players = 0; + DriveNet *co_player_net = NULL; + + // Co-player conditioning flags (hoisted to outer scope for later use) + int co_use_rc = 0, co_use_ec = 0, co_use_dc = 0; + + // Check if co-player policy is provided (either via CLI or INI) + const char *actual_co_player_policy = co_player_policy_name; + if (actual_co_player_policy == NULL && conf.co_player_enabled && strlen(conf.co_player_policy_path) > 0) { + actual_co_player_policy = conf.co_player_policy_path; + } + + if (actual_co_player_policy != NULL) { + // Population play mode - split agents between ego and co-player + // Use num_ego_agents from config, or default to half + if (conf.num_ego_agents > 0 && conf.num_ego_agents < env.active_agent_count) { + num_ego_agents = conf.num_ego_agents; + } else { + num_ego_agents = env.active_agent_count / 2; + } + num_co_players = env.active_agent_count - num_ego_agents; + + printf("Population play: %d ego agents, %d co-players\n", num_ego_agents, num_co_players); + + // Load co-player policy + FILE *co_policy_file = fopen(actual_co_player_policy, "rb"); + if (co_policy_file != NULL) { + fclose(co_policy_file); + Weights *co_weights = load_weights(actual_co_player_policy); + + // Determine co-player conditioning from config + if (conf.co_player_conditioning != NULL) { + co_use_rc = (strcmp(conf.co_player_conditioning->type, "reward") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + co_use_ec = (strcmp(conf.co_player_conditioning->type, "entropy") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + co_use_dc = 
(strcmp(conf.co_player_conditioning->type, "discount") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + } + + co_player_net = + init_drivenet(co_weights, num_co_players, env.dynamics_model, co_use_rc, co_use_ec, co_use_dc); + printf("Co-player policy loaded with conditioning: rc=%d, ec=%d, dc=%d\n", co_use_rc, co_use_ec, co_use_dc); + } else { + printf("Warning: Could not load co-player policy from %s. Using main policy for all agents.\n", + actual_co_player_policy); + num_ego_agents = env.active_agent_count; + num_co_players = 0; + } + } + + // Extract co-player conditioning bounds from config + float co_collision_lb = 0, co_collision_ub = 0; + float co_offroad_lb = 0, co_offroad_ub = 0; + float co_goal_lb = 0, co_goal_ub = 0; + float co_entropy_lb = 0, co_entropy_ub = 0; + float co_discount_lb = 0, co_discount_ub = 0; + + // Get conditioning dims directly from co_player_net to ensure consistency + int coplayer_num_conditioning = (co_player_net != NULL) ? co_player_net->conditioning_dims : 0; + + if (conf.co_player_conditioning != NULL) { + co_collision_lb = conf.co_player_conditioning->reward_collision_weight_lb; + co_collision_ub = conf.co_player_conditioning->reward_collision_weight_ub; + co_offroad_lb = conf.co_player_conditioning->reward_offroad_weight_lb; + co_offroad_ub = conf.co_player_conditioning->reward_offroad_weight_ub; + co_goal_lb = conf.co_player_conditioning->reward_goal_weight_lb; + co_goal_ub = conf.co_player_conditioning->reward_goal_weight_ub; + co_entropy_lb = conf.co_player_conditioning->entropy_weight_lb; + co_entropy_ub = conf.co_player_conditioning->entropy_weight_ub; + co_discount_lb = conf.co_player_conditioning->discount_weight_lb; + co_discount_ub = conf.co_player_conditioning->discount_weight_ub; + } + + // Load main (ego) policy Weights *weights = load_weights(policy_name); printf("Active agents in map: %d\n", env.active_agent_count); - DriveNet *net = init_drivenet(weights, env.active_agent_count, 
env.dynamics_model, use_rc, use_ec, use_dc); + DriveNet *net = init_drivenet(weights, num_ego_agents, env.dynamics_model, use_rc, use_ec, use_dc); - int frame_count = env.scenario_length > 0 ? env.scenario_length : TRAJECTORY_LENGTH_DEFAULT; + // Calculate frame count: k_scenarios * scenario_length for adaptive agents + int scenario_length = env.scenario_length > 0 ? env.scenario_length : TRAJECTORY_LENGTH_DEFAULT; + int k_scenarios = (k_scenarios_cli > 0) ? k_scenarios_cli : (conf.k_scenarios > 0 ? conf.k_scenarios : 1); + int frame_count = k_scenarios * scenario_length; + printf("Rendering %d scenarios x %d steps = %d total frames\n", k_scenarios, scenario_length, frame_count); char filename_topdown[256]; char filename_agent[256]; @@ -373,16 +572,39 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o } } + // Calculate observation sizes per agent + // ego_base_dim: 7 for CLASSIC dynamics, 10 for JERK dynamics + int ego_base_dim = (env.dynamics_model == 1) ? 
10 : 7; // 1 = JERK + + // Ego observation size (environment generates observations without conditioning for ego) + int ego_obs_size = + net->ego_dim + (MAX_AGENTS - 1) * PARTNER_FEATURES + MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + + // Co-player observation size (includes conditioning features) + int coplayer_obs_size = ego_obs_size; + if (co_player_net != NULL) { + coplayer_obs_size = co_player_net->ego_dim + (MAX_AGENTS - 1) * PARTNER_FEATURES + + MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + } + + printf("Observation sizes: ego=%d, coplayer=%d, ego_base_dim=%d, coplayer_conditioning=%d\n", ego_obs_size, + coplayer_obs_size, ego_base_dim, coplayer_num_conditioning); + if (render_topdown) { printf("Recording topdown view...\n"); for (int i = 0; i < frame_count; i++) { + // Calculate current scenario (1-indexed for display) + int current_scenario = (i / scenario_length) + 1; if (i % frame_skip == 0) { - renderTopDownView(&env, client, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, show_grid, - img_width, img_height, zoom_in); + renderTopDownView(&env, client, map_width, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, + show_grid, img_width, img_height, zoom_in, current_scenario, k_scenarios); WriteFrame(&topdown_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + forward_population(net, co_player_net, env.observations, (int *)env.actions, num_ego_agents, num_co_players, + ego_obs_size, coplayer_obs_size, ego_base_dim, co_use_rc, co_use_ec, co_use_dc, + co_collision_lb, co_collision_ub, co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, + co_entropy_lb, co_entropy_ub, co_discount_lb, co_discount_ub); c_step(&env); } } @@ -400,7 +622,10 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o WriteFrame(&agent_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + forward_population(net, 
co_player_net, env.observations, (int *)env.actions, num_ego_agents, num_co_players, + ego_obs_size, coplayer_obs_size, ego_base_dim, co_use_rc, co_use_ec, co_use_dc, + co_collision_lb, co_collision_ub, co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, + co_entropy_lb, co_entropy_ub, co_discount_lb, co_discount_ub); c_step(&env); } } @@ -424,6 +649,9 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o free_allocated(&env); free_drivenet(net); free(weights); + if (co_player_net != NULL) { + free_drivenet(co_player_net); + } return 0; } @@ -440,10 +668,15 @@ int main(int argc, char *argv[]) { // File paths and num_maps (not in [env] section) const char *map_name = NULL; const char *policy_name = "resources/drive/puffer_drive_weights.bin"; + const char *co_player_policy_name = NULL; const char *output_topdown = NULL; const char *output_agent = NULL; + const char *ini_file = "pufferlib/config/ocean/drive.ini"; + const char *map_dir_cli = NULL; // CLI override for map_dir int num_maps = 1; int scenario_length_cli = -1; + int k_scenarios_cli = -1; + int max_controlled_agents_cli = -1; int use_rc = 0; int use_ec = 0; int use_dc = 0; @@ -515,10 +748,51 @@ int main(int argc, char *argv[]) { num_maps = atoi(argv[i + 1]); i++; } + } else if (strcmp(argv[i], "--ini-file") == 0) { + if (i + 1 < argc) { + ini_file = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --ini-file option requires a file path\n"); + return 1; + } + } else if (strcmp(argv[i], "--k-scenarios") == 0) { + if (i + 1 < argc) { + k_scenarios_cli = atoi(argv[i + 1]); + i++; + } else { + fprintf(stderr, "Error: --k-scenarios option requires a number\n"); + return 1; + } + } else if (strcmp(argv[i], "--max-controlled-agents") == 0) { + if (i + 1 < argc) { + max_controlled_agents_cli = atoi(argv[i + 1]); + i++; + } else { + fprintf(stderr, "Error: --max-controlled-agents option requires a number\n"); + return 1; + } + } else if (strcmp(argv[i], "--co-player-policy") == 
0) { + if (i + 1 < argc) { + co_player_policy_name = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --co-player-policy option requires a file path\n"); + return 1; + } + } else if (strcmp(argv[i], "--map-dir") == 0) { + if (i + 1 < argc) { + map_dir_cli = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --map-dir option requires a directory path\n"); + return 1; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in); + output_agent, num_maps, zoom_in, ini_file, k_scenarios_cli, max_controlled_agents_cli, + co_player_policy_name, map_dir_cli); return 0; } diff --git a/pufferlib/ocean/env_binding.h b/pufferlib/ocean/env_binding.h index b0ea15458f..3262620d60 100644 --- a/pufferlib/ocean/env_binding.h +++ b/pufferlib/ocean/env_binding.h @@ -616,12 +616,18 @@ static PyObject *vec_log(PyObject *self, PyObject *args) { float n = aggregate.n; // Average across EGO agents only if (n > 0) { + // Compute completion_rate from raw totals BEFORE averaging + float total_goals_reached = aggregate.goals_reached_this_episode; + float total_goals_sampled = aggregate.goals_sampled_this_episode; + if (total_goals_sampled > 0) { + aggregate.completion_rate = total_goals_reached / total_goals_sampled; + } else { + aggregate.completion_rate = 0.0f; + } + for (int i = 0; i < num_keys; i++) { ((float *)&aggregate)[i] /= n; } - - // Compute completion_rate from aggregated counts - aggregate.completion_rate = aggregate.goals_reached_this_episode / aggregate.goals_sampled_this_episode; } // User populates dict @@ -632,15 +638,20 @@ static PyObject *vec_log(PyObject *self, PyObject *args) { if (has_co_players && co_player_aggregate.n > 0.0f) { float co_player_n = co_player_aggregate.n; + // Compute co-player completion rate from raw totals BEFORE averaging + float co_total_goals_reached = co_player_aggregate.goals_reached_this_episode; + float co_total_goals_sampled = 
co_player_aggregate.goals_sampled_this_episode; + if (co_total_goals_sampled > 0) { + co_player_aggregate.completion_rate = co_total_goals_reached / co_total_goals_sampled; + } else { + co_player_aggregate.completion_rate = 0.0f; + } + // Average co-player metrics across CO-PLAYER agents only for (int i = 0; i < num_keys; i++) { ((float *)&co_player_aggregate)[i] /= co_player_n; } - // Compute co-player completion rate - co_player_aggregate.completion_rate = - co_player_aggregate.goals_reached_this_episode / co_player_aggregate.goals_sampled_this_episode; - // Add co-player metrics to dict with co_player_ prefix assign_to_dict(dict, "ego_co_player_ratio", n / co_player_n); assign_to_dict(dict, "co_player_completion_rate", co_player_aggregate.completion_rate); diff --git a/pufferlib/ocean/env_config.h b/pufferlib/ocean/env_config.h index 4f26a8b32e..35f2806806 100644 --- a/pufferlib/ocean/env_config.h +++ b/pufferlib/ocean/env_config.h @@ -38,12 +38,18 @@ typedef struct { int goal_behavior; float goal_target_distance; int scenario_length; + int k_scenarios; int termination_mode; int init_steps; int init_mode; int control_mode; char map_dir[256]; conditioning_config *conditioning; + // Population play settings + int co_player_enabled; + int num_ego_agents; + char co_player_policy_path[256]; + conditioning_config *co_player_conditioning; } env_init_config; // INI file parser handler - parses all environment configuration from drive.ini @@ -97,6 +103,8 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->dt = atof(value); } else if (MATCH("env", "scenario_length")) { env_config->scenario_length = atoi(value); + } else if (MATCH("env", "k_scenarios")) { + env_config->k_scenarios = atoi(value); } else if (MATCH("env", "termination_mode")) { env_config->termination_mode = atoi(value); } else if (MATCH("env", "init_steps")) { @@ -175,6 +183,85 @@ static int handler(void *config, const char *section, const char *name, const ch } 
env_config->conditioning->discount_weight_ub = atof(value); } + // Population play settings + else if (MATCH("env", "co_player_enabled")) { + if (strcmp(value, "True") == 0 || strcmp(value, "true") == 0 || strcmp(value, "1") == 0) { + env_config->co_player_enabled = 1; + } else { + env_config->co_player_enabled = 0; + } + } else if (MATCH("env", "num_ego_agents")) { + env_config->num_ego_agents = atoi(value); + } + // Co-player policy settings + else if (MATCH("env.co_player_policy", "policy_path")) { + if (sscanf(value, "\"%255[^\"]\"", env_config->co_player_policy_path) != 1) { + strncpy(env_config->co_player_policy_path, value, sizeof(env_config->co_player_policy_path) - 1); + env_config->co_player_policy_path[sizeof(env_config->co_player_policy_path) - 1] = '\0'; + } + } else if (MATCH("env.co_player_policy.conditioning", "type")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + if (value[0] == '"') { + size_t len = strlen(value) - 2; + env_config->co_player_conditioning->type = (char *)malloc(len + 1); + strncpy(env_config->co_player_conditioning->type, value + 1, len); + env_config->co_player_conditioning->type[len] = '\0'; + } else { + env_config->co_player_conditioning->type = strdup(value); + } + } else if (MATCH("env.co_player_policy.conditioning", "collision_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_collision_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "collision_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_collision_weight_ub = atof(value); + } else if 
(MATCH("env.co_player_policy.conditioning", "offroad_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_offroad_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "offroad_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_offroad_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "goal_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_goal_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "goal_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_goal_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "entropy_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->entropy_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "entropy_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->entropy_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "discount_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + 
env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->discount_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "discount_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->discount_weight_ub = atof(value); + } else { return 0; // Unknown section/name, indicate failure to handle diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 270205ed70..f2affe1427 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -79,10 +79,14 @@ def __init__(self, config, vecenv, policy, logger=None): if config.get("policy_architecture", "Recurrent") == "Recurrent": config["bptt_horizon"] = vecenv.driver_env.episode_length if config.get("policy_architecture", "Recurrent") == "Transformer": - config["context_window"] = self.context_length = vecenv.driver_env.episode_length + config["context_length"] = self.context_length = vecenv.driver_env.episode_length config["bptt_horizon"] = ( vecenv.driver_env.episode_length ) ## this is used downstream so you need to define it too + else: + if config.get("policy_architecture", "Recurrent") == "Transformer": + self.context_length = config["context_length"] + config["bptt_horizon"] = config["context_length"] vecenv.async_reset(seed) obs_space = vecenv.single_observation_space @@ -95,7 +99,7 @@ def __init__(self, config, vecenv, policy, logger=None): if config.get("policy_architecture", "Recurrent") == "Recurrent": batch_size = vecenv.driver_env.num_ego_agents * config["bptt_horizon"] * vecenv.num_workers if config.get("policy_architecture", "Recurrent") == "Transformer": - batch_size = vecenv.driver_env.num_ego_agents * config["context_window"] * vecenv.num_workers + batch_size = vecenv.driver_env.num_ego_agents * config["context_length"] * vecenv.num_workers 
config["batch_size"] = batch_size ## this is dynamic and based on ego agents else: agents_for_calc = total_agents @@ -107,24 +111,24 @@ def __init__(self, config, vecenv, policy, logger=None): if ( config["batch_size"] == "auto" and config.get("bptt_horizon", "auto") == "auto" - and config.get("context_window", "auto") == "auto" + and config.get("context_length", "auto") == "auto" ): - raise pufferlib.APIUsageError("Must specify batch_size, bptt_horizon, or context_window") + raise pufferlib.APIUsageError("Must specify batch_size, bptt_horizon, or context_length") elif config["batch_size"] == "auto": if config.get("policy_architecture", "Recurrent") == "Recurrent": config["batch_size"] = agents_for_calc * config["bptt_horizon"] elif config.get("policy_architecture", "Recurrent") == "Transformer": - config["batch_size"] = agents_for_calc * config["context_window"] + config["batch_size"] = agents_for_calc * config["context_length"] elif ( config.get("bptt_horizon", "auto") == "auto" and config.get("policy_architecture", "Recurrent") == "Recurrent" ): config["bptt_horizon"] = config["batch_size"] // agents_for_calc elif ( - config.get("context_window", "auto") == "auto" + config.get("context_length", "auto") == "auto" and config.get("policy_architecture", "Recurrent") == "Transformer" ): - config["context_window"] = config["batch_size"] // agents_for_calc + config["context_length"] = config["batch_size"] // agents_for_calc batch_size = config["batch_size"] @@ -132,9 +136,9 @@ def __init__(self, config, vecenv, policy, logger=None): if config.get("policy_architecture", "Recurrent") == "Recurrent": horizon = config["bptt_horizon"] elif config.get("policy_architecture", "Recurrent") == "Transformer": - horizon = config["context_window"] + horizon = config["context_length"] else: - horizon = config.get("bptt_horizon", config.get("context_window", 1)) + horizon = config.get("bptt_horizon", config.get("context_length", 1)) config["bptt_horizon"] = horizon # For backward 
compatibility @@ -180,7 +184,7 @@ def __init__(self, config, vecenv, policy, logger=None): ensure_drive_binary() # LSTM - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": h = policy.hidden_size if self.population_play: n = vecenv.ego_agents_per_batch # Use ego agents per batch @@ -193,7 +197,7 @@ def __init__(self, config, vecenv, policy, logger=None): self.lstm_c = {i * n: torch.zeros(n, h, device=device) for i in range(total_agents // n)} # TRANSFORMER - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": h = policy.hidden_size if self.population_play: @@ -345,22 +349,23 @@ def evaluate(self): device = config["device"] # Reset hidden states for both RNN and Transformer - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": for k in self.lstm_h: self.lstm_h[k] = torch.zeros(self.lstm_h[k].shape, device=device) self.lstm_c[k] = torch.zeros(self.lstm_c[k].shape, device=device) - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": h = self.policy.hidden_size for k in self.transformer_context: n = self.transformer_context[k].shape[0] # Pre-allocate full buffer instead of empty - self.transformer_context[k] = torch.zeros(n, self.context_length, h, device=device) + self.transformer_context[k] = torch.zeros(n, self.horizon, h, device=device) self.transformer_position[k] = torch.zeros(1, dtype=torch.long, device=device) self.full_rows = 0 while self.full_rows < self.segments: profile("env", epoch) + print(".", end="", flush=True) # Workaround: visible I/O prevents multiprocessing deadlock o, r, d, t, info, env_id, mask = self.vecenv.recv() # print(f"o shape is {o.shape}", flush = True) if self.population_play: @@ -418,11 +423,11 @@ def evaluate(self): batch_size = 
self.vecenv.agents_per_batch state_key = (env_id.start // batch_size) * batch_size - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": state["lstm_h"] = self.lstm_h[state_key] state["lstm_c"] = self.lstm_c[state_key] - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": state["transformer_context"] = self.transformer_context[state_key] state["transformer_position"] = self.transformer_position[state_key] # Note: terminals not needed for eval since we're doing single-step inference @@ -434,7 +439,7 @@ def evaluate(self): profile("eval_copy", epoch) with torch.no_grad(): # Update hidden states after forward pass - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": if self.population_play: batch_size = self.vecenv.ego_agents_per_batch else: @@ -444,7 +449,7 @@ def evaluate(self): self.lstm_h[lstm_key] = state["lstm_h"] self.lstm_c[lstm_key] = state["lstm_c"] - if config.get("policy_architecture", "Recurrent") == "Transformer": + if config.get("rnn_name", "Recurrent") == "Transformer": if self.population_play: batch_size = self.vecenv.ego_agents_per_batch else: @@ -483,7 +488,7 @@ def evaluate(self): self.ep_lengths[env_id] += 1 # Use appropriate horizon based on model type horizon = ( - config.get("context_window") + config.get("context_length") if config.get("policy_architecture", "Recurrent") == "Transformer" else config["bptt_horizon"] ) @@ -597,8 +602,8 @@ def train(self): # Handle observation reshaping based on model type if ( - not config.get("policy_architecture", "Recurrent") == "Recurrent" - and not config.get("policy_architecture", "Recurrent") == "Transformer" + not config.get("rnn_name", "Recurrent") == "Recurrent" + and not config.get("rnn_name", "Recurrent") == "Transformer" ): # Flatten for non-recurrent models mb_obs = mb_obs.reshape(-1, 
*self.vecenv.single_observation_space.shape) @@ -608,10 +613,10 @@ def train(self): ) # Add appropriate state based on model type - if config.get("policy_architecture", "Recurrent") == "Recurrent": + if config.get("rnn_name", "Recurrent") == "Recurrent": state["lstm_h"] = None state["lstm_c"] = None - elif config.get("policy_architecture", "Recurrent") == "Transformer": + elif config.get("rnn_name", "Recurrent") == "Transformer": state["transformer_context"] = None state["transformer_position"] = None state["terminals"] = mb_terminals # For episode boundary masking @@ -620,8 +625,8 @@ def train(self): # Handle action sampling based on observation shape if ( - config.get("policy_architecture", "Recurrent") == "Recurrent" - or config.get("policy_architecture", "Recurrent") == "Transformer" + config.get("rnn_name", "Recurrent") == "Recurrent" + or config.get("rnn_name", "Recurrent") == "Transformer" ): # Add this right before calling sample_logits if isinstance(logits, tuple): @@ -789,6 +794,38 @@ def train(self): ): pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step) + # Eval rendering (ego vs human logs) + if self.config["eval"].get("human_replay_eval", False): + if self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training: + model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + + if model_files: + latest_cpt = max(model_files, key=os.path.getctime) + bin_path = f"{model_dir}.bin" + + try: + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} + export( + args=export_args, + env_name=self.config["env"], + vecenv=self.vecenv, + policy=self.uncompiled_policy, + path=bin_path, + silent=True, + ) + eval_video_dir = os.path.join(model_dir, "eval_videos") + pufferlib.utils.render_human_replay_videos( + config=self.config, + policy_bin_path=bin_path, + 
output_dir=eval_video_dir, + num_maps=self.config["eval"].get("human_replay_render_num_maps", 3), + logger=self.logger, + global_step=self.global_step, + ) + except Exception as e: + print(f"Failed to render eval videos: {e}") + def mean_and_log(self): config = self.config for k in list(self.stats.keys()): @@ -1255,7 +1292,13 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): elif args["wandb"]: logger = WandbLogger(args) - train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), env_config=args.get("env", {})) + train_config = dict( + **args["train"], + env=env_name, + eval=args.get("eval", {}), + env_config=args.get("env", {}), + policy_architecture=args.get("policy_architecture", "Recurrent"), + ) pufferl = PuffeRL(train_config, vecenv, policy, logger) all_logs = [] @@ -1670,49 +1713,57 @@ def load_policy(args, vecenv, env_name=""): env_module = importlib.import_module(module_name) device = args["train"]["device"] - policy_cls = getattr(env_module.torch, args["policy_name"]) - policy = policy_cls(vecenv.driver_env, **args["policy"]) - - # Handle both RNN and Transformer wrappers - rnn_name = args.get("rnn_name") - transformer_name = args.get("transformer_name") - - if transformer_name is not None: - # Load transformer wrapper - transformer_cls = getattr(env_module.torch, transformer_name) - args["transformer"]["context_length"] = vecenv.driver_env.episode_length - policy = transformer_cls(vecenv.driver_env, policy, **args["transformer"]) - elif rnn_name is not None: - # Load RNN wrapper - rnn_cls = getattr(env_module.torch, rnn_name) - policy = rnn_cls(vecenv.driver_env, policy, **args["rnn"]) - - policy = policy.to(device) load_id = args["load_id"] - if load_id is not None: + load_path = args.get("load_model_path") + state_dict = None + rnn_name = args.get("policy_architecture", "Recurrent") + + if load_path is not None: + state_dict = torch.load(load_path, map_location=device) + state_dict = {k.replace("module.", 
""): v for k, v in state_dict.items()} + elif load_id is not None: if args["neptune"]: path = NeptuneLogger(args, load_id, mode="read-only").download() elif args["wandb"]: path = WandbLogger(args, load_id).download() else: raise pufferlib.APIUsageError("No run id provided for eval") - state_dict = torch.load(path, map_location=device) state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} - policy.load_state_dict(state_dict) - load_path = args["load_model_path"] - if load_path == "latest": - load_path = max(glob.glob(f"experiments/{env_name}*.pt"), key=os.path.getctime) + # Auto-detect architecture from state_dict keys + if state_dict is not None: + if "positional_embedding" in state_dict: + rnn_name = "Transformer" + elif "lstm.weight_ih_l0" in state_dict: + rnn_name = "Recurrent" - if load_path is not None: - state_dict = torch.load(load_path, map_location=device) - state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + policy_cls = getattr(env_module.torch, args["policy_name"]) + policy = policy_cls(vecenv.driver_env, **args["policy"]) + + # Handle both RNN and Transformer wrappers via rnn_name + if rnn_name == "Transformer": + # Load transformer wrapper + transformer_cls = getattr(env_module.torch, rnn_name) + # For adaptive_driving_agent, use episode_length as horizon (k_scenarios * scenario_length) + # Otherwise, use config horizon with fallback to episode_length + is_adaptive = getattr(vecenv.driver_env, "env_name", None) == "adaptive_drive" + if is_adaptive: + args["transformer"]["horizon"] = vecenv.driver_env.episode_length + else: + args["transformer"]["horizon"] = args["train"].get("horizon", vecenv.driver_env.episode_length) + policy = transformer_cls(vecenv.driver_env, policy, **args["transformer"]) + elif rnn_name is not None: + # Load RNN wrapper (Recurrent) + rnn_cls = getattr(env_module.torch, rnn_name) + policy = rnn_cls(vecenv.driver_env, policy, **args["rnn"]) + + policy = policy.to(device) + + # Load the 
state dict if we have one + if state_dict is not None: policy.load_state_dict(state_dict) - # state_path = os.path.join(*load_path.split('/')[:-1], 'state.pt') - # optim_state = torch.load(state_path)['optimizer_state_dict'] - # pufferl.optimizer.load_state_dict(optim_state) return policy diff --git a/pufferlib/resources/drive/binaries/training/map_000.bin b/pufferlib/resources/drive/binaries/training/map_000.bin index 434b98c255..ef87c59af2 100644 Binary files a/pufferlib/resources/drive/binaries/training/map_000.bin and b/pufferlib/resources/drive/binaries/training/map_000.bin differ diff --git a/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin b/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin new file mode 100644 index 0000000000..bb1d3f0233 Binary files /dev/null and b/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin differ diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 8ef3cb4034..eef8c03c7f 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -38,17 +38,21 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): if is_adaptive: # Use evaluate_human_logs.py for adaptive agents with human replay + # Get architecture from config (determines Recurrent vs Transformer) + # Check both policy_architecture and rnn_name for compatibility + rnn_name = config.get("policy_architecture", config.get("rnn_name", "Recurrent")) + cmd = [ sys.executable, "evaluate_human_logs.py", "--policy-path", latest_cpt, - "--policy-architecture", - config.get("policy_architecture", "Transformer"), + "--rnn-name", + rnn_name, "--adaptive-driving-agent", "1", "--k-scenarios", - str(env_config.get("k_scenarios", 2)), + str(env_config.get("k_scenarios", 1)), "--num-agents", str(eval_config.get("human_replay_num_agents", 32)), "--num-maps", @@ -302,9 +306,37 @@ def render_videos(config, vecenv, logger, epoch, global_step, bin_path): env_vars = os.environ.copy() env_vars["ASAN_OPTIONS"] = "exitcode=0" + # Detect if this 
is an adaptive agent + env_name = config.get("env", "") + is_adaptive = "adaptive" in env_name + + # Select correct INI file based on agent type + if is_adaptive: + ini_file = "pufferlib/config/ocean/adaptive.ini" + else: + ini_file = "pufferlib/config/ocean/drive.ini" + # Base command with only visualization flags (env config comes from INI) base_cmd = ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] + # Pass the correct INI file + base_cmd.extend(["--ini-file", ini_file]) + + # Get env config for k_scenarios and co-player settings + env_config = config.get("env_config", {}) + + # Pass k_scenarios for adaptive agents (longer videos) + k_scenarios = env_config.get("k_scenarios", 1) + if k_scenarios > 1: + base_cmd.extend(["--k-scenarios", str(k_scenarios)]) + + # Pass co-player policy if population play is enabled + co_player_enabled = env_config.get("co_player_enabled", False) + if co_player_enabled: + co_player_path = f"resources/drive/{config['env']}_co_player.bin" + if os.path.exists(co_player_path): + base_cmd.extend(["--co-player-policy", co_player_path]) + # Visualization config flags only if config.get("show_grid", False): base_cmd.append("--show-grid") @@ -407,3 +439,135 @@ def render_videos(config, vecenv, logger, epoch, global_step, bin_path): # Clean up bin weights file if os.path.exists(expected_weights_path): os.remove(expected_weights_path) + + +def render_human_replay_videos(config, policy_bin_path, output_dir, num_maps=5, logger=None, global_step=0): + """ + Render videos for human replay evaluation (1 ego agent + human log trajectories). + + In this mode, only one agent is policy-controlled (the ego), while all other agents + follow their logged human trajectories (rendered in GOLD). 
+ + Args: + config: Configuration dictionary with env settings + policy_bin_path: Path to the policy weights .bin file + output_dir: Directory to save output videos + num_maps: Number of maps to render + logger: Optional logger with wandb attribute for logging + global_step: Current training step for wandb logging + + Returns: + List of output video paths + """ + if not os.path.exists(policy_bin_path): + print(f"Policy weights file does not exist: {policy_bin_path}") + return [] + + # Copy the policy weights to where ./visualize expects them. Bound before + # the try so the finally-cleanup below can always reference this name. + expected_weights_path = "resources/drive/puffer_drive_weights.bin" + try: + os.makedirs(output_dir, exist_ok=True) + os.makedirs(os.path.dirname(expected_weights_path), exist_ok=True) + shutil.copy2(policy_bin_path, expected_weights_path) + + env_vars = os.environ.copy() + env_vars["ASAN_OPTIONS"] = "exitcode=0" + + # Get env config + env_config = config.get("env_config", config.get("env", {})) + k_scenarios = env_config.get("k_scenarios", env_config.get("k-scenarios", 1)) + map_dir = env_config.get("map_dir", env_config.get("map-dir", None)) + + # Build command for human replay rendering + cmd = [ + "xvfb-run", + "-a", + "-s", + "-screen 0 1280x720x24", + "./visualize", + "--ini-file", + "pufferlib/config/ocean/adaptive.ini", + "--policy-name", + expected_weights_path, + "--max-controlled-agents", + "1", # Only 1 ego agent + "--k-scenarios", + str(k_scenarios), + "--num-maps", + str(num_maps), + "--log-trajectories", # Show human trajectory logs + "--zoom-in", + "--view", + "both", + "--output-topdown", + "resources/drive/output_topdown.mp4", + "--output-agent", + "resources/drive/output_agent.mp4", + ] + + # Add map_dir override if specified (for NuPlan or other datasets) + if map_dir: + cmd.extend(["--map-dir", map_dir]) + + output_videos = [] + videos_to_log_world = [] + videos_to_log_agent = [] + + print(f"[Human Replay Render] Starting render for {num_maps} maps, map_dir={map_dir}", flush=True) + print(f"[Human Replay Render] 
Command: {' '.join(cmd)}", flush=True) + + for map_idx in range(num_maps): + print(f"[Human Replay Render] Rendering map {map_idx}...", flush=True) + result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=600, env=env_vars) + print(f"[Human Replay Render] Return code: {result.returncode}", flush=True) + if result.stderr: + print(f"[Human Replay Render] stderr: {result.stderr[:500]}", flush=True) + + vids_exist = os.path.exists("resources/drive/output_topdown.mp4") and os.path.exists( + "resources/drive/output_agent.mp4" + ) + print(f"[Human Replay Render] Videos exist: {vids_exist}", flush=True) + + if result.returncode == 0 or (result.returncode == 1 and vids_exist): + videos = [ + ("resources/drive/output_topdown.mp4", f"human_replay_map{map_idx:02d}_topdown.mp4"), + ("resources/drive/output_agent.mp4", f"human_replay_map{map_idx:02d}_agent.mp4"), + ] + + for source_vid, target_filename in videos: + if os.path.exists(source_vid): + target_path = os.path.join(output_dir, target_filename) + shutil.move(source_vid, target_path) + output_videos.append(target_path) + + if logger and hasattr(logger, "wandb") and logger.wandb: + import wandb + + if "topdown" in target_filename: + videos_to_log_world.append(wandb.Video(target_path, format="mp4")) + else: + videos_to_log_agent.append(wandb.Video(target_path, format="mp4")) + else: + print(f"Human replay rendering failed for map {map_idx}: {result.stderr}") + + # Log to wandb + if logger and hasattr(logger, "wandb") and logger.wandb and (videos_to_log_world or videos_to_log_agent): + payload = {} + if videos_to_log_world: + payload["eval/human_replay_world_view"] = videos_to_log_world + if videos_to_log_agent: + payload["eval/human_replay_agent_view"] = videos_to_log_agent + logger.wandb.log(payload, step=global_step) + + return output_videos + + except subprocess.TimeoutExpired: + print("Human replay rendering timed out") + return [] + except Exception as e: + print(f"Failed to render human 
replay videos: {e}") + return [] + finally: + if os.path.exists(expected_weights_path): + os.remove(expected_weights_path) diff --git a/pufferlib/vector.py b/pufferlib/vector.py index c397ec7e16..73a096f7bc 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -848,6 +848,8 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer input_size = co_player_policy.get("input_size", 256) hidden_size = co_player_policy.get("hidden_size", 256) co_player_rnn = co_player_policy.get("rnn", None) + co_player_architecture = co_player_policy.get("architecture", "Recurrent") + co_player_transformer = co_player_policy.get("transformer", {}) # Get conditioning type from env_k co_player_conditioning = co_player_policy.get("conditioning") @@ -914,7 +916,18 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer base_policy = Drive(co_player_env, input_size=input_size, hidden_size=hidden_size) - if co_player_rnn: + if co_player_architecture == "Transformer": + policy = pufferlib.models.TransformerWrapper( + co_player_env, + base_policy, + input_size=co_player_transformer.get("input_size", 256), + hidden_size=co_player_transformer.get("hidden_size", 256), + num_layers=co_player_transformer.get("num_layers", 2), + num_heads=co_player_transformer.get("num_heads", 4), + horizon=co_player_transformer.get("horizon", 91), + dropout=co_player_transformer.get("dropout", 0.0), + ) + elif co_player_rnn: policy = pufferlib.models.LSTMWrapper( co_player_env, base_policy, diff --git a/scripts/adaptive/nuplan_recurrent.sh b/scripts/adaptive/nuplan_recurrent.sh new file mode 100755 index 0000000000..8bb17717e6 --- /dev/null +++ b/scripts/adaptive/nuplan_recurrent.sh @@ -0,0 +1,79 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_nuplan_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH 
--account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on NuPlan with Recurrent architecture +# Uses pre-trained NuPlan Recurrent co-players with varied conditioning +# +# PREREQUISITE: Train co-players first with scripts/coplayers/nuplan_recurrent.sh +# Then update ZIPPED_RUNS with the trained policy paths from wandb + +# Co-player policies trained with scripts/coplayers/nuplan_recurrent.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_mwiatx5g.pt 0.5 0.8" + "experiments/puffer_drive_old90mw2.pt 0.1 0.8" + "TODO_FAILED 0.01 0.8" + "TODO_FAILED 0 0.8" + + "experiments/puffer_drive_hous32qj.pt 0.5 0.6" + "experiments/puffer_drive_dhxoorxc.pt 0.1 0.6" + "experiments/puffer_drive_0h5radmw.pt 0.01 0.6" + "experiments/puffer_drive_elet1zj8.pt 0 0.6" + + "experiments/puffer_drive_a57umusk.pt 0.5 0.4" + "experiments/puffer_drive_zozl26ek.pt 0.1 0.4" + "experiments/puffer_drive_jco8adma.pt 0.01 0.4" + "experiments/puffer_drive_5hhwfmmt.pt 0 0.4" + + "experiments/puffer_drive_a56dgt1x.pt 0.5 0.2" + "experiments/puffer_drive_dqzqt7qx.pt 0.1 0.2" + "experiments/puffer_drive_qetoozcn.pt 0.01 0.2" + "experiments/puffer_drive_q30rjcbu.pt 0 0.2" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag adaptive_nuplan_recurrent_k2 \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Recurrent \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/adaptive/nuplan_transformer.sh b/scripts/adaptive/nuplan_transformer.sh new file mode 100755 index 0000000000..4c5f4806a0 --- /dev/null +++ b/scripts/adaptive/nuplan_transformer.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_nuplan_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on NuPlan with Transformer architecture +# Uses pre-trained NuPlan Transformer co-players with varied conditioning +# +# PREREQUISITE: Train co-players first with scripts/coplayers/nuplan_transformer.sh +# Then update ZIPPED_RUNS with the trained policy paths from wandb + +# Co-player policies trained with scripts/coplayers/nuplan_transformer.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( +"experiments/puffer_drive_joqbmi4s.pt 0.01 0.2" +"experiments/puffer_drive_0h81rtfi.pt 0 0.4" +"experiments/puffer_drive_medzmgum.pt 0.1 0.2" +"experiments/puffer_drive_f4a8yoi9.pt 0.5 0.2" +"experiments/puffer_drive_b1yx43w2.pt 0.01 0.4" 
+"experiments/puffer_drive_lv4x8hlt.pt 0.1 0.4" +"experiments/puffer_drive_j51yz49e.pt 0.5 0.4" +"experiments/puffer_drive_iry1wanp.pt 0 0.6" +"experiments/puffer_drive_kx8bhu3v.pt 0.01 0.6" +"experiments/puffer_drive_js4mf85k.pt 0.1 0.6" +"experiments/puffer_drive_52u5onve.pt 0.5 0.6" +"experiments/puffer_drive_nu7lkmx4.pt 0 0.8" +"experiments/puffer_drive_f8dpkpbq.pt 0.01 0.8" +"experiments/puffer_drive_zxsxu6z7.pt 0.1 0.8" +"experiments/puffer_drive_mfahi5bc.pt 0.5 0.8" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag adaptive_nuplan_transformer_1apr \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.architecture Transformer \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/adaptive/womd_recurrent.sh b/scripts/adaptive/womd_recurrent.sh new file mode 100755 index 0000000000..96a25ae967 --- /dev/null +++ b/scripts/adaptive/womd_recurrent.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_womd_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out 
+#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on WOMD with Recurrent architecture +# Uses pre-trained WOMD Recurrent co-players with varied conditioning + +# Co-player policies trained with scripts/coplayers/womd_recurrent.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_o9brlhxk.pt 0.5 0.8" + "experiments/puffer_drive_ufgi4za3.pt 0.1 0.8" + "experiments/puffer_drive_h97urhep.pt 0.01 0.8" + "experiments/puffer_drive_sa0w31jc.pt 0 0.8" + + "experiments/puffer_drive_ojounplt.pt 0.5 0.6" + "experiments/puffer_drive_5tr8vzex.pt 0.1 0.6" + "experiments/puffer_drive_wr30n6a9.pt 0.01 0.6" + "experiments/puffer_drive_use1k7kc.pt 0 0.6" + + "experiments/puffer_drive_nb2vowrz.pt 0.5 0.4" + "experiments/puffer_drive_e3e6vion.pt 0.1 0.4" + "experiments/puffer_drive_yyu6icxx.pt 0.01 0.4" + "experiments/puffer_drive_iuzoucs0.pt 0 0.4" + + "experiments/puffer_drive_u8kntkem.pt 0.5 0.2" + "TODO_FAILED 0.1 0.2" + "experiments/puffer_drive_f8ccc1f1.pt 0.01 0.2" + "experiments/puffer_drive_04p394xa.pt 0 0.2" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag adaptive_womd_recurrent_k2 \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Recurrent + + kill \$HEARTBEAT_PID + " diff --git a/scripts/adaptive/womd_transformer.sh b/scripts/adaptive/womd_transformer.sh new file mode 100755 index 0000000000..1adcda4ca9 --- /dev/null +++ b/scripts/adaptive/womd_transformer.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#SBATCH --job-name=adaptive_womd_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train adaptive agents on WOMD with Transformer architecture +# Uses pre-trained WOMD Transformer co-players with varied conditioning +# +# PREREQUISITE: Train co-players first with scripts/coplayers/womd_transformer.sh +# Then update ZIPPED_RUNS with the trained policy paths from wandb + +# Co-player policies trained with scripts/coplayers/womd_transformer.sh +# Each entry: "policy_path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_zagelrzs.pt 0.5 0.8" + "experiments/puffer_drive_d8kb6hwf.pt 0.1 0.8" + "experiments/puffer_drive_xdmwezaw.pt 0.01 0.8" + "experiments/puffer_drive_0cxi9nf8.pt 0 0.8" + + "experiments/puffer_drive_t69evoxz.pt 0.5 0.6" + "experiments/puffer_drive_yuuod9cn.pt 0.1 0.6" + "experiments/puffer_drive_436bzeu2.pt 0.01 0.6" + 
"experiments/puffer_drive_ct49w01c.pt 0 0.6" + + "experiments/puffer_drive_1e54zwgz.pt 0.5 0.4" + "experiments/puffer_drive_epupe6sw.pt 0.1 0.4" + "experiments/puffer_drive_npqu25y1.pt 0.01 0.4" + "experiments/puffer_drive_v9urng8s.pt 0 0.4" + + "experiments/puffer_drive_fugsjie2.pt 0.5 0.2" + "experiments/puffer_drive_iejlfoo7.pt 0.1 0.2" + "experiments/puffer_drive_rl7e091t.pt 0.01 0.2" + "experiments/puffer_drive_vztz9mmh.pt 0 0.2" +) + +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag adaptive_womd_transformer_new \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/nuplan_recurrent.sh b/scripts/baselines/nuplan_recurrent.sh new file mode 100755 index 0000000000..7070d9270c --- /dev/null +++ b/scripts/baselines/nuplan_recurrent.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --job-name=baseline_nuplan_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH 
--account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on NuPlan with Recurrent architecture +# No co-players - other vehicles follow recorded human trajectories +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag baseline_nuplan_recurrent \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Recurrent \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/nuplan_transformer.sh b/scripts/baselines/nuplan_transformer.sh new file mode 100755 index 0000000000..7977cb4d4f --- /dev/null +++ b/scripts/baselines/nuplan_transformer.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --job-name=baseline_nuplan_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on NuPlan with Transformer architecture +# No co-players - other vehicles follow recorded human trajectories +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay 
"$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_adaptive_drive --wandb --tag baseline_nuplan_transformer \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Transformer \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/womd_recurrent.sh b/scripts/baselines/womd_recurrent.sh new file mode 100755 index 0000000000..f20ad727b1 --- /dev/null +++ b/scripts/baselines/womd_recurrent.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=baseline_womd_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on WOMD with Recurrent architecture +# No co-players - other vehicles follow recorded human trajectories + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag baseline_womd_recurrent \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Recurrent + + kill \$HEARTBEAT_PID + " diff --git a/scripts/baselines/womd_transformer.sh b/scripts/baselines/womd_transformer.sh new file mode 100755 index 0000000000..91c028a652 --- /dev/null +++ b/scripts/baselines/womd_transformer.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=baseline_womd_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline on WOMD with Transformer architecture +# No co-players - other vehicles follow recorded human trajectories + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_adaptive_drive --wandb --tag baseline_womd_transformer \ + --env.num-maps 10000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 \ + --rnn-name Transformer + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/nuplan_recurrent.sh b/scripts/coplayers/nuplan_recurrent.sh new file mode 100755 index 0000000000..8e016b201c --- /dev/null +++ b/scripts/coplayers/nuplan_recurrent.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_nuplan_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan with Recurrent architecture +# Varies entropy/discount conditioning for diverse co-player behaviors +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_nuplan_recurrent \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Recurrent \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/nuplan_transformer.sh b/scripts/coplayers/nuplan_transformer.sh new file mode 100755 index 0000000000..465f1bf9eb --- /dev/null +++ b/scripts/coplayers/nuplan_transformer.sh @@ -0,0 +1,80 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_nuplan_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan with Transformer architecture +# Varies entropy/discount conditioning for diverse co-player behaviors +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + 
"$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_nuplan_transformer_lr3e4 \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --policy-architecture Transformer \ + --train.context-length 91 \ + --train.horizon 91 \ + --train.learning-rate 0.003 \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/womd_recurrent.sh b/scripts/coplayers/womd_recurrent.sh new file mode 100755 index 0000000000..8f33fce9ff --- /dev/null +++ b/scripts/coplayers/womd_recurrent.sh @@ -0,0 +1,71 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_womd_rnn +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on WOMD with Recurrent architecture +# Varies entropy/discount conditioning for diverse co-player behaviors + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + 
+singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_womd_recurrent \ + --env.num-maps 10000 \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Recurrent \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/coplayers/womd_transformer.sh b/scripts/coplayers/womd_transformer.sh new file mode 100755 index 0000000000..1260547c00 --- /dev/null +++ b/scripts/coplayers/womd_transformer.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_womd_tfm +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on WOMD with Transformer architecture +# Varies entropy/discount conditioning for diverse co-player behaviors + +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set 
-e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_womd_transformer_lr3e4 \ + --env.num-maps 10000 \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Transformer \ + --train.horizon 91 \ + --train.context-length 91 \ + --train.learning-rate 0.003 \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_baseline.sh b/scripts/run_baseline.sh new file mode 100755 index 0000000000..7e2f5b5e60 --- /dev/null +++ b/scripts/run_baseline.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=puffer_baseline +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline training script +# No co-players - other vehicles follow recorded human trajectories from Waymo data +# For comparison with co-player experiments in run.sh + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_adaptive_drive --wandb --tag adaptive_baseline \ + --env.num-maps 1000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_nuplan_coplayers.sh b/scripts/run_nuplan_coplayers.sh new file mode 100755 index 0000000000..8d821ec8a2 --- /dev/null +++ b/scripts/run_nuplan_coplayers.sh @@ -0,0 +1,84 @@ +#!/bin/bash +#SBATCH --job-name=nuplan_coplayer +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan data with varying entropy/discount conditioning +# These trained policies can later be used as co-players for adaptive agent training +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" +# +# This will create: resources/drive/binaries/nuplan/map_*.bin (dataset_name derived from folder) + +# Define configurations for each array task ID +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +# Each entry: "entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +# Parse the values for this array task +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +# NuPlan has ~5235 maps +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + 
"$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_drive --wandb --tag nuplan_coplayer_training \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_transformer_adaptive.sh b/scripts/run_transformer_adaptive.sh new file mode 100755 index 0000000000..82afefb8b3 --- /dev/null +++ b/scripts/run_transformer_adaptive.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#SBATCH --job-name=puffer_transformer +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Transformer-based adaptive agent training with co-players +# Same co-player configurations as run.sh, but using Transformer architecture + +# Define configurations for each array task ID +# Each entry: "path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_8u92j3ts/model_puffer_drive_003000.pt 0.5 0.8" + "experiments/puffer_drive_x4xs711x.pt 0.1 0.8" + "experiments/puffer_drive_hhzdzhl8.pt 0.01 0.8" + "experiments/puffer_drive_3xd48djp.pt 0 0.8" + + "experiments/puffer_drive_fgglgofu.pt 0.5 0.6" + "experiments/puffer_drive_g3x9e5rn.pt 0.01 0.6" + "experiments/puffer_drive_gzuuzs0o.pt 0.1 
0.6" + "experiments/puffer_drive_6nzf7xha.pt 0 0.6" + + "experiments/puffer_drive_3iefv59j.pt 0.5 0.4" + "experiments/puffer_drive_7h07nrxy.pt 0.1 0.4" + "experiments/puffer_drive_bot2wl0m.pt 0.01 0.4" + "experiments/puffer_drive_n7mx9f4b.pt 0 0.4" + + "experiments/puffer_drive_9jv4q77m.pt 0.5 0.2" + "experiments/puffer_drive_5p8gpw84.pt 0.1 0.2" + "experiments/puffer_drive_jskw659g.pt 0.01 0.2" + "experiments/puffer_drive_eeyizdrk.pt 0 0.2" +) + +# Parse the values for this array task +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_adaptive_drive --wandb --tag adaptive_transformer \ + --env.num-maps 1000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer + + kill \$HEARTBEAT_PID + " diff --git a/scripts/womd_transformer_ablation.sh b/scripts/womd_transformer_ablation.sh new file mode 100644 index 0000000000..accd07719d --- /dev/null +++ b/scripts/womd_transformer_ablation.sh @@ -0,0 +1,82 @@ +#!/bin/bash +#SBATCH --job-name=coplayer_womd_tfm_ablation +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_advanced +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-11 + +# Ablation study on Transformer architecture parameters +# Varies: context length (1, 32, 91), learning rate (0.003, 0.0003), input_size (64, 128) +# Uses default entropy/discount values with "all" conditioning + +# Grid: 3 context × 2 learning rate × 2 input_size = 12 configurations +# Format: "CTX LR INPUT_SIZE" +ZIPPED_RUNS=( + # ctx=1 + "1 0.003 64" + "1 0.003 128" + "1 0.0003 64" + "1 0.0003 128" + + # ctx=32 + "32 0.003 64" + "32 0.003 128" + "32 0.0003 64" + "32 0.0003 128" + + # ctx=91 + "91 0.003 64" + "91 0.003 128" + "91 0.0003 64" + "91 0.0003 128" +) + +read -r CTX LR INPUT_SIZE <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Calculate minibatch_size: must be <= batch_size (8192 * CTX) and divisible by CTX +# Using 
multiplier 352 (same as 32032/91) for consistency +MINIBATCH_SIZE=$((CTX * 352)) + +# Fixed conditioning values (matching drive.ini defaults) +CONDITION_TYPE="all" +ENTROPY_UB=0.001 +ENTROPY_LB=0 +DISCOUNT_UB=0.98 +DISCOUNT_LB=0.80 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + + puffer train puffer_drive --wandb --wandb-project ada_new_coplayers --tag coplayer_womd_transformer_ablation \ + --env.num-maps 10000 \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB \ + --rnn-name Transformer \ + --policy.input-size $INPUT_SIZE \ + --train.context-length $CTX \ + --train.horizon $CTX \ + --train.learning-rate $LR \ + --train.minibatch-size $MINIBATCH_SIZE \ + --train.checkpoint-interval 50 + + kill \$HEARTBEAT_PID + " diff --git a/tests/test_drive_config.py b/tests/test_drive_config.py index 43b30916f8..486eb802cd 100644 --- a/tests/test_drive_config.py +++ b/tests/test_drive_config.py @@ -79,7 +79,7 @@ def test_drive_ini_config(self): if ASSERTION_LEVEL >= 3: self.assertEqual(args["train"]["total_timesteps"], 3_000_000_000) self.assertEqual(args["train"]["batch_size"], "auto") - self.assertEqual(args["train"]["bptt_horizon"], 91) + self.assertEqual(args["train"]["horizon"], 91) self.assertEqual(args["train"]["minibatch_size"], 11648) self.assertEqual(args["train"]["learning_rate"], 0.001) self.assertEqual(args["train"]["gamma"], 0.98) diff --git a/tests/test_drive_scenarios.py b/tests/test_drive_scenarios.py index 0d538c3c30..45f75a246b 100644 --- a/tests/test_drive_scenarios.py +++ b/tests/test_drive_scenarios.py @@ -20,7 
+20,7 @@ def run_training_test(env_name, config_overrides, target_steps=10000, test_name= "compile": False, "total_timesteps": 100000, "batch_size": 64, - "bptt_horizon": 8, + "horizon": 8, "minibatch_size": 64, "max_minibatch_size": 64, "update_epochs": 1, diff --git a/tests/test_drive_train.py b/tests/test_drive_train.py index f9f16ef648..900f792ebc 100644 --- a/tests/test_drive_train.py +++ b/tests/test_drive_train.py @@ -27,7 +27,7 @@ def test_drive_training(): "compile": False, "total_timesteps": 100000, "batch_size": 64, - "bptt_horizon": 8, + "horizon": 8, "minibatch_size": 64, "max_minibatch_size": 64, "update_epochs": 1,