diff --git a/.gitignore b/.gitignore index 66e4b066be..71497ea416 100644 --- a/.gitignore +++ b/.gitignore @@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4 # Local TODO tracking TODO.md +*.mp4 diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 598faf50ff..8abea752a9 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -2,18 +2,20 @@ package = ocean env_name = puffer_adaptive_drive policy_name = Drive -transformer_name = Transformer - ; Changed from rnn_name +rnn_name = Recurrent [vec] num_workers = 16 num_envs = 16 -batch_size = 2 +batch_size = 1 ; backend = Serial [policy] -input_size = 128 -; Increased from 64 for richer representations +input_size = 64 +hidden_size = 256 + +[rnn] +input_size = 256 hidden_size = 256 [transformer] @@ -29,8 +31,8 @@ dropout = 0.0 ; Dropout (keep at 0 for RL stability initially) [env] -num_agents = 1512 -num_ego_agents = 756 +num_agents = 1024 +num_ego_agents = 512 ; Options: discrete, continuous action_type = discrete ; Options: classic, jerk @@ -120,7 +122,7 @@ minibatch_size = 36400 ; 400 * 91 max_minibatch_size = 36400 minibatch_multiplier = 400 -policy_architecture = Transformer +policy_architecture = Recurrent ; Matches scenario_length for buffer organization bptt_horizon = 32 ; Keep for backward compatibility @@ -131,7 +133,7 @@ clip_coef = 0.2 ent_coef = 0.005 gae_lambda = 0.95 gamma = 0.98 -learning_rate = 0.0003 +learning_rate = 0.003 -; Reduced from 0.003 (transformers often need lower LR) +; Raised back to 0.003 for the Recurrent policy max_grad_norm = 1.0 prio_alpha = 0.85 @@ -193,6 +195,8 @@ human_replay_num_agents = 32 human_replay_num_rollouts = 100 ; Number of maps to use for human replay evaluation human_replay_num_maps = 100 +; Number of maps to render for human replay (subset of eval maps) +human_replay_render_num_maps = 3 [sweep.train.learning_rate] distribution = log_normal diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 
7d5efd43be..c9a9eaf5c5 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -2,7 +2,7 @@ package = ocean env_name = puffer_drive policy_name = Drive -rnn_name = Transformer +rnn_name = Recurrent [vec] num_workers = 16 @@ -14,9 +14,9 @@ batch_size = 2 input_size = 64 hidden_size = 256 -; [rnn] -; input_size = 256 -; hidden_size = 256 +[rnn] +input_size = 256 +hidden_size = 256 [transformer] input_size = 256 @@ -112,15 +112,13 @@ discount_weight_ub = 0.80 [train] seed=42 total_timesteps = 2_000_000_000 -# learning_rate = 0.02 -# gamma = 0.985 anneal_lr = True ; Needs to be: num_agents * num_workers * BPTT horizon batch_size = auto minibatch_size = 32768 max_minibatch_size = 32768 -; minibatch_size = 256 -; max_minibatch_size = 256 +minibatch_multiplier = 400 +policy_architecture = Recurrent bptt_horizon = 32 adam_beta1 = 0.9 adam_beta2 = 0.999 @@ -130,17 +128,15 @@ ent_coef = 0.005 gae_lambda = 0.95 gamma = 0.98 learning_rate = 0.003 -max_grad_norm = 1 -prio_alpha = 0.8499999999999999 -prio_beta0 = 0.8499999999999999 +max_grad_norm = 1.0 +prio_alpha = 0.85 +prio_beta0 = 0.85 update_epochs = 1 -vf_clip_coef = 0.1999999999999999 -vf_coef = 2 +vf_clip_coef = 0.2 +vf_coef = 2.0 vtrace_c_clip = 1 vtrace_rho_clip = 1 -checkpoint_interval = 100 -use_transformer = True -context_window = 32 +checkpoint_interval = 10 # Rendering options render = True render_interval = 100 @@ -184,8 +180,16 @@ wosac_sanity_check = False wosac_aggregate_results = True ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) human_replay_eval = False -; Control only the self-driving car -human_replay_control_mode = "control_sdc_only" +; Control mode for human replay (control_vehicles with max_controlled_agents=1 controls one agent) +human_replay_control_mode = "control_vehicles" +; Number of agents in human replay evaluation environment +human_replay_num_agents = 32 +; 
Number of rollouts for human replay evaluation +human_replay_num_rollouts = 100 +; Number of maps to use for human replay evaluation +human_replay_num_maps = 100 +; Number of maps to render for human replay (subset of eval maps) +human_replay_render_num_maps = 3 [sweep.train.learning_rate] distribution = log_normal diff --git a/pufferlib/ocean/drive/drive.c b/pufferlib/ocean/drive/drive.c index 9f6337051c..bae514790c 100644 --- a/pufferlib/ocean/drive/drive.c +++ b/pufferlib/ocean/drive/drive.c @@ -34,20 +34,18 @@ void test_drivenet() { void demo() { // Note: The settings below are hardcoded for demo purposes. Since the policy was - // trained with these exact settings, that changing them may lead to - // weird behavior. + // trained with these exact settings, changing them may lead to weird behavior. Drive env = { .human_agent_idx = 0, - .dynamics_model = conf.dynamics_model, - .reward_vehicle_collision = conf.reward_vehicle_collision, - .reward_offroad_collision = conf.reward_offroad_collision, - .reward_ade = conf.reward_ade, - .goal_radius = conf.goal_radius, - .dt = conf.dt, + .dynamics_model = CLASSIC, + .reward_vehicle_collision = -1.0f, + .reward_offroad_collision = -1.0f, + .goal_radius = 2.0f, + .dt = 0.1f, .map_name = "resources/drive/binaries/training/map_000.bin", - .init_steps = conf.init_steps, - .collision_behavior = conf.collision_behavior, - .offroad_behavior = conf.offroad_behavior, + .init_steps = 0, + .collision_behavior = 0, + .offroad_behavior = 0, }; allocate(&env); c_reset(&env); diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 420e726b5e..f4938cd58d 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -1084,7 +1084,8 @@ def test_performance(timeout=10, atn_cache=1024, num_agents=1024): if __name__ == "__main__": # test_performance() # Process the train dataset - process_all_maps(data_folder="/data/processed/training") + # 
process_all_maps(data_folder="/data/processed/training") + process_all_maps(data_folder="/data/nuplan_gpudrive/nuplan") # Process the validation/test dataset # process_all_maps(data_folder="data/processed/validation") # # Process the validation_interactive dataset diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 4820bd9be0..6fcc26a73e 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -65,11 +65,15 @@ void CloseVideo(VideoRecorder *recorder) { waitpid(recorder->pid, NULL, 0); } -void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int lasers, int trajectories, - int frame_count, float *path, int show_human_logs, int show_grid, int img_width, int img_height, - int zoom_in) { +void renderTopDownView(Drive *env, Client *client, float map_width, float map_height, int obs, int lasers, + int trajectories, int frame_count, float *path, int show_human_logs, int show_grid, + int img_width, int img_height, int zoom_in, int current_scenario, int total_scenarios) { BeginDrawing(); + // Calculate map center + float center_x = (env->grid_map->top_left_x + env->grid_map->bottom_right_x) / 2.0f; + float center_y = (env->grid_map->top_left_y + env->grid_map->bottom_right_y) / 2.0f; + // Top-down orthographic camera Camera3D camera = {0}; @@ -77,10 +81,11 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int camera.position = (Vector3){0.0f, 0.0f, 500.0f}; // above the scene camera.target = (Vector3){0.0f, 0.0f, 0.0f}; // look at origin camera.fovy = map_height; - } else { // Show full map - camera.position = (Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 500.0f}; - camera.target = (Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 0.0f}; - camera.fovy = 2 * map_height; + } else { // Show full map - center camera on map + camera.position = (Vector3){center_x, center_y, 500.0f}; + camera.target = (Vector3){center_x, 
center_y, 0.0f}; + // Use the larger dimension to ensure full map is visible + camera.fovy = (map_height > map_width) ? map_height * 1.1f : map_width * 1.1f; } camera.up = (Vector3){0.0f, -1.0f, 0.0f}; @@ -133,6 +138,12 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int // Draw scene draw_scene(env, client, 1, obs, lasers, show_grid); EndMode3D(); + + // Draw scenario counter overlay (2D text on top of 3D scene) + char scenario_text[64]; + snprintf(scenario_text, sizeof(scenario_text), "Scenario %d / %d", current_scenario, total_scenarios); + DrawText(scenario_text, 20, 20, 30, WHITE); + EndDrawing(); } @@ -189,13 +200,114 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett return 0; } +// Transform observations from ego format to co-player format by inserting conditioning values +// src_obs: Source observations (may include ego conditioning) +// dst_obs: Destination buffer for co-player format (with co-player conditioning) +// num_agents: Number of agents to transform +// ego_base_dim: Base ego features (7 for CLASSIC, 10 for JERK) +// co_use_rc/ec/dc: Co-player conditioning flags (determines which features to insert) +void transform_obs_for_coplayer(float *src_obs, float *dst_obs, int num_agents, int ego_obs_size, int coplayer_obs_size, + int ego_base_dim, int co_use_rc, int co_use_ec, int co_use_dc, float collision_lb, + float collision_ub, float offroad_lb, float offroad_ub, float goal_lb, float goal_ub, + float entropy_lb, float entropy_ub, float discount_lb, float discount_ub) { + // Fixed sizes for partner and road features (from drive.h constants) + int partner_features = (MAX_AGENTS - 1) * PARTNER_FEATURES; + int road_features = MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + int partner_road_features = partner_features + road_features; + + // Derive source conditioning size from ego_obs_size + // This handles cases where ego policy has conditioning (type != "none") + int src_conditioning = 
ego_obs_size - ego_base_dim - partner_road_features; + + // Calculate destination conditioning size from flags + int dst_conditioning = (co_use_rc ? 3 : 0) + (co_use_ec ? 1 : 0) + (co_use_dc ? 1 : 0); + + for (int i = 0; i < num_agents; i++) { + float *src = src_obs + i * ego_obs_size; + float *dst = dst_obs + i * coplayer_obs_size; + + // Copy ego base features (without conditioning) + memcpy(dst, src, ego_base_dim * sizeof(float)); + + // Sample and insert conditioning values based on flags + // Order must match: reward (3), entropy (1), discount (1) + int cond_idx = ego_base_dim; + if (co_use_rc) { + // Reward conditioning (3 features: collision, offroad, goal) + dst[cond_idx++] = collision_lb + (float)rand() / RAND_MAX * (collision_ub - collision_lb); + dst[cond_idx++] = offroad_lb + (float)rand() / RAND_MAX * (offroad_ub - offroad_lb); + dst[cond_idx++] = goal_lb + (float)rand() / RAND_MAX * (goal_ub - goal_lb); + } + if (co_use_ec) { + // Entropy conditioning (1 feature) + dst[cond_idx++] = entropy_lb + (float)rand() / RAND_MAX * (entropy_ub - entropy_lb); + } + if (co_use_dc) { + // Discount conditioning (1 feature) + dst[cond_idx++] = discount_lb + (float)rand() / RAND_MAX * (discount_ub - discount_lb); + } + + // Copy partner + road features, skipping over any source conditioning + memcpy(dst + ego_base_dim + dst_conditioning, src + ego_base_dim + src_conditioning, + partner_road_features * sizeof(float)); + } +} + +// Helper function for dual-policy forward pass +// Runs ego policy on first num_ego_agents, co-player policy on the rest +// Handles different observation sizes between ego and co-player policies +void forward_population(DriveNet *ego_net, DriveNet *co_player_net, float *observations, int *actions, + int num_ego_agents, int num_co_players, int ego_obs_size, int coplayer_obs_size, + int ego_base_dim, int co_use_rc, int co_use_ec, int co_use_dc, float co_collision_lb, + float co_collision_ub, float co_offroad_lb, float co_offroad_ub, float 
co_goal_lb, + float co_goal_ub, float co_entropy_lb, float co_entropy_ub, float co_discount_lb, + float co_discount_ub) { + if (co_player_net == NULL || num_co_players == 0) { + // Single policy mode - use ego net for all agents + forward(ego_net, observations, actions); + return; + } + + // Allocate temporary buffers for ego observations/actions + float *ego_obs = (float *)malloc(num_ego_agents * ego_obs_size * sizeof(float)); + int *ego_actions = (int *)malloc(num_ego_agents * sizeof(int)); + + // Allocate temporary buffers for co-player observations/actions + float *co_obs_raw = observations + num_ego_agents * ego_obs_size; + float *co_obs_transformed = (float *)malloc(num_co_players * coplayer_obs_size * sizeof(float)); + int *co_actions = (int *)malloc(num_co_players * sizeof(int)); + + // Copy ego observations (already correct format) + memcpy(ego_obs, observations, num_ego_agents * ego_obs_size * sizeof(float)); + + // Transform co-player observations (add conditioning features) + transform_obs_for_coplayer(co_obs_raw, co_obs_transformed, num_co_players, ego_obs_size, coplayer_obs_size, + ego_base_dim, co_use_rc, co_use_ec, co_use_dc, co_collision_lb, co_collision_ub, + co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, co_entropy_lb, co_entropy_ub, + co_discount_lb, co_discount_ub); + + // Run forward on each network + forward(ego_net, ego_obs, ego_actions); + forward(co_player_net, co_obs_transformed, co_actions); + + // Combine actions back + memcpy(actions, ego_actions, num_ego_agents * sizeof(int)); + memcpy(actions + num_ego_agents, co_actions, num_co_players * sizeof(int)); + + // Cleanup + free(ego_obs); + free(ego_actions); + free(co_obs_transformed); + free(co_actions); +} + int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in) { + const char *output_agent, 
int num_maps, int zoom_in, const char *ini_file, int k_scenarios_cli, + int max_controlled_agents_cli, const char *co_player_policy_name) { // Parse configuration from INI file env_init_config conf = {0}; - const char *ini_file = "pufferlib/config/ocean/drive.ini"; if (ini_parse(ini_file, handler, &conf) < 0) { fprintf(stderr, "Error: Could not load %s. Cannot determine environment configuration.\n", ini_file); return -1; @@ -269,7 +381,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o .entropy_weight_ub = (conf.conditioning != NULL) ? conf.conditioning->entropy_weight_ub : 0.0f, .discount_weight_lb = (conf.conditioning != NULL) ? conf.conditioning->discount_weight_lb : 0.0f, .discount_weight_ub = (conf.conditioning != NULL) ? conf.conditioning->discount_weight_ub : 0.0f, - .max_controlled_agents = 32, + .max_controlled_agents = (max_controlled_agents_cli > 0) ? max_controlled_agents_cli : 32, }; allocate(&env); @@ -317,11 +429,92 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o client->cyclist = LoadModel("resources/drive/cyclist.glb"); client->pedestrian = LoadModel("resources/drive/pedestrian.glb"); + // Determine number of ego agents vs co-players + int num_ego_agents = env.active_agent_count; + int num_co_players = 0; + DriveNet *co_player_net = NULL; + + // Co-player conditioning flags (hoisted to outer scope for later use) + int co_use_rc = 0, co_use_ec = 0, co_use_dc = 0; + + // Check if co-player policy is provided (either via CLI or INI) + const char *actual_co_player_policy = co_player_policy_name; + if (actual_co_player_policy == NULL && conf.co_player_enabled && strlen(conf.co_player_policy_path) > 0) { + actual_co_player_policy = conf.co_player_policy_path; + } + + if (actual_co_player_policy != NULL) { + // Population play mode - split agents between ego and co-player + // Use num_ego_agents from config, or default to half + if (conf.num_ego_agents > 0 && conf.num_ego_agents < 
env.active_agent_count) { + num_ego_agents = conf.num_ego_agents; + } else { + num_ego_agents = env.active_agent_count / 2; + } + num_co_players = env.active_agent_count - num_ego_agents; + + printf("Population play: %d ego agents, %d co-players\n", num_ego_agents, num_co_players); + + // Load co-player policy + FILE *co_policy_file = fopen(actual_co_player_policy, "rb"); + if (co_policy_file != NULL) { + fclose(co_policy_file); + Weights *co_weights = load_weights(actual_co_player_policy); + + // Determine co-player conditioning from config + if (conf.co_player_conditioning != NULL) { + co_use_rc = (strcmp(conf.co_player_conditioning->type, "reward") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + co_use_ec = (strcmp(conf.co_player_conditioning->type, "entropy") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + co_use_dc = (strcmp(conf.co_player_conditioning->type, "discount") == 0 || + strcmp(conf.co_player_conditioning->type, "all") == 0); + } + + co_player_net = + init_drivenet(co_weights, num_co_players, env.dynamics_model, co_use_rc, co_use_ec, co_use_dc); + printf("Co-player policy loaded with conditioning: rc=%d, ec=%d, dc=%d\n", co_use_rc, co_use_ec, co_use_dc); + } else { + printf("Warning: Could not load co-player policy from %s. Using main policy for all agents.\n", + actual_co_player_policy); + num_ego_agents = env.active_agent_count; + num_co_players = 0; + } + } + + // Extract co-player conditioning bounds from config + float co_collision_lb = 0, co_collision_ub = 0; + float co_offroad_lb = 0, co_offroad_ub = 0; + float co_goal_lb = 0, co_goal_ub = 0; + float co_entropy_lb = 0, co_entropy_ub = 0; + float co_discount_lb = 0, co_discount_ub = 0; + + // Get conditioning dims directly from co_player_net to ensure consistency + int coplayer_num_conditioning = (co_player_net != NULL) ? 
co_player_net->conditioning_dims : 0; + + if (conf.co_player_conditioning != NULL) { + co_collision_lb = conf.co_player_conditioning->reward_collision_weight_lb; + co_collision_ub = conf.co_player_conditioning->reward_collision_weight_ub; + co_offroad_lb = conf.co_player_conditioning->reward_offroad_weight_lb; + co_offroad_ub = conf.co_player_conditioning->reward_offroad_weight_ub; + co_goal_lb = conf.co_player_conditioning->reward_goal_weight_lb; + co_goal_ub = conf.co_player_conditioning->reward_goal_weight_ub; + co_entropy_lb = conf.co_player_conditioning->entropy_weight_lb; + co_entropy_ub = conf.co_player_conditioning->entropy_weight_ub; + co_discount_lb = conf.co_player_conditioning->discount_weight_lb; + co_discount_ub = conf.co_player_conditioning->discount_weight_ub; + } + + // Load main (ego) policy Weights *weights = load_weights(policy_name); printf("Active agents in map: %d\n", env.active_agent_count); - DriveNet *net = init_drivenet(weights, env.active_agent_count, env.dynamics_model, use_rc, use_ec, use_dc); + DriveNet *net = init_drivenet(weights, num_ego_agents, env.dynamics_model, use_rc, use_ec, use_dc); - int frame_count = env.scenario_length > 0 ? env.scenario_length : TRAJECTORY_LENGTH_DEFAULT; + // Calculate frame count: k_scenarios * scenario_length for adaptive agents + int scenario_length = env.scenario_length > 0 ? env.scenario_length : TRAJECTORY_LENGTH_DEFAULT; + int k_scenarios = (k_scenarios_cli > 0) ? k_scenarios_cli : (conf.k_scenarios > 0 ? conf.k_scenarios : 1); + int frame_count = k_scenarios * scenario_length; + printf("Rendering %d scenarios x %d steps = %d total frames\n", k_scenarios, scenario_length, frame_count); char filename_topdown[256]; char filename_agent[256]; @@ -373,16 +566,39 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o } } + // Calculate observation sizes per agent + // ego_base_dim: 7 for CLASSIC dynamics, 10 for JERK dynamics + int ego_base_dim = (env.dynamics_model == 1) ? 
10 : 7; // 1 = JERK + + // Ego observation size (environment generates observations without conditioning for ego) + int ego_obs_size = + net->ego_dim + (MAX_AGENTS - 1) * PARTNER_FEATURES + MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + + // Co-player observation size (includes conditioning features) + int coplayer_obs_size = ego_obs_size; + if (co_player_net != NULL) { + coplayer_obs_size = co_player_net->ego_dim + (MAX_AGENTS - 1) * PARTNER_FEATURES + + MAX_ROAD_SEGMENT_OBSERVATIONS * ROAD_FEATURES; + } + + printf("Observation sizes: ego=%d, coplayer=%d, ego_base_dim=%d, coplayer_conditioning=%d\n", ego_obs_size, + coplayer_obs_size, ego_base_dim, coplayer_num_conditioning); + if (render_topdown) { printf("Recording topdown view...\n"); for (int i = 0; i < frame_count; i++) { + // Calculate current scenario (1-indexed for display) + int current_scenario = (i / scenario_length) + 1; if (i % frame_skip == 0) { - renderTopDownView(&env, client, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, show_grid, - img_width, img_height, zoom_in); + renderTopDownView(&env, client, map_width, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, + show_grid, img_width, img_height, zoom_in, current_scenario, k_scenarios); WriteFrame(&topdown_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + forward_population(net, co_player_net, env.observations, (int *)env.actions, num_ego_agents, num_co_players, + ego_obs_size, coplayer_obs_size, ego_base_dim, co_use_rc, co_use_ec, co_use_dc, + co_collision_lb, co_collision_ub, co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, + co_entropy_lb, co_entropy_ub, co_discount_lb, co_discount_ub); c_step(&env); } } @@ -400,7 +616,10 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o WriteFrame(&agent_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + forward_population(net, 
co_player_net, env.observations, (int *)env.actions, num_ego_agents, num_co_players, + ego_obs_size, coplayer_obs_size, ego_base_dim, co_use_rc, co_use_ec, co_use_dc, + co_collision_lb, co_collision_ub, co_offroad_lb, co_offroad_ub, co_goal_lb, co_goal_ub, + co_entropy_lb, co_entropy_ub, co_discount_lb, co_discount_ub); c_step(&env); } } @@ -424,6 +643,9 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o free_allocated(&env); free_drivenet(net); free(weights); + if (co_player_net != NULL) { + free_drivenet(co_player_net); + } return 0; } @@ -440,10 +662,14 @@ int main(int argc, char *argv[]) { // File paths and num_maps (not in [env] section) const char *map_name = NULL; const char *policy_name = "resources/drive/puffer_drive_weights.bin"; + const char *co_player_policy_name = NULL; const char *output_topdown = NULL; const char *output_agent = NULL; + const char *ini_file = "pufferlib/config/ocean/drive.ini"; int num_maps = 1; int scenario_length_cli = -1; + int k_scenarios_cli = -1; + int max_controlled_agents_cli = -1; int use_rc = 0; int use_ec = 0; int use_dc = 0; @@ -515,10 +741,43 @@ int main(int argc, char *argv[]) { num_maps = atoi(argv[i + 1]); i++; } + } else if (strcmp(argv[i], "--ini-file") == 0) { + if (i + 1 < argc) { + ini_file = argv[i + 1]; + i++; + } else { + fprintf(stderr, "Error: --ini-file option requires a file path\n"); + return 1; + } + } else if (strcmp(argv[i], "--k-scenarios") == 0) { + if (i + 1 < argc) { + k_scenarios_cli = atoi(argv[i + 1]); + i++; + } else { + fprintf(stderr, "Error: --k-scenarios option requires a number\n"); + return 1; + } + } else if (strcmp(argv[i], "--max-controlled-agents") == 0) { + if (i + 1 < argc) { + max_controlled_agents_cli = atoi(argv[i + 1]); + i++; + } else { + fprintf(stderr, "Error: --max-controlled-agents option requires a number\n"); + return 1; + } + } else if (strcmp(argv[i], "--co-player-policy") == 0) { + if (i + 1 < argc) { + co_player_policy_name = argv[i + 
1]; + i++; + } else { + fprintf(stderr, "Error: --co-player-policy option requires a file path\n"); + return 1; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in); + output_agent, num_maps, zoom_in, ini_file, k_scenarios_cli, max_controlled_agents_cli, + co_player_policy_name); return 0; } diff --git a/pufferlib/ocean/env_config.h b/pufferlib/ocean/env_config.h index 4f26a8b32e..35f2806806 100644 --- a/pufferlib/ocean/env_config.h +++ b/pufferlib/ocean/env_config.h @@ -38,12 +38,18 @@ typedef struct { int goal_behavior; float goal_target_distance; int scenario_length; + int k_scenarios; int termination_mode; int init_steps; int init_mode; int control_mode; char map_dir[256]; conditioning_config *conditioning; + // Population play settings + int co_player_enabled; + int num_ego_agents; + char co_player_policy_path[256]; + conditioning_config *co_player_conditioning; } env_init_config; // INI file parser handler - parses all environment configuration from drive.ini @@ -97,6 +103,8 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->dt = atof(value); } else if (MATCH("env", "scenario_length")) { env_config->scenario_length = atoi(value); + } else if (MATCH("env", "k_scenarios")) { + env_config->k_scenarios = atoi(value); } else if (MATCH("env", "termination_mode")) { env_config->termination_mode = atoi(value); } else if (MATCH("env", "init_steps")) { @@ -175,6 +183,85 @@ static int handler(void *config, const char *section, const char *name, const ch } env_config->conditioning->discount_weight_ub = atof(value); } + // Population play settings + else if (MATCH("env", "co_player_enabled")) { + if (strcmp(value, "True") == 0 || strcmp(value, "true") == 0 || strcmp(value, "1") == 0) { + env_config->co_player_enabled = 1; + } else { + env_config->co_player_enabled = 0; + } + } else if (MATCH("env", "num_ego_agents")) { + 
env_config->num_ego_agents = atoi(value); + } + // Co-player policy settings + else if (MATCH("env.co_player_policy", "policy_path")) { + if (sscanf(value, "\"%255[^\"]\"", env_config->co_player_policy_path) != 1) { + strncpy(env_config->co_player_policy_path, value, sizeof(env_config->co_player_policy_path) - 1); + env_config->co_player_policy_path[sizeof(env_config->co_player_policy_path) - 1] = '\0'; + } + } else if (MATCH("env.co_player_policy.conditioning", "type")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + if (value[0] == '"') { + size_t len = strlen(value) - 2; + env_config->co_player_conditioning->type = (char *)malloc(len + 1); + strncpy(env_config->co_player_conditioning->type, value + 1, len); + env_config->co_player_conditioning->type[len] = '\0'; + } else { + env_config->co_player_conditioning->type = strdup(value); + } + } else if (MATCH("env.co_player_policy.conditioning", "collision_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_collision_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "collision_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_collision_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "offroad_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_offroad_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "offroad_weight_ub")) { + if 
(env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_offroad_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "goal_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_goal_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "goal_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->reward_goal_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "entropy_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->entropy_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "entropy_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->entropy_weight_ub = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "discount_weight_lb")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + env_config->co_player_conditioning->discount_weight_lb = atof(value); + } else if (MATCH("env.co_player_policy.conditioning", "discount_weight_ub")) { + if (env_config->co_player_conditioning == NULL) { + env_config->co_player_conditioning = (conditioning_config *)malloc(sizeof(conditioning_config)); + } + 
env_config->co_player_conditioning->discount_weight_ub = atof(value); + } else { return 0; // Unknown section/name, indicate failure to handle diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 270205ed70..758a269dcd 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -789,6 +789,38 @@ def train(self): ): pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step) + # Eval rendering for adaptive agents (ego vs human logs) + if self.adaptive_driving_agent and self.config["eval"].get("human_replay_eval", False): + if self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training: + model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + + if model_files: + latest_cpt = max(model_files, key=os.path.getctime) + bin_path = f"{model_dir}.bin" + + try: + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} + export( + args=export_args, + env_name=self.config["env"], + vecenv=self.vecenv, + policy=self.uncompiled_policy, + path=bin_path, + silent=True, + ) + eval_video_dir = os.path.join(model_dir, "eval_videos") + pufferlib.utils.render_human_replay_videos( + config=self.config, + policy_bin_path=bin_path, + output_dir=eval_video_dir, + num_maps=self.config["eval"].get("human_replay_render_num_maps", 3), + logger=self.logger, + global_step=self.global_step, + ) + except Exception as e: + print(f"Failed to render eval videos: {e}") + def mean_and_log(self): config = self.config for k in list(self.stats.keys()): @@ -1673,17 +1705,16 @@ def load_policy(args, vecenv, env_name=""): policy_cls = getattr(env_module.torch, args["policy_name"]) policy = policy_cls(vecenv.driver_env, **args["policy"]) - # Handle both RNN and Transformer wrappers + # Handle both RNN and Transformer wrappers via rnn_name rnn_name = args.get("rnn_name") - transformer_name = 
args.get("transformer_name") - if transformer_name is not None: + if rnn_name == "Transformer": # Load transformer wrapper - transformer_cls = getattr(env_module.torch, transformer_name) + transformer_cls = getattr(env_module.torch, rnn_name) args["transformer"]["context_length"] = vecenv.driver_env.episode_length policy = transformer_cls(vecenv.driver_env, policy, **args["transformer"]) elif rnn_name is not None: - # Load RNN wrapper + # Load RNN wrapper (Recurrent) rnn_cls = getattr(env_module.torch, rnn_name) policy = rnn_cls(vecenv.driver_env, policy, **args["rnn"]) diff --git a/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin b/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin new file mode 100644 index 0000000000..cc7567a82e Binary files /dev/null and b/pufferlib/resources/drive/puffer_adaptive_drive_co_player.bin differ diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 8ef3cb4034..2bb240e485 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -38,13 +38,19 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): if is_adaptive: # Use evaluate_human_logs.py for adaptive agents with human replay + # Get policy_architecture from train config section + train_config = config.get("train", {}) + policy_architecture = train_config.get( + "policy_architecture", config.get("policy_architecture", "Recurrent") + ) + cmd = [ sys.executable, "evaluate_human_logs.py", "--policy-path", latest_cpt, "--policy-architecture", - config.get("policy_architecture", "Transformer"), + policy_architecture, "--adaptive-driving-agent", "1", "--k-scenarios", @@ -302,9 +308,37 @@ def render_videos(config, vecenv, logger, epoch, global_step, bin_path): env_vars = os.environ.copy() env_vars["ASAN_OPTIONS"] = "exitcode=0" + # Detect if this is an adaptive agent + env_name = config.get("env", "") + is_adaptive = "adaptive" in env_name + + # Select correct INI file based on agent type + if is_adaptive: + ini_file = 
"pufferlib/config/ocean/adaptive.ini" + else: + ini_file = "pufferlib/config/ocean/drive.ini" + # Base command with only visualization flags (env config comes from INI) base_cmd = ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] + # Pass the correct INI file + base_cmd.extend(["--ini-file", ini_file]) + + # Get env config for k_scenarios and co-player settings + env_config = config.get("env_config", {}) + + # Pass k_scenarios for adaptive agents (longer videos) + k_scenarios = env_config.get("k_scenarios", 1) + if k_scenarios > 1: + base_cmd.extend(["--k-scenarios", str(k_scenarios)]) + + # Pass co-player policy if population play is enabled + co_player_enabled = env_config.get("co_player_enabled", False) + if co_player_enabled: + co_player_path = f"resources/drive/{config['env']}_co_player.bin" + if os.path.exists(co_player_path): + base_cmd.extend(["--co-player-policy", co_player_path]) + # Visualization config flags only if config.get("show_grid", False): base_cmd.append("--show-grid") @@ -407,3 +441,122 @@ def render_videos(config, vecenv, logger, epoch, global_step, bin_path): # Clean up bin weights file if os.path.exists(expected_weights_path): os.remove(expected_weights_path) + + +def render_human_replay_videos(config, policy_bin_path, output_dir, num_maps=5, logger=None, global_step=0): + """ + Render videos for human replay evaluation (1 ego agent + human log trajectories). + + In this mode, only one agent is policy-controlled (the ego), while all other agents + follow their logged human trajectories (rendered in GOLD). 
+ + Args: + config: Configuration dictionary with env settings + policy_bin_path: Path to the policy weights .bin file + output_dir: Directory to save output videos + num_maps: Number of maps to render + logger: Optional logger with wandb attribute for logging + global_step: Current training step for wandb logging + + Returns: + List of output video paths + """ + if not os.path.exists(policy_bin_path): + print(f"Policy weights file does not exist: {policy_bin_path}") + return [] + + try: + os.makedirs(output_dir, exist_ok=True) + + # Copy the binary weights to the expected location + expected_weights_path = "resources/drive/puffer_drive_weights.bin" + os.makedirs(os.path.dirname(expected_weights_path), exist_ok=True) + shutil.copy2(policy_bin_path, expected_weights_path) + + env_vars = os.environ.copy() + env_vars["ASAN_OPTIONS"] = "exitcode=0" + + # Get env config + env_config = config.get("env_config", config.get("env", {})) + k_scenarios = env_config.get("k_scenarios", 2) + + # Build command for human replay rendering + cmd = [ + "xvfb-run", + "-a", + "-s", + "-screen 0 1280x720x24", + "./visualize", + "--ini-file", + "pufferlib/config/ocean/adaptive.ini", + "--policy-name", + expected_weights_path, + "--max-controlled-agents", + "1", # Only 1 ego agent + "--k-scenarios", + str(k_scenarios), + "--num-maps", + str(num_maps), + "--log-trajectories", # Show human trajectory logs + "--zoom-in", + "--view", + "both", + "--output-topdown", + "resources/drive/output_topdown.mp4", + "--output-agent", + "resources/drive/output_agent.mp4", + ] + + output_videos = [] + videos_to_log_world = [] + videos_to_log_agent = [] + + for map_idx in range(num_maps): + result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=600, env=env_vars) + + vids_exist = os.path.exists("resources/drive/output_topdown.mp4") and os.path.exists( + "resources/drive/output_agent.mp4" + ) + + if result.returncode == 0 or (result.returncode == 1 and vids_exist): + videos = 
[ + ("resources/drive/output_topdown.mp4", f"human_replay_map{map_idx:02d}_topdown.mp4"), + ("resources/drive/output_agent.mp4", f"human_replay_map{map_idx:02d}_agent.mp4"), + ] + + for source_vid, target_filename in videos: + if os.path.exists(source_vid): + target_path = os.path.join(output_dir, target_filename) + shutil.move(source_vid, target_path) + output_videos.append(target_path) + + if logger and hasattr(logger, "wandb") and logger.wandb: + import wandb + + if "topdown" in target_filename: + videos_to_log_world.append(wandb.Video(target_path, format="mp4")) + else: + videos_to_log_agent.append(wandb.Video(target_path, format="mp4")) + else: + print(f"Human replay rendering failed for map {map_idx}: {result.stderr}") + + # Log to wandb + if logger and hasattr(logger, "wandb") and logger.wandb and (videos_to_log_world or videos_to_log_agent): + payload = {} + if videos_to_log_world: + payload["eval/human_replay_world_view"] = videos_to_log_world + if videos_to_log_agent: + payload["eval/human_replay_agent_view"] = videos_to_log_agent + logger.wandb.log(payload, step=global_step) + + return output_videos + + except subprocess.TimeoutExpired: + print("Human replay rendering timed out") + return [] + except Exception as e: + print(f"Failed to render human replay videos: {e}") + return [] + finally: + if os.path.exists(expected_weights_path): + os.remove(expected_weights_path) diff --git a/scripts/run_baseline.sh b/scripts/run_baseline.sh new file mode 100755 index 0000000000..7e2f5b5e60 --- /dev/null +++ b/scripts/run_baseline.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=puffer_baseline +#SBATCH --output=/scratch/mmk9418/logs/%A_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 + +# Vanilla baseline training script +# No co-players - other vehicles follow recorded 
human trajectories from Waymo data +# For comparison with co-player experiments in run.sh + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_adaptive_drive --wandb --tag adaptive_baseline \ + --env.num-maps 1000 \ + --env.conditioning.type none \ + --env.co-player-enabled 0 \ + --train.seed 42 + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_nuplan_coplayers.sh b/scripts/run_nuplan_coplayers.sh new file mode 100755 index 0000000000..3c172eb280 --- /dev/null +++ b/scripts/run_nuplan_coplayers.sh @@ -0,0 +1,84 @@ +#!/bin/bash +#SBATCH --job-name=nuplan_coplayer +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Train co-player policies on NuPlan data with varying entropy/discount conditioning +# These trained policies can later be used as co-players for adaptive agent training +# +# PREREQUISITE: Convert NuPlan JSON to binary format first: +# python -c "from pufferlib.ocean.drive.drive import process_all_maps; \ +# process_all_maps('data/nuplan_gpudrive/nuplan', max_maps=5000)" +# +# This will create: resources/drive/binaries/nuplan/map_*.bin (dataset_name derived from folder) + +# Define configurations for each array task ID +# Grid: 4 entropy levels × 4 discount levels = 16 configurations +# Each entry: "entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "0.5 0.8" + "0.1 0.8" + "0.01 0.8" + "0 0.8" + + "0.5 0.6" + "0.1 
0.6" + "0.01 0.6" + "0 0.6" + + "0.5 0.4" + "0.1 0.4" + "0.01 0.4" + "0 0.4" + + "0.5 0.2" + "0.1 0.2" + "0.01 0.2" + "0 0.2" +) + +# Parse the values for this array task +read -r ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +# NuPlan has ~5235 maps +NUPLAN_NUM_MAPS=5000 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! + echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_drive --wandb --tag nuplan_coplayer_training \ + --env.map-dir resources/drive/binaries/nuplan \ + --env.num-maps $NUPLAN_NUM_MAPS \ + --env.conditioning.type $CONDITION_TYPE \ + --env.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.conditioning.entropy-weight-ub $ENTROPY_UB \ + --env.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.conditioning.discount-weight-ub $DISCOUNT_UB + + kill \$HEARTBEAT_PID + " diff --git a/scripts/run_transformer_adaptive.sh b/scripts/run_transformer_adaptive.sh new file mode 100755 index 0000000000..79b2a7caae --- /dev/null +++ b/scripts/run_transformer_adaptive.sh @@ -0,0 +1,78 @@ +#!/bin/bash +#SBATCH --job-name=puffer_transformer +#SBATCH --output=/scratch/mmk9418/logs/%A_%a_%x.out +#SBATCH --error=/scratch/mmk9418/logs/%A_%a_%x.err +#SBATCH --mem=128GB +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --account=torch_pr_355_tandon_priority +#SBATCH --cpus-per-task=48 +#SBATCH --gres=gpu:1 +#SBATCH --array=0-15 + +# Transformer-based adaptive agent training with co-players +# Same co-player configurations as run.sh, but using Transformer architecture + +# Define configurations for each array task ID +# Each entry: 
"path entropy_weight_ub discount_weight_lb" +ZIPPED_RUNS=( + "experiments/puffer_drive_8u92j3ts/model_puffer_drive_003000.pt 0.5 0.8" + "experiments/puffer_drive_x4xs711x.pt 0.1 0.8" + "experiments/puffer_drive_hhzdzhl8.pt 0.01 0.8" + "experiments/puffer_drive_3xd48djp.pt 0 0.8" + + "experiments/puffer_drive_fgglgofu.pt 0.5 0.6" + "experiments/puffer_drive_g3x9e5rn.pt 0.01 0.6" + "experiments/puffer_drive_gzuuzs0o.pt 0.1 0.6" + "experiments/puffer_drive_6nzf7xha.pt 0 0.6" + + "experiments/puffer_drive_3iefv59j.pt 0.5 0.4" + "experiments/puffer_drive_7h07nrxy.pt 0.1 0.4" + "experiments/puffer_drive_bot2wl0m.pt 0.01 0.4" + "experiments/puffer_drive_n7mx9f4b.pt 0 0.4" + + "experiments/puffer_drive_9jv4q77m.pt 0.5 0.2" + "experiments/puffer_drive_5p8gpw84.pt 0.1 0.2" + "experiments/puffer_drive_jskw659g.pt 0.01 0.2" + "experiments/puffer_drive_eeyizdrk.pt 0 0.2" +) + +# Parse the values for this array task +read -r COPLAYER_PATH ENTROPY_UB DISCOUNT_LB <<< "${ZIPPED_RUNS[$SLURM_ARRAY_TASK_ID]}" + +# Fixed values +CONDITION_TYPE="all" +DISCOUNT_UB=1 +ENTROPY_LB=0 + +singularity exec --nv \ + --overlay "$OVERLAY_FILE:ro" \ + "$SINGULARITY_IMAGE" \ + bash -c " + set -e + + source ~/.bashrc + cd /scratch/mmk9418/projects/Adaptive_Driving_Agent + source .venv/bin/activate + + # Start GPU heartbeat in background (for RL training which is CPU-bound) + nice -n 19 python scripts/gpu_heartbeat.py & + HEARTBEAT_PID=\$! 
+ echo \"Started GPU Heartbeat with PID: \$HEARTBEAT_PID\" + + puffer train puffer_adaptive_drive --wandb --tag adaptive_transformer \ + --env.num-maps 1000 \ + --env.conditioning.type none \ + --env.co-player-enabled 1 \ + --env.co-player-policy.policy-path $COPLAYER_PATH \ + --env.co-player-policy.conditioning.type $CONDITION_TYPE \ + --env.co-player-policy.conditioning.discount-weight-lb $DISCOUNT_LB \ + --env.co-player-policy.conditioning.discount-weight-ub $DISCOUNT_UB \ + --env.co-player-policy.conditioning.entropy-weight-lb $ENTROPY_LB \ + --env.co-player-policy.conditioning.entropy-weight-ub $ENTROPY_UB \ + --rnn-name Transformer \ + --train.policy-architecture Transformer + + kill \$HEARTBEAT_PID + "