diff --git a/examples/eval_pipeline.py b/examples/eval_pipeline.py index 3a90687fac..94155e0a08 100644 --- a/examples/eval_pipeline.py +++ b/examples/eval_pipeline.py @@ -1,3 +1,4 @@ +import copy import numpy as np import pandas as pd import torch @@ -11,6 +12,35 @@ POLICY_DIR = "models" +# Policy configurations: (path, dynamics_model, type) +POLICY_CONFIGS = { + "bc_policy": { + "path": POLICY_DIR + "/bc_policy.pt", + "dynamics": "classic", + "type": "bc", + }, + # "self_play_rl (classic)": { + # "path": POLICY_DIR + "/self_play_rl_simple_policy.pt", + # "dynamics": "classic", + # "type": "rl", + # }, + # "guided_self_play_rl (classic)": { + # "path": POLICY_DIR + "/guided_self_play_classic_policy.pt", + # "dynamics": "classic", + # "type": "rl", + # }, + # "self_play_rl (jerk)": { + # "path": POLICY_DIR + "/self_play_jerk_policy.pt", + # "dynamics": "jerk", + # "type": "rl", + # }, + # "guided_self_play_rl (jerk)": { + # "path": POLICY_DIR + "/guided_self_play_jerk_policy.pt", + # "dynamics": "jerk", + # "type": "rl", + # }, +} + COLUMN_ORDER = [ "policy", "realism_meta_score", @@ -222,13 +252,31 @@ def evaluate_bc_policy(config, vecenv, evaluator, policy_path): def evaluate_rl_policy(config, vecenv, evaluator, policy_path): + """Evaluate an RL policy using WOSAC metrics.""" + + # Use a copy to avoid mutating the original config + config = copy.deepcopy(config) + + # Ensure evaluator is in RL mode + evaluator.mode = "rl" + + # Enable RNN state initialization for LSTM-based policies config["train"]["use_rnn"] = True - evaluator.mode = "rl_policy" + config["load_model_path"] = policy_path + + # Load policy policy = load_policy(config, vecenv, "puffer_drive") + policy.eval() gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) - simulated_trajectories = evaluator.collect_simulated_trajectories(config, vecenv, policy) + + # Roll out trained policy in the simulator + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=config, + puffer_env=vecenv, + policy=policy, + ) # Compute metrics agent_state = vecenv.driver_env.get_global_agent_state() @@ -242,18 +290,16 @@ def evaluate_rl_policy(config, vecenv, evaluator, policy_path): return results -def pipeline(env_name="puffer_drive"): - """Obtain WOSAC scores for various baselines and policies.""" - +def create_config_and_env(env_name, dynamics_model="classic"): + """Create a config and vecenv for a specific dynamics model.""" config = load_config(env_name) - # Dataset configuration - config["env"]["map_dir"] = "pufferlib/resources/drive/binaries/training" + # Common WOSAC evaluation settings + config["env"]["num_maps"] = 100 + config["env"]["map_dir"] = "pufferlib/resources/drive/binaries/validation" config["eval"]["wosac_target_scenarios"] = 1000 config["eval"]["wosac_batch_size"] = 100 - config["eval"]["wosac_scenario_pool_size"] = 10_0000 - - # WOSAC settings + config["eval"]["wosac_scenario_pool_size"] = 10_000 config["wosac"]["enabled"] = True config["vec"]["backend"] = "PufferEnv" config["vec"]["num_envs"] = 1 @@ -264,63 +310,100 @@ def pipeline(env_name="puffer_drive"): config["env"]["goal_radius"] = 1.0 config["env"]["save_data_to_disk"] = False - # Make env - vecenv = load_env(env_name, config) + # Set dynamics model + config["env"]["dynamics_model"] = dynamics_model + + # Disable human data preparation for jerk dynamics (not implemented) + if dynamics_model == "jerk": + config["env"]["prep_human_data"] = False - # Make evaluator + vecenv = load_env(env_name, config) evaluator = WOSACEvaluator(config) - # Baseline: Ground truth - evaluator.eval_mode = "ground_truth" - df_results_gt = evaluator.evaluate(config, vecenv, policy=None) - df_results_gt["policy"] = "ground_truth" + return config, vecenv, evaluator - # Baseline: Agent with inferred human actions (using classic bicycle dynamics model) - # df_results_inferred_human = evaluate_human_inferred_actions(config, vecenv, evaluator) - # df_results_inferred_human["policy"] = "inferred_human_actions" - - # Baseline: Imitation learning policy - # df_results_bc = evaluate_bc_policy(config, vecenv, evaluator, POLICY_DIR + "/bc_policy.pt") - # df_results_bc["policy"] = "bc_policy" - - # Baseline: Self-play RL policy - # run: https://wandb.ai/emerge_/gsp/runs/qld2z6tn?nw=nwuserdaphnecor - # df_results_self_play = evaluate_rl_policy( - # config, vecenv, evaluator, "pufferlib/resources/drive/pufferdrive_weights.pt" - # ) # POLICY_DIR + "/puffer_drive_sp_qld2z6tn.pt") - # df_results_self_play["policy"] = "self_play_rl_base" - - # TODO: Guided self-play policy (guidance in rewards) - # ... - - # TODO: Guided self-play policy (regularization) - # ... - - # Baseline: Random policy - # df_results_random = evaluate_random_policy(config, vecenv, evaluator) - # df_results_random["policy"] = "random" - - # Combine - df = pd.concat( - [ - df_results_gt, - # df_results_inferred_human, - # df_results_random, - # df_results_bc, - # df_results_self_play, - ], - ignore_index=True, - ) +def pipeline(env_name="puffer_drive"): + """Obtain WOSAC scores for various baselines and policies across dynamics models.""" + + all_results = [] + + config_classic, vecenv_classic, evaluator_classic = create_config_and_env(env_name, "classic") + + # Ground truth (dynamics-agnostic, only need to run once) + print("Evaluating: ground_truth") + evaluator_classic.eval_mode = "ground_truth" + df_results_gt = evaluator_classic.evaluate(config_classic, vecenv_classic, policy=None) + df_results_gt["policy"] = "ground_truth" + all_results.append(df_results_gt) + + # Inferred human actions (classic only - not implemented for jerk) + print("Evaluating: inferred_human_actions (classic)") + df_results_inferred_human = evaluate_human_inferred_actions(config_classic, vecenv_classic, evaluator_classic) + df_results_inferred_human["policy"] = "inferred_human (classic)" + all_results.append(df_results_inferred_human) + + # --- Classic dynamics evaluations --- + print("=" * 60) + print("Running evaluations with CLASSIC dynamics model...") + print("=" * 60) + + # Random baseline for classic + print("Evaluating: random (classic)") + df_results_random_classic = evaluate_random_policy(config_classic, vecenv_classic, evaluator_classic) + df_results_random_classic["policy"] = "random (classic)" + all_results.append(df_results_random_classic) + + # Evaluate classic dynamics policies + evaluator_classic.eval_mode = "policy" + for policy_name, policy_cfg in POLICY_CONFIGS.items(): + if policy_cfg["dynamics"] != "classic": + continue + print(f"Evaluating: {policy_name}") + if policy_cfg["type"] == "bc": + df_result = evaluate_bc_policy(config_classic, vecenv_classic, evaluator_classic, policy_cfg["path"]) + else: + df_result = evaluate_rl_policy(config_classic, vecenv_classic, evaluator_classic, policy_cfg["path"]) + df_result["policy"] = policy_name + all_results.append(df_result) + + # --- Jerk dynamics evaluations --- + # Check if any jerk policies are configured + jerk_policies = {k: v for k, v in POLICY_CONFIGS.items() if v["dynamics"] == "jerk"} + + if jerk_policies: + print("=" * 60) + print("Running evaluations with JERK dynamics model...") + print("=" * 60) + + config_jerk, vecenv_jerk, evaluator_jerk = create_config_and_env(env_name, "jerk") + + # Random baseline for jerk (different action space, so separate baseline) + print("Evaluating: random (jerk)") + df_results_random_jerk = evaluate_random_policy(config_jerk, vecenv_jerk, evaluator_jerk) + df_results_random_jerk["policy"] = "random (jerk)" + all_results.append(df_results_random_jerk) + + # Evaluate jerk dynamics policies + for policy_name, policy_cfg in jerk_policies.items(): + print(f"Evaluating: {policy_name}") + df_result = evaluate_rl_policy(config_jerk, vecenv_jerk, evaluator_jerk, policy_cfg["path"]) + df_result["policy"] = policy_name + all_results.append(df_result) + + # Combine all results + df = pd.concat(all_results, ignore_index=True) df = df[COLUMN_ORDER] # Visualize plot_wosac_results(df) plot_realism_score_distributions(df) - print(f"total agents: {df_results_gt['num_agents_per_scene'].sum().item()}") - - print(df.groupby("policy")["realism_meta_score"].mean()) + # Print summary + print("\n" + "=" * 60) + print("RESULTS SUMMARY") + print("=" * 60) + print(df.groupby("policy")["realism_meta_score"].mean().sort_values(ascending=False)) print("---") print(df.groupby("policy")["kinematic_metrics"].mean()) print("---") diff --git a/greene/a100_baseline_sweep.sh b/greene/a100_baseline_sweep.sh index ca34d82059..2ff845ca7b 100644 --- a/greene/a100_baseline_sweep.sh +++ b/greene/a100_baseline_sweep.sh @@ -78,4 +78,4 @@ puffer sweep puffer_drive \ # Print completion info echo "Sweep completed" -date \ No newline at end of file +date diff --git a/greene/h100_gsp_rew_sweep.sh b/greene/h100_gsp_rew_sweep.sh index dde3f2d040..71b219dd86 100644 --- a/greene/h100_gsp_rew_sweep.sh +++ b/greene/h100_gsp_rew_sweep.sh @@ -35,8 +35,8 @@ source .venv/bin/activate NUM_WORKERS=48 NUM_ENVS=48 VEC_BATCH_SIZE=6 -NUM_AGENTS=1024 -BPTT_HORIZON=32 +NUM_AGENTS=1024 +BPTT_HORIZON=32 HUMAN_REG_COEF=0.0 USE_GUIDANCE_REWARDS=0 @@ -79,4 +79,4 @@ puffer sweep puffer_drive \ # Print completion info echo "Sweep completed" -date \ No newline at end of file +date diff --git a/greene/rtx8000_sweep.sh b/greene/rtx8000_sweep.sh index 6b8ba2d42b..79e051f632 100644 --- a/greene/rtx8000_sweep.sh +++ b/greene/rtx8000_sweep.sh @@ -47,8 +47,8 @@ fi NUM_WORKERS=32 NUM_ENVS=32 VEC_BATCH_SIZE=4 -NUM_AGENTS=1024 -BPTT_HORIZON=32 +NUM_AGENTS=1024 +BPTT_HORIZON=32 HUMAN_REG_COEF=0.0 USE_GUIDANCE_REWARDS=1 @@ -91,4 +91,4 @@ puffer sweep puffer_drive \ # Print completion info echo "Sweep completed" -date \ No newline at end of file +date diff --git a/models/guided_self_play_jerk_policy.pt b/models/guided_self_play_jerk_policy.pt new file mode 100644 index 0000000000..fabdd5ce35 Binary files /dev/null and b/models/guided_self_play_jerk_policy.pt differ diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index f369faae17..2ae2b45e77 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -31,6 +31,7 @@ use_guided_autonomy = 0 guidance_speed_weight = 0.0 guidance_heading_weight = 0.0 waypoint_reach_threshold = 2.0 +use_guidance_observations = 0 dt = 0.1 reward_goal = 1.0 reward_goal_post_respawn = 0.25 @@ -161,12 +162,64 @@ human_replay_control_mode = "control_sdc_only" ; Number of scenarios for human replay evaluation equals the number of agents human_replay_num_agents = 128 -; General sweep settings: +[render] +; Mode to render a bunch of maps with a given policy +; Path to dataset used for rendering +map_dir = "resources/drive/binaries/training" +; Directory to output rendered videos +output_dir = "resources/drive/render_videos" +; Evaluation will run on the first num_maps maps in the map_dir directory +num_maps = 100 +; "both", "topdown", "agent"; Other args are passed from train confs +view_mode = "both" +; Policy bin file used for rendering videos +policy_path = "resources/drive/puffer_drive_weights.bin" +; Allows more than cpu cores workers for rendering +overwork = False +; If True, show exactly what the agent sees in agent observation +obs_only = True +; Show grid lines +show_grid = False +; Draws lines from ego agent observed ORUs and road elements to show detection range +show_lasers = False +; Display human xy logs in the background +show_human_logs = False +; If True, zoom in on a part of the map. Otherwise, show full map +zoom_in = True +; If True, render single PNG snapshots with trajectory overlays instead of videos +snapshot_only = False + +; ============================================================================ +; SWEEP CONFIGURATION +; ============================================================================ +; Run with: puffer sweep puffer_drive --wandb --wandb-project drive-sweep --max-runs 50 + +[sweep] +method = Random +metric = score +goal = maximize +downsample = 5 + +; === Core RL Hyperparameters === [sweep.train.learning_rate] distribution = log_normal -min = 0.001 +min = 0.0003 mean = 0.003 -max = 0.005 +max = 0.01 +scale = auto + +[sweep.train.gamma] +distribution = logit_normal +min = 0.95 +mean = 0.98 +max = 0.995 +scale = auto + +[sweep.train.gae_lambda] +distribution = logit_normal +min = 0.9 +mean = 0.95 +max = 0.99 scale = auto [sweep.train.ent_coef] @@ -176,30 +229,61 @@ mean = 0.005 max = 0.03 scale = auto -[sweep.train.gamma] +[sweep.train.vf_coef] distribution = log_normal -min = 0.97 -mean = 0.98 -max = 0.999 +min = 0.5 +mean = 2.0 +max = 5.0 scale = auto -; [sweep.env.guidance_heading_weight] -; distribution = uniform -; min = 0.1 -; max = 0.5 -; mean = 0.3 -; scale = auto +[sweep.train.clip_coef] +distribution = uniform +min = 0.1 +mean = 0.2 +max = 0.3 +scale = auto -; [sweep.env.guidance_speed_weight] -; distribution = uniform -; min = 0.1 -; max = 0.5 -; mean = 0.3 -; scale = auto +; === Minibatch Size === +; Must be divisible by bptt_horizon (32) and <= batch_size (393216) +; uniform_pow2 samples powers of 2: 8192, 16384, 32768, 65536 +[sweep.train.minibatch_size] +distribution = uniform_pow2 +min = 8192 +mean = 24576 +max = 65536 +scale = auto + +; === Learning Rate Decay Ablation === +; 0 = no decay (constant LR), 1 = cosine annealing +[sweep.train.anneal_lr] +distribution = int_uniform +min = 0 +mean = 1 +max = 1 +scale = auto ; Settings for controlled experiments [controlled_exp.train.human_ll_coef] values = [0.0, 0.1, 0.5, 0.9] -; [controlled_exp.env.max_expert_sequences] -; values = [512, 1024, 2048] +; === Guidance Reward Parameters === +[sweep.env.guidance_speed_weight] +distribution = uniform +min = 0.05 +mean = 0.1 +max = 0.3 +scale = auto + +[sweep.env.guidance_heading_weight] +distribution = uniform +min = 0.05 +mean = 0.1 +max = 0.3 +scale = auto + +[sweep.env.waypoint_reach_threshold] +distribution = uniform +min = 1.0 +mean = 2.0 +max = 5.0 +scale = auto diff --git a/pufferlib/ocean/benchmark/metrics_sanity_check.py b/pufferlib/ocean/benchmark/metrics_sanity_check.py index 0983dfa963..9cabc42377 100644 --- a/pufferlib/ocean/benchmark/metrics_sanity_check.py +++ b/pufferlib/ocean/benchmark/metrics_sanity_check.py @@ -1,22 +1,14 @@ """ -Comprehensive evaluation script for Drive environment checkpoints. +Validation script for WOSAC log-likelihood metrics. -Evaluates all .pt checkpoints in a folder using: -1. WOSAC metrics (realism, ADE, likelihood metrics) -2. Collision rates (SDC-only control mode) - -Includes baselines for ground truth and random policy. +Idea is to check how the log-likelihood metrics change as we replace +increasing numbers of random rollouts with ground-truth data. """ import argparse -import os -import glob -from pathlib import Path import numpy as np -import pandas as pd -import torch -from pufferlib.pufferl import load_env, load_policy +from pufferlib.pufferl import load_config, load_env, load_policy from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator @@ -37,7 +29,6 @@ def replace_rollouts_with_gt(simulated_traj, gt_traj, num_replacements): def run_validation_experiment(config, vecenv): evaluator = WOSACEvaluator(config) - # Collect ground truth trajectories gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) simulated_trajectories = evaluator.collect_wosac_random_baseline(vecenv) agent_state = vecenv.driver_env.get_global_agent_state() @@ -88,24 +79,9 @@ def format_results_table(results): def main(): - parser = argparse.ArgumentParser( - description="Evaluate Drive environment checkpoints", formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("checkpoint_dir", type=str, help="Directory containing .pt checkpoint files") - parser.add_argument( - "--output", - type=str, - default="evaluation_results.csv", - help="Output CSV file path (default: evaluation_results.csv)", - ) - parser.add_argument( - "--num-collision-episodes", - type=int, - default=100, - help="Number of episodes for collision rate evaluation (default: 100)", - ) - parser.add_argument("--skip-wosac", action="store_true", help="Skip WOSAC evaluation") - parser.add_argument("--skip-collision", action="store_true", help="Skip collision rate evaluation") + parser = argparse.ArgumentParser(description="Validate WOSAC log-likelihood metrics") + parser.add_argument("--env", default="puffer_drive") + parser.add_argument("--config", default="config/ocean/drive.ini") args = parser.parse_args() config = load_config(args.env) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 16ac2ca2a3..0e5b4056e2 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -191,29 +191,66 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { PyErr_SetString(PyExc_ValueError, "episode_length must be > 0 (set in INI or kwargs)"); return -1; } + +// Allow all settings to be overridden via kwargs (ini provides defaults) +#define OVERRIDE_INT(field) \ + if (kwargs && PyDict_GetItemString(kwargs, #field)) { \ + conf.field = (int)unpack(kwargs, #field); \ + } +#define OVERRIDE_FLOAT(field) \ + if (kwargs && PyDict_GetItemString(kwargs, #field)) { \ + conf.field = (float)unpack(kwargs, #field); \ + } + + OVERRIDE_INT(action_type); + OVERRIDE_INT(dynamics_model); + OVERRIDE_FLOAT(reward_vehicle_collision); + OVERRIDE_FLOAT(reward_offroad_collision); + OVERRIDE_FLOAT(reward_goal); + OVERRIDE_FLOAT(reward_goal_post_respawn); + OVERRIDE_INT(use_guided_autonomy); + OVERRIDE_FLOAT(guidance_speed_weight); + OVERRIDE_FLOAT(guidance_heading_weight); + OVERRIDE_FLOAT(waypoint_reach_threshold); + OVERRIDE_INT(use_guidance_observations); + OVERRIDE_INT(collision_behavior); + OVERRIDE_INT(offroad_behavior); + OVERRIDE_FLOAT(dt); + OVERRIDE_INT(termination_mode); + OVERRIDE_INT(init_mode); + OVERRIDE_INT(control_mode); + OVERRIDE_INT(goal_behavior); + OVERRIDE_FLOAT(goal_target_distance); + OVERRIDE_FLOAT(goal_radius); + OVERRIDE_FLOAT(goal_speed); + +#undef OVERRIDE_INT +#undef OVERRIDE_FLOAT + + // Assign all settings from conf (which now has ini defaults + any kwargs overrides) env->action_type = conf.action_type; env->dynamics_model = conf.dynamics_model; env->reward_vehicle_collision = conf.reward_vehicle_collision; env->reward_offroad_collision = conf.reward_offroad_collision; env->reward_goal = conf.reward_goal; env->reward_goal_post_respawn = conf.reward_goal_post_respawn; - env->episode_length = conf.episode_length; - env->termination_mode = conf.termination_mode; env->use_guided_autonomy = conf.use_guided_autonomy; env->guidance_speed_weight = conf.guidance_speed_weight; env->guidance_heading_weight = conf.guidance_heading_weight; env->waypoint_reach_threshold = conf.waypoint_reach_threshold; - env->goal_radius = conf.goal_radius; + env->use_guidance_observations = conf.use_guidance_observations; + env->episode_length = conf.episode_length; + env->termination_mode = conf.termination_mode; env->collision_behavior = conf.collision_behavior; env->offroad_behavior = conf.offroad_behavior; - env->max_controlled_agents = unpack(kwargs, "max_controlled_agents"); env->dt = conf.dt; - env->init_mode = (int)unpack(kwargs, "init_mode"); - env->control_mode = (int)unpack(kwargs, "control_mode"); - env->goal_behavior = (int)unpack(kwargs, "goal_behavior"); - env->goal_target_distance = (float)unpack(kwargs, "goal_target_distance"); - env->goal_radius = (float)unpack(kwargs, "goal_radius"); - env->goal_speed = (float)unpack(kwargs, "goal_speed"); + env->init_mode = conf.init_mode; + env->control_mode = conf.control_mode; + env->goal_behavior = conf.goal_behavior; + env->goal_target_distance = conf.goal_target_distance; + env->goal_radius = conf.goal_radius; + env->goal_speed = conf.goal_speed; + env->max_controlled_agents = unpack(kwargs, "max_controlled_agents"); char *map_dir = unpack_str(kwargs, "map_dir"); int map_id = unpack(kwargs, "map_id"); int max_agents = unpack(kwargs, "max_agents"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index bbea158a98..c89a9b6e1d 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -82,8 +82,13 @@ #define PARTNER_FEATURES 7 // Ego features depend on dynamics model -#define EGO_FEATURES_CLASSIC 7 -#define EGO_FEATURES_JERK 10 +#define EGO_FEATURES_CLASSIC 8 +#define EGO_FEATURES_JERK 11 + +// Guidance waypoint observations +#define GUIDANCE_WAYPOINT_FEATURES 2 // x, y per waypoint +#define NUM_GUIDANCE_WAYPOINTS 91 // Full trajectory length +#define GUIDANCE_OBS_SIZE (NUM_GUIDANCE_WAYPOINTS * GUIDANCE_WAYPOINT_FEATURES) // 182 // Observation normalization constants #define MAX_SPEED 100.0f @@ -340,6 +345,7 @@ struct Drive { float guidance_speed_weight; // Weight for speed deviation penalty float guidance_heading_weight; // Weight for heading deviation penalty float waypoint_reach_threshold; // Distance threshold for hitting waypoints (e.g., 2.0m) + int use_guidance_observations; // Boolean: whether to include egocentric guidance waypoints in observations char *map_name; float world_mean_x; float world_mean_y; @@ -582,7 +588,6 @@ Entity *load_map_binary(const char *filename, Drive *env) { fread(&env->num_roads, sizeof(int), 1, file); env->num_entities = env->num_objects + env->num_roads; Entity *entities = (Entity *)malloc(env->num_entities * sizeof(Entity)); - for (int i = 0; i < env->num_entities; i++) { // Read base entity data fread(&entities[i].scenario_id, sizeof(int), 1, file); @@ -1320,7 +1325,7 @@ void compute_agent_metrics(Drive *env, int agent_idx) { entity = &env->entities[entity_list[i].entity_idx]; // Check for offroad collision with road edges - if (entity->type == ROAD_EDGE) { + if (entity->type == ROAD_EDGE && agent->type == VEHICLE) { int geometry_idx = entity_list[i].geometry_idx; float start[2] = {entity->traj_x[geometry_idx], entity->traj_y[geometry_idx]}; float end[2] = {entity->traj_x[geometry_idx + 1], entity->traj_y[geometry_idx + 1]}; @@ -1519,7 +1524,7 @@ void set_active_agents(Drive *env) { is_controlled = should_control_agent(env, i); - if (is_controlled) { + if (is_controlled && env->active_agent_count < env->num_agents) { active_agent_indices[env->active_agent_count] = i; env->active_agent_count++; env->entities[i].active_agent = 1; @@ -1660,7 +1665,9 @@ void c_close(Drive *env) { void allocate(Drive *env) { init(env); - int ego_dim = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int ego_dim_base = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int guidance_size = env->use_guidance_observations ? GUIDANCE_OBS_SIZE : 0; + int ego_dim = ego_dim_base + guidance_size; int max_obs = ego_dim + PARTNER_FEATURES * (MAX_AGENTS - 1) + ROAD_FEATURES * MAX_ROAD_SEGMENT_OBSERVATIONS; env->observations = (float *)calloc(env->active_agent_count * max_obs, sizeof(float)); env->actions = (float *)calloc(env->active_agent_count * 2, sizeof(float)); @@ -1687,8 +1694,6 @@ float clipSpeed(float speed) { return speed; } -float normalize_value(float value, float min, float max) { return (value - min) / (max - min); } - void move_dynamics(Drive *env, int action_idx, int agent_idx) { Entity *agent = &env->entities[agent_idx]; if (agent->removed) @@ -1951,7 +1956,9 @@ void c_get_road_edge_polylines(Drive *env, float *x_out, float *y_out, int *leng } void compute_observations(Drive *env) { - int ego_dim = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int ego_dim_base = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int guidance_size = env->use_guidance_observations ? GUIDANCE_OBS_SIZE : 0; + int ego_dim = ego_dim_base + guidance_size; int max_obs = ego_dim + PARTNER_FEATURES * (MAX_AGENTS - 1) + ROAD_FEATURES * MAX_ROAD_SEGMENT_OBSERVATIONS; memset(env->observations, 0, max_obs * env->active_agent_count * sizeof(float)); float (*observations)[max_obs] = (float (*)[max_obs])env->observations; @@ -1989,8 +1996,39 @@ void compute_observations(Drive *env) { (ego_entity->a_long < 0) ? ego_entity->a_long / (-JERK_LONG[0]) : ego_entity->a_long / JERK_LONG[3]; obs[8] = ego_entity->a_lat / JERK_LAT[2]; obs[9] = (ego_entity->respawn_timestep != -1) ? 1 : 0; + obs[10] = ego_entity->type / 3.0f; } else { obs[6] = (ego_entity->respawn_timestep != -1) ? 1 : 0; + obs[7] = ego_entity->type / 3.0f; + } + + // Egocentric guidance waypoint observations + if (env->use_guidance_observations) { + int guidance_start_idx = ego_dim_base; + for (int w = 0; w < NUM_GUIDANCE_WAYPOINTS; w++) { + int traj_idx = env->timestep + w; + float wp_ego_x = 0.0f; + float wp_ego_y = 0.0f; + + // Check if trajectory index is valid and within bounds + if (traj_idx < ego_entity->array_size && ego_entity->traj_valid[traj_idx]) { + float wp_world_x = ego_entity->traj_x[traj_idx]; + float wp_world_y = ego_entity->traj_y[traj_idx]; + + // Skip invalid positions + if (wp_world_x != INVALID_POSITION && wp_world_y != INVALID_POSITION) { + // Transform to egocentric coordinates + float dx = wp_world_x - ego_entity->x; + float dy = wp_world_y - ego_entity->y; + wp_ego_x = dx * cos_heading + dy * sin_heading; + wp_ego_y = -dx * sin_heading + dy * cos_heading; + } + } + + // Store normalized egocentric waypoint (same scale as goal: 0.005) + obs[guidance_start_idx + w * 2] = wp_ego_x * 0.005f; + obs[guidance_start_idx + w * 2 + 1] = wp_ego_y * 0.005f; + } } // Relative Pos of other cars @@ -2316,7 +2354,6 @@ void c_step(Drive *env) { int lane_aligned = env->entities[agent_idx].metrics_array[LANE_ALIGNED_IDX]; env->logs[i].lane_alignment_rate = lane_aligned; - // Apply guided autonomy reward if (env->use_guided_autonomy) { float ga_reward = compute_guided_autonomy_reward(env, agent_idx, i); @@ -2383,8 +2420,10 @@ void c_step(Drive *env) { void c_collect_expert_data(Drive *env, float *expert_actions_discrete_out, float *expert_actions_continuous_out, float *expert_obs_out) { - int ego_dim = (env->dynamics_model == JERK) ? 10 : 7; - int max_obs = ego_dim + 7 * (MAX_AGENTS - 1) + 7 * MAX_ROAD_SEGMENT_OBSERVATIONS; + int ego_dim_base = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int guidance_size = env->use_guidance_observations ? GUIDANCE_OBS_SIZE : 0; + int ego_dim = ego_dim_base + guidance_size; + int max_obs = ego_dim + PARTNER_FEATURES * (MAX_AGENTS - 1) + ROAD_FEATURES * MAX_ROAD_SEGMENT_OBSERVATIONS; int original_timestep = env->timestep; @@ -2636,7 +2675,9 @@ void draw_agent_obs(Drive *env, int agent_index, int mode, int obs_only, int las return; } - int ego_dim = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int ego_dim_base = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int guidance_size = env->use_guidance_observations ? GUIDANCE_OBS_SIZE : 0; + int ego_dim = ego_dim_base + guidance_size; int max_obs = ego_dim + PARTNER_FEATURES * (MAX_AGENTS - 1) + ROAD_FEATURES * MAX_ROAD_SEGMENT_OBSERVATIONS; float (*observations)[max_obs] = (float (*)[max_obs])env->observations; float *agent_obs = &observations[agent_index][0]; diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 1d4e186e27..0d5a385519 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -29,6 +29,7 @@ def __init__( guidance_speed_weight=0.0, guidance_heading_weight=0.0, waypoint_reach_threshold=2.0, + use_guidance_observations=0, goal_behavior=0, goal_target_distance=10.0, goal_radius=2.0, @@ -76,6 +77,7 @@ def __init__( self.guidance_speed_weight = guidance_speed_weight self.guidance_heading_weight = guidance_heading_weight self.waypoint_reach_threshold = waypoint_reach_threshold + self.use_guidance_observations = use_guidance_observations self.human_agent_idx = human_agent_idx self.episode_length = episode_length self.termination_mode = termination_mode @@ -85,9 +87,11 @@ def __init__( self.save_data_to_disk = save_data_to_disk # Observation space calculation - self.ego_features = {"classic": binding.EGO_FEATURES_CLASSIC, "jerk": binding.EGO_FEATURES_JERK}.get( + ego_features_base = {"classic": binding.EGO_FEATURES_CLASSIC, "jerk": binding.EGO_FEATURES_JERK}.get( dynamics_model ) + guidance_obs_size = binding.GUIDANCE_OBS_SIZE if use_guidance_observations else 0 + self.ego_features = ego_features_base + guidance_obs_size # Extract observation shapes from constants # These need to be defined in C, since they determine the shape of the arrays @@ -149,6 +153,7 @@ def __init__( raise ValueError(f"action_space must be 'discrete' or 'continuous'. Got: {action_type}") self._action_type_flag = 0 if action_type == "discrete" else 1 + self._dynamics_model_flag = 0 if dynamics_model == "classic" else 1 # Check if resources directory exists binary_path = f"{map_dir}/map_000.bin" @@ -195,6 +200,7 @@ def __init__( self.truncations[cur:nxt], seed, action_type=self._action_type_flag, + dynamics_model=self._dynamics_model_flag, human_agent_idx=human_agent_idx, reward_vehicle_collision=reward_vehicle_collision, reward_offroad_collision=reward_offroad_collision, @@ -204,6 +210,7 @@ def __init__( guidance_speed_weight=guidance_speed_weight, guidance_heading_weight=guidance_heading_weight, waypoint_reach_threshold=waypoint_reach_threshold, + use_guidance_observations=use_guidance_observations, goal_radius=goal_radius, goal_speed=goal_speed, goal_behavior=self.goal_behavior, diff --git a/pufferlib/ocean/drive/drivenet.h b/pufferlib/ocean/drive/drivenet.h index 714cc0fc3b..7e503bfd4a 100644 --- a/pufferlib/ocean/drive/drivenet.h +++ b/pufferlib/ocean/drive/drivenet.h @@ -46,10 +46,12 @@ struct DriveNet { Multidiscrete *multidiscrete; }; -DriveNet *init_drivenet(Weights *weights, int num_agents, int dynamics_model) { +DriveNet *init_drivenet(Weights *weights, int num_agents, int dynamics_model, int use_guidance_observations) { DriveNet *net = calloc(1, sizeof(DriveNet)); // Use constants directly from drive.h - int ego_dim = (dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int ego_dim_base = (dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int guidance_size = use_guidance_observations ? GUIDANCE_OBS_SIZE : 0; + int ego_dim = ego_dim_base + guidance_size; int max_partners = MAX_AGENTS - 1; int max_road_obs = MAX_ROAD_SEGMENT_OBSERVATIONS; int partner_features = PARTNER_FEATURES; diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 3a511a2624..73554425dd 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -281,6 +281,134 @@ static void set_actions(Drive *env, int timestep, int control, DriveNet *net) { } } +void render_snapshot(Drive *env, Client *client, DriveNet *net, int frame_count, float map_height, int show_human_logs, + int show_grid, int lasers, int zoom_in, int img_width, int img_height, + const char *output_filename) { + + // Allocate trajectory storage + int num_agents = env->active_agent_count; + float *traj_x = (float *)calloc(num_agents * frame_count, sizeof(float)); + float *traj_y = (float *)calloc(num_agents * frame_count, sizeof(float)); + int *traj_valid = (int *)calloc(num_agents * frame_count, sizeof(int)); + + // Track whether each agent is done (respawned, stopped, or removed) + int *agent_done = (int *)calloc(num_agents, sizeof(int)); + + // Run simulation, record positions + printf("Running simulation for %d steps...\n", frame_count); + for (int t = 0; t < frame_count; t++) { + // Record positions before stepping + for (int i = 0; i < num_agents; i++) { + int idx = env->active_agent_indices[i]; + traj_x[i * frame_count + t] = env->entities[idx].x; + traj_y[i * frame_count + t] = env->entities[idx].y; + traj_valid[i * frame_count + t] = !agent_done[i]; + } + forward(net, env->observations, (int *)env->actions); + c_step(env); + // Check for done agents after stepping (respawn/stop/remove happens inside c_step) + for (int i = 0; i < num_agents; i++) { + int idx = env->active_agent_indices[i]; + if (env->entities[idx].respawn_timestep != -1 || env->entities[idx].stopped || env->entities[idx].removed) { + agent_done[i] = 1; + } + } + } + free(agent_done); + + // Reset env to get initial scene for the background + c_reset(env); + + // Render one top-down frame + BeginDrawing(); + + Camera3D camera = {0}; + if (zoom_in) { + camera.position = (Vector3){0.0f, 0.0f, 500.0f}; + camera.target = (Vector3){0.0f, 0.0f, 0.0f}; + camera.fovy = map_height; + } else { + camera.position = (Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 500.0f}; + camera.target = (Vector3){env->grid_map->top_left_x, env->grid_map->bottom_right_y, 0.0f}; + camera.fovy = 2 * map_height; + } + camera.up = (Vector3){0.0f, -1.0f, 0.0f}; + camera.projection = CAMERA_ORTHOGRAPHIC; + + client->width = img_width; + client->height = img_height; + + Color road = (Color){35, 35, 37, 255}; + ClearBackground(road); + BeginMode3D(camera); + rlEnableDepthTest(); + + // Draw human replay trajectories if enabled + if (show_human_logs) { + for (int i = 0; i < num_agents; i++) { + int idx = env->active_agent_indices[i]; + Vector3 prev_point = {0}; + bool has_prev = false; + + for (int j = 0; j < env->entities[idx].array_size; j++) { + float x = env->entities[idx].traj_x[j]; + float y = env->entities[idx].traj_y[j]; + float valid = env->entities[idx].traj_valid[j]; + + if (!valid) { + has_prev = false; + continue; + } + + Vector3 curr_point = {x, y, 0.5f}; + if (has_prev) { + DrawLine3D(prev_point, curr_point, Fade(LIGHTGREEN, 0.6f)); + } + prev_point = curr_point; + has_prev = true; + } + } + } + + // Draw policy trajectories (blue for all vehicles) + for (int i = 0; i < num_agents; i++) { + // Draw trajectory lines + for (int t = 1; t < frame_count; t++) { + if (traj_valid[i * frame_count + t] && traj_valid[i * frame_count + t - 1]) { + Vector3 start = {traj_x[i * frame_count + t - 1], traj_y[i * frame_count + t - 1], 1.0f}; + Vector3 end = {traj_x[i * frame_count + t], traj_y[i * frame_count + t], 1.0f}; + DrawLine3D(start, end, BLUE); + } + } + + // Draw start marker + if (traj_valid[i * frame_count]) { + DrawSphere((Vector3){traj_x[i * frame_count], traj_y[i * frame_count], 1.0f}, 0.8f, BLUE); + } + + // Draw end marker (last valid position) + for (int t = frame_count - 1; t >= 0; t--) { + if (traj_valid[i * frame_count + t]) { + DrawSphere((Vector3){traj_x[i * frame_count + t], traj_y[i * frame_count + t], 1.0f}, 0.8f, + Fade(BLUE, 0.5f)); + break; + } + } + } + + // Draw the scene (roads, vehicles at t=0, curbs, grid) + draw_scene(env, client, 1, 0, lasers, show_grid); + EndMode3D(); + EndDrawing(); + + TakeScreenshot(output_filename); + printf("Snapshot saved to %s\n", output_filename); + + free(traj_x); + free(traj_y); + free(traj_valid); +} + int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, const char *output_agent, int num_maps, int zoom_in, int control) { @@ -338,6 +466,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o .init_steps = conf.init_steps, .init_mode = conf.init_mode, .control_mode = conf.control_mode, + .use_guidance_observations = conf.use_guidance_observations, .map_name = (char *)map_name, }; @@ -392,10 +521,50 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o if (control == ACTIONS_FROM_POLICY) { weights = load_weights(policy_name); printf("Active agents in map: %d\n", env.active_agent_count); - net = init_drivenet(weights, env.active_agent_count, env.dynamics_model); + net = init_drivenet(weights, env.active_agent_count, env.dynamics_model, env.use_guidance_observations); } int frame_count = env.episode_length > 0 ? env.episode_length : TRAJECTORY_LENGTH_DEFAULT; + + // Snapshot mode: render single PNG with trajectory overlays + if (conf.snapshot_only) { + char snapshot_filename[256]; + if (output_topdown != NULL) { + strcpy(snapshot_filename, output_topdown); + char *ext = strrchr(snapshot_filename, '.'); + if (ext != NULL) { + strcpy(ext, ".png"); + } + } else { + char policy_base[256]; + strcpy(policy_base, policy_name); + *strrchr(policy_base, '.') = '\0'; + + char map[256]; + strcpy(map, basename((char *)map_name)); + *strrchr(map, '.') = '\0'; + + char video_dir[256]; + sprintf(video_dir, "%s/video", policy_base); + char mkdir_cmd[512]; + snprintf(mkdir_cmd, sizeof(mkdir_cmd), "mkdir -p \"%s\"", video_dir); + system(mkdir_cmd); + + sprintf(snapshot_filename, "%s/video/%s_snapshot.png", policy_base, map); + } + + render_snapshot(&env, client, net, frame_count, map_height, show_human_logs, show_grid, lasers, zoom_in, + img_width, img_height, snapshot_filename); + + CloseWindow(); + free(client); + free_allocated(&env); + free_drivenet(net); + free(weights); + return 0; + } + + // Video mode (existing behavior) char filename_topdown[256]; char filename_agent[256]; @@ -508,6 +677,13 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o } int main(int argc, char *argv[]) { + // Parse configuration from INI file + env_init_config conf = {0}; + const char *ini_file = "pufferlib/config/ocean/drive.ini"; + if (ini_parse(ini_file, handler, &conf) < 0) { + fprintf(stderr, "Error: Could not load %s. Cannot determine environment configuration.\n", ini_file); + return -1; + } // Visualization-only parameters (not in [env] section) int show_grid = 0; int obs_only = 0; @@ -522,7 +698,7 @@ int main(int argc, char *argv[]) { const char *policy_name = "resources/drive/puffer_drive_weights.bin"; const char *output_topdown = NULL; const char *output_agent = NULL; - int num_maps = 100; + int num_maps = conf.num_maps; int control = ACTIONS_FROM_POLICY; // ACTIONS_INFERRED_FROM_HUMAN; // Parse command line arguments diff --git a/pufferlib/ocean/env_binding.h b/pufferlib/ocean/env_binding.h index 803dcdfc82..3672ab24a0 100644 --- a/pufferlib/ocean/env_binding.h +++ b/pufferlib/ocean/env_binding.h @@ -1006,7 +1006,6 @@ static PyObject *vec_collect_expert_data(PyObject *self, PyObject *args) { } int trajectory_length = PyArray_DIM(expert_actions_discrete, 0); - int max_obs = 7 + 7 * (MAX_AGENTS - 1) + 7 * MAX_ROAD_SEGMENT_OBSERVATIONS; // Process each environment int agent_offset = 0; @@ -1014,6 +1013,12 @@ static PyObject *vec_collect_expert_data(PyObject *self, PyObject *args) { Env *env = vec->envs[i]; int num_agents = env->active_agent_count; + // Compute max_obs dynamically based on environment config + int ego_dim_base = (env->dynamics_model == JERK) ? EGO_FEATURES_JERK : EGO_FEATURES_CLASSIC; + int guidance_size = env->use_guidance_observations ? GUIDANCE_OBS_SIZE : 0; + int ego_dim = ego_dim_base + guidance_size; + int max_obs = ego_dim + PARTNER_FEATURES * (MAX_AGENTS - 1) + ROAD_FEATURES * MAX_ROAD_SEGMENT_OBSERVATIONS; + // Allocate temporary buffers for this environment float *env_actions_discrete = (float *)malloc(trajectory_length * num_agents * 1 * sizeof(float)); float *env_actions_continuous = (float *)malloc(trajectory_length * num_agents * 2 * sizeof(float)); @@ -1115,6 +1120,7 @@ PyMODINIT_FUNC PyInit_binding(void) { PyModule_AddIntConstant(m, "PARTNER_FEATURES", PARTNER_FEATURES); PyModule_AddIntConstant(m, "EGO_FEATURES_CLASSIC", EGO_FEATURES_CLASSIC); PyModule_AddIntConstant(m, "EGO_FEATURES_JERK", EGO_FEATURES_JERK); + PyModule_AddIntConstant(m, "GUIDANCE_OBS_SIZE", GUIDANCE_OBS_SIZE); return m; } diff --git a/pufferlib/ocean/env_config.h b/pufferlib/ocean/env_config.h index f4dd7acdf5..391d0a1c98 100644 --- a/pufferlib/ocean/env_config.h +++ b/pufferlib/ocean/env_config.h @@ -19,6 +19,7 @@ typedef struct { float guidance_speed_weight; // Weight for speed deviation penalty float guidance_heading_weight; // Weight for heading deviation penalty float waypoint_reach_threshold; // Distance threshold for hitting waypoints + int use_guidance_observations; // Boolean: whether to include egocentric guidance waypoints in observations float goal_radius; float goal_speed; int collision_behavior; @@ -32,7 +33,9 @@ typedef struct { int init_steps; int init_mode; int control_mode; + int num_maps; char map_dir[256]; + int snapshot_only; } env_init_config; // INI file parser handler - parses all environment configuration from drive.ini @@ -80,6 +83,8 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->guidance_heading_weight = atof(value); } else if (MATCH("env", "waypoint_reach_threshold")) { env_config->waypoint_reach_threshold = atof(value); + } else if (MATCH("env", "use_guidance_observations")) { + env_config->use_guidance_observations = atoi(value); } else if (MATCH("env", "goal_radius")) { env_config->goal_radius = atof(value); } else if (MATCH("env", "goal_speed")) { @@ -126,6 +131,10 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->map_dir[sizeof(env_config->map_dir) - 1] = '\0'; } // printf("Parsed map_dir: '%s'\n", env_config->map_dir); + } else if (MATCH("env", "num_maps")) { + env_config->num_maps = atoi(value); + } else if (MATCH("render", "snapshot_only")) { + env_config->snapshot_only = (strcmp(value, "True") == 0 || strcmp(value, "true") == 0 || atoi(value) == 1); } else { return 0; // Unknown section/name, indicate failure to handle } diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index 98cc11a4ea..57899227b7 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -22,8 +22,8 @@ def __init__(self, env, input_size=128, hidden_size=128, **kwargs): self.road_features = env.road_features self.road_features_after_onehot = env.road_features + 6 # 6 is the number of one-hot encoded categories - # Determine ego dimension from environment's dynamics model - self.ego_dim = 10 if env.dynamics_model == "jerk" else 7 + # Determine ego dimension from environment (includes guidance observations if enabled) + self.ego_dim = env.ego_features self.ego_encoder = nn.Sequential( pufferlib.pytorch.layer_init(nn.Linear(self.ego_dim, input_size)), diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index af6e56e66e..0cd116ae47 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -24,6 +24,7 @@ import numpy as np import psutil +from multiprocessing.pool import ThreadPool import torch import torch.distributed @@ -430,31 +431,33 @@ def train(self): approx_kl = ((ratio - 1) - logratio).mean() clipfrac = ((ratio - 1.0).abs() > config["clip_coef"]).float().mean() - # Compute log likelihood loss of human actions under current policy. - # 1: Sample a batch of human actions and observations from dataset - # Shape: [n_sequences, bptt_horizon, feature_dim] - discrete_human_actions, continuous_human_actions, human_observations = ( - self.vecenv.driver_env.sample_expert_data(n_samples=config["human_sequences"], return_both=True) - ) - - use_continuous = self.vecenv.driver_env._action_type_flag == 1 - human_actions = continuous_human_actions if use_continuous else discrete_human_actions - human_actions = human_actions.reshape(-1, 1).to(device) - human_observations = human_observations.to(device) + # Compute log likelihood loss of human actions under current policy (if enabled) + if config["human_sequences"] > 0: + # 1: Sample a batch of human actions and observations from dataset + # Shape: [n_sequences, bptt_horizon, feature_dim] + discrete_human_actions, continuous_human_actions, human_observations = ( + self.vecenv.driver_env.sample_expert_data(n_samples=config["human_sequences"], return_both=True) + ) - # 2: Compute the log-likelihood of human actions under the current policy, - # given the corresponding human observations. A higher likelihood indicates - # that the policy behaves more like a human under the same observations. - human_state = dict( - lstm_h=None, - lstm_c=None, - ) + # Select appropriate action type for training + use_continuous = self.vecenv.driver_env._action_type_flag == 1 + human_actions = continuous_human_actions if use_continuous else discrete_human_actions + human_actions = human_actions.to(device) + human_observations = human_observations.to(device) + + # 2: Compute the log-likelihood of human actions under the current policy, + # given the corresponding human observations. A higher likelihood indicates + # that the policy behaves more like a human under the same observations. + human_state = dict( + lstm_h=None, + lstm_c=None, + ) - human_logits, _ = self.policy(human_observations, human_state) + human_logits, _ = self.policy(human_observations, human_state) - _, human_log_prob, human_entropy = pufferlib.pytorch.sample_logits( - logits=human_logits, action=human_actions - ) + _, human_log_prob, human_entropy = pufferlib.pytorch.sample_logits( + logits=human_logits, action=human_actions + ) adv = advantages[idx] adv = compute_puff_advantage( @@ -482,7 +485,10 @@ def train(self): v_loss_clipped = (v_clipped - mb_returns) ** 2 v_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean() - human_nll = human_log_prob.mean() + if config["human_sequences"] > 0: + human_nll = human_log_prob.mean() + else: + human_nll = torch.tensor(0.0, device=device) entropy_loss = entropy.mean() @@ -507,7 +513,8 @@ def train(self): losses["clipfrac"] += clipfrac.item() / self.total_minibatches losses["importance"] += ratio.mean().item() / self.total_minibatches losses["human_nll"] += human_nll / self.total_minibatches - self.realism["human_log_prob"] = human_log_prob.mean().item() + if config["human_sequences"] > 0: + self.realism["human_log_prob"] = human_log_prob.mean().item() # Learn on accumulated minibatches profile("learn", epoch) @@ -1077,7 +1084,7 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): elif args["wandb"]: logger = WandbLogger(args) - train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {})) + train_config = dict(**args["train"], env=env_name, eval=args.get("eval", {}), env_args=args.get("env", {})) pufferl = PuffeRL(train_config, vecenv, policy, logger) all_logs = [] @@ -1630,8 +1637,96 @@ def puffer_type(value): return args +def render(env_name, args=None): + args = args or load_config(env_name) + render_configs = args.get("render", {}) + + # Renders first num_maps from map_dir using visualize binary + try: + map_dir = render_configs["map_dir"] + num_maps = render_configs.get("num_maps", 1) + view_mode = render_configs["view_mode"] + render_policy_path = render_configs["policy_path"] + overwork = render_configs.get("overwork", False) + num_workers = args["vec"]["num_workers"] + output_dir = render_configs["output_dir"] + except KeyError as e: + raise pufferlib.APIUsageError(f"Missing render config: {e}") + + cpu_cores = psutil.cpu_count(logical=False) + if num_workers > cpu_cores and not overwork: + raise pufferlib.APIUsageError( + " ".join( + [ + f"num_workers ({num_workers}) > hardware cores ({cpu_cores}) is disallowed by default.", + "PufferLib multiprocessing is heavily optimized for 1 process per hardware core.", + "If you really want to do this, set overwork=True (--vec-overwork in our demo.py).", + ] + ) + ) + + if num_maps > len(os.listdir(map_dir)): + num_maps = len(os.listdir(map_dir)) + + render_maps = [os.path.join(map_dir, f) for f in sorted(os.listdir(map_dir)) if f.endswith(".bin")][:num_maps] + os.makedirs(output_dir, exist_ok=True) + + # Rebuild visualize binary + ensure_drive_binary() + + snapshot_only = render_configs.get("snapshot_only", False) + + def render_task(map_path): + base_cmd = ( + ["./visualize"] + if sys.platform == "darwin" + else ["xvfb-run", "-a", "-s", "-screen 0 1280x720x24", "./visualize"] + ) + cmd = base_cmd.copy() + cmd.extend(["--map-name", map_path]) + if render_configs.get("show_grid", False): + cmd.append("--show-grid") + if render_configs.get("obs_only", False): + cmd.append("--obs-only") + if render_configs.get("show_lasers", False): + cmd.append("--lasers") + if render_configs.get("show_human_logs", False): + cmd.append("--show-human-logs") + if render_configs.get("zoom_in", False): + cmd.append("--zoom-in") + cmd.extend(["--view", view_mode]) + if render_policy_path is not None: + cmd.extend(["--policy-name", render_policy_path]) + + map_name = os.path.basename(map_path).replace(".bin", "") + + if snapshot_only: + cmd.extend(["--output-topdown", os.path.join(output_dir, f"topdown_{map_name}.png")]) + else: + if view_mode == "topdown" or view_mode == "both": + cmd.extend(["--output-topdown", os.path.join(output_dir, f"topdown_{map_name}.mp4")]) + if view_mode == "agent" or view_mode == "both": + cmd.extend(["--output-agent", os.path.join(output_dir, f"agent_{map_name}.mp4")]) + + env_vars = os.environ.copy() + env_vars["ASAN_OPTIONS"] = "exitcode=0" + try: + result = subprocess.run(cmd, cwd=os.getcwd(), capture_output=True, text=True, timeout=600, env=env_vars) + if result.returncode != 0: + print(f"Error rendering {map_name}: {result.stderr}") + except subprocess.TimeoutExpired: + print(f"Timeout rendering {map_name}: exceeded 600 seconds") + + if render_maps: + output_type = "snapshots" if snapshot_only else "videos" + print(f"Rendering {len(render_maps)} {output_type} from {map_dir} with {num_workers} workers...") + with ThreadPool(num_workers) as pool: + pool.map(render_task, render_maps) + print(f"Finished rendering {output_type} to {output_dir}") + + def main(): - err = "Usage: puffer [train, eval, sweep, controlled_exp, autotune, profile, export, sanity] [env_name] [optional args]. --help for more info" + err = "Usage: puffer [train, eval, sweep, controlled_exp, autotune, profile, export, sanity, render] [env_name] [optional args]. --help for more info" if len(sys.argv) < 3: raise pufferlib.APIUsageError(err) @@ -1653,6 +1748,8 @@ def main(): export(env_name=env_name) elif mode == "sanity": sanity(env_name=env_name) + elif mode == "render": + render(env_name=env_name) else: raise pufferlib.APIUsageError(err) diff --git a/pufferlib/utils.py b/pufferlib/utils.py index e82c1cd620..12611fba47 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -24,6 +24,7 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): # Prepare evaluation command eval_config = config["eval"] + env_config = config.get("env_args", {}) cmd = [ sys.executable, "-m", @@ -40,6 +41,11 @@ def run_human_replay_eval_in_subprocess(config, logger, global_step): str(eval_config["human_replay_num_agents"]), "--eval.human-replay-control-mode", str(eval_config["human_replay_control_mode"]), + # Pass through environment config that affects observation size + "--env.use-guidance-observations", + str(env_config.get("use_guidance_observations", 0)), + "--env.dynamics-model", + str(env_config.get("dynamics_model", "classic")), ] # Run human replay evaluation in subprocess @@ -100,6 +106,7 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): # Prepare evaluation command eval_config = config.get("eval", {}) + env_config = config.get("env_args", {}) cmd = [ sys.executable, "-m", @@ -126,6 +133,13 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): str(eval_config.get("wosac_goal_radius", 1.0)), "--eval.wosac-sanity-check", str(eval_config.get("wosac_sanity_check", False)), + "--eval.wosac-aggregate-results", + str(eval_config.get("wosac_aggregate_results", True)), + # Pass through environment config that affects observation size + "--env.use-guidance-observations", + str(env_config.get("use_guidance_observations", 0)), + "--env.dynamics-model", + str(env_config.get("dynamics_model", "classic")), ] if not model_files: