Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,4 @@ pufferlib/resources/drive/output*.mp4

# Local TODO tracking
TODO.md
*.mp4
60 changes: 32 additions & 28 deletions evaluate_human_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def main():
print("Beginning human evaluations using HumanReplayEvaluator")
parser = argparse.ArgumentParser()
parser.add_argument("--policy-path", type=str, required=True)
parser.add_argument("--policy-architecture", type=str, default="Recurrent")
parser.add_argument("--rnn-name", type=str, default="Recurrent")
parser.add_argument("--num-maps", type=int, default=10)
parser.add_argument("--num-rollouts", type=int, default=100)
parser.add_argument("--num-agents", type=int, default=64)
Expand All @@ -119,7 +119,7 @@ def main():

print(f"Evaluation Configuration:")
print(f" Policy: {args_parsed.policy_path}")
print(f" Policy Architecture: {args_parsed.policy_architecture}")
print(f" RNN Name: {args_parsed.rnn_name}")
print(f" Num maps: {args_parsed.num_maps}")
print(f" Total rollouts: {args_parsed.num_rollouts}")
print(f" Num agents per env: {args_parsed.num_agents}")
Expand All @@ -133,14 +133,13 @@ def main():
make_env = env_creator(env_name)

scenario_length = 91
context_length = args_parsed.k_scenarios * scenario_length
horizon = args_parsed.k_scenarios * scenario_length

args = {
"rnn_name": args_parsed.rnn_name,
"train": {
"device": args_parsed.device,
"use_rnn": args_parsed.policy_architecture == "Recurrent",
"policy_architecture": args_parsed.policy_architecture,
"context_window": context_length,
"horizon": horizon,
},
"env": {
"num_agents": args_parsed.num_agents,
Expand Down Expand Up @@ -176,10 +175,10 @@ def main():
print("Loading policy...")
temp_env = make_env(**args["env"])

if args_parsed.policy_architecture == "Recurrent":
base_policy = Drive(temp_env, input_size=64, hidden_size=256)
if args_parsed.rnn_name == "Recurrent":
base_policy = Drive(temp_env, input_size=128, hidden_size=256)
policy = Recurrent(temp_env, base_policy, input_size=256, hidden_size=256).to(args_parsed.device)
elif args_parsed.policy_architecture == "Transformer":
elif args_parsed.rnn_name == "Transformer":
base_policy = Drive(temp_env, input_size=128, hidden_size=256)
policy = Transformer(
temp_env,
Expand All @@ -188,7 +187,7 @@ def main():
hidden_size=256,
num_layers=2,
num_heads=4,
context_length=context_length,
horizon=horizon,
).to(args_parsed.device)

state_dict = torch.load(args_parsed.policy_path, map_location=args_parsed.device)
Expand Down Expand Up @@ -242,6 +241,13 @@ def main():
values = [r.get(key, 0) for r in all_results]
aggregated[key] = float(np.mean(values))

# Aggregate scenario-specific metrics (scenario_0_*, scenario_1_*, etc.)
scenario_keys = [k for k in all_keys if k.startswith("scenario_")]
for key in scenario_keys:
values = [r.get(key, 0) for r in all_results]
aggregated[key] = float(np.mean(values))

if delta_keys:
# Derive last scenario metrics from first + delta
# Extract first scenario metrics
first_scenario_keys = [k for k in metric_keys if k not in ["n"]]
Expand Down Expand Up @@ -272,24 +278,22 @@ def main():
json.dump(aggregated, f, indent=2)

# Print results
if args_parsed.adaptive_driving_agent and delta_keys:
print(f"\n0-Shot Performance (First Scenario):")
print(f" Score: {aggregated.get('first_scenario_score', float('nan')):.3f}")
print(f" Collision: {aggregated.get('first_scenario_collision_rate', float('nan')):.3f}")
print(f" Offroad: {aggregated.get('first_scenario_offroad_rate', float('nan')):.3f}")
print(f" Return: {aggregated.get('first_scenario_episode_return', float('nan')):.2f}")

print(f"\nAdapted Performance (Last Scenario):")
print(f" Score: {aggregated.get('last_scenario_score', float('nan')):.3f}")
print(f" Collision: {aggregated.get('last_scenario_collision_rate', float('nan')):.3f}")
print(f" Offroad: {aggregated.get('last_scenario_offroad_rate', float('nan')):.3f}")
print(f" Return: {aggregated.get('last_scenario_episode_return', float('nan')):.2f}")

print(f"\nAdaptive Metrics (Delta):")
print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}")
print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}")
print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}")
print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}")
if args_parsed.adaptive_driving_agent:
# Print per-scenario metrics
for i in range(args_parsed.k_scenarios):
label = "0-Shot" if i == 0 else f"Scenario {i}"
print(f"\n{label} Performance:")
print(f" Score: {aggregated.get(f'scenario_{i}_score', float('nan')):.3f}")
print(f" Collision: {aggregated.get(f'scenario_{i}_collision_rate', float('nan')):.3f}")
print(f" Offroad: {aggregated.get(f'scenario_{i}_offroad_rate', float('nan')):.3f}")
print(f" Return: {aggregated.get(f'scenario_{i}_episode_return', float('nan')):.2f}")

if delta_keys:
print(f"\nAdaptive Metrics (Delta: Last - First):")
print(f" Score: {aggregated.get('ada_delta_score', float('nan')):.4f}")
print(f" Collision rate: {aggregated.get('ada_delta_collision_rate', float('nan')):.4f}")
print(f" Offroad rate: {aggregated.get('ada_delta_offroad_rate', float('nan')):.4f}")
print(f" Episode return: {aggregated.get('ada_delta_episode_return', float('nan')):.4f}")

print(f"\nSaved to {args_parsed.output}")
import sys
Expand Down
4 changes: 2 additions & 2 deletions pufferlib/config/default.ini
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ minibatch_size = 8192

# Accumulate gradients above this size
max_minibatch_size = 32768
bptt_horizon = 64
horizon = 64
compile = False
compile_mode = max-autotune-no-cudagraphs
compile_fullgraph = True
Expand Down Expand Up @@ -81,7 +81,7 @@ downsample = 10
; mean = 1e8
; scale = time

; [sweep.train.bptt_horizon]
; [sweep.train.horizon]
; distribution = uniform_pow2
; min = 16
; max = 64
Expand Down
48 changes: 29 additions & 19 deletions pufferlib/config/ocean/adaptive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@
package = ocean
env_name = puffer_adaptive_drive
policy_name = Drive
transformer_name = Transformer
; Changed from rnn_name
policy_architecture = Transformer

[vec]
num_workers = 16
num_envs = 16
batch_size = 2
batch_size = 1
; backend = Serial

[policy]
input_size = 128
; Increased from 64 for richer representations
hidden_size = 256

[rnn]
input_size = 256
hidden_size = 256

[transformer]
Expand All @@ -23,14 +25,13 @@ num_layers = 2
; Number of transformer layers
num_heads = 4
; Number of attention heads (must divide hidden_size)
; context_length = 182
; k_scenarios (2) * scenario_length (91) = maximum attention span
; Transformer uses `horizon` from [train] section for attention span
dropout = 0.0
; Dropout (keep at 0 for RL stability initially)

[env]
num_agents = 1512
num_ego_agents = 756
num_agents = 1024
num_ego_agents = 512
; Options: discrete, continuous
action_type = discrete
; Options: classic, jerk
Expand Down Expand Up @@ -60,15 +61,15 @@ k_scenarios = 2
termination_mode = 1
; 0 - terminate at episode_length, 1 - terminate after all agents have been reset
map_dir = "resources/drive/binaries/training"
num_maps = 1000
num_maps = 10000
; Determines which step of the trajectory to initialize the agents at upon reset
init_steps = 0
; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only"
control_mode = "control_vehicles"
; Options: "created_all_valid", "create_only_controlled"
init_mode = "create_all_valid"
; train with co players
co_player_enabled = False
co_player_enabled = True


[env.conditioning]
Expand All @@ -87,15 +88,24 @@ discount_weight_ub = 0.98

[env.co_player_policy]
policy_name = Drive
rnn_name = Recurrent
; Options: "Recurrent", "Transformer"
architecture = Recurrent
policy_path = "pufferlib/resources/drive/policies/varied_discount.pt"
input_size = 64
input_size = 128
hidden_size = 256

[env.co_player_policy.rnn]
input_size = 256
hidden_size = 256

[env.co_player_policy.transformer]
input_size = 256
hidden_size = 256
num_layers = 2
num_heads = 4
horizon = 91
dropout = 0.0

[env.co_player_policy.conditioning]
; Options: "none", "reward", "entropy", "discount", "all"
type = "all"
Expand All @@ -114,24 +124,22 @@ discount_weight_ub = 0.98
seed=42
total_timesteps = 2_000_000_000
anneal_lr = True
; Needs to be: num_agents * num_workers * context_window
; Needs to be: num_agents * num_workers * horizon
batch_size = auto
minibatch_size = 36400
; 400 * 91
; 200 * 182 (must be divisible by horizon = k_scenarios * scenario_length)
max_minibatch_size = 36400
minibatch_multiplier = 400
policy_architecture = Transformer
; Matches scenario_length for buffer organization
bptt_horizon = 32
; Keep for backward compatibility
; Sequence length - overridden to k_scenarios * scenario_length for adaptive
horizon = 91
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_eps = 1e-8
clip_coef = 0.2
ent_coef = 0.005
gae_lambda = 0.95
gamma = 0.98
learning_rate = 0.0003
learning_rate = 0.003
; Increased from 0.0003 (NOTE: stale comment previously claimed a reduction)
max_grad_norm = 1.0
prio_alpha = 0.85
Expand Down Expand Up @@ -193,6 +201,8 @@ human_replay_num_agents = 32
human_replay_num_rollouts = 100
; Number of maps to use for human replay evaluation
human_replay_num_maps = 100
; Number of maps to render for human replay (subset of eval maps)
human_replay_render_num_maps = 2

[sweep.train.learning_rate]
distribution = log_normal
Expand Down
59 changes: 32 additions & 27 deletions pufferlib/config/ocean/drive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
package = ocean
env_name = puffer_drive
policy_name = Drive
rnn_name = Transformer
policy_architecture = Transformer

[vec]
num_workers = 16
Expand All @@ -11,12 +11,12 @@ batch_size = 2
; backend = Serial

[policy]
input_size = 64
input_size = 128
hidden_size = 256

; [rnn]
; input_size = 256
; hidden_size = 256
[rnn]
input_size = 256
hidden_size = 256

[transformer]
input_size = 256
Expand All @@ -25,13 +25,12 @@ num_layers = 2
; Number of transformer layers
num_heads = 4
; Number of attention heads (must divide hidden_size)
context_window = 32
; k_scenarios (2) * scenario_length (91) = maximum attention span
dropout = 0.0
; Dropout (keep at 0 for RL stability initially)

[env]
num_agents = 512
num_agents = 1024
num_ego_agents = 512
; Options: discrete, continuous
action_type = discrete
Expand All @@ -47,7 +46,7 @@ goal_radius = 2.0
; Max target speed in m/s for the agent to maintain towards the goal
goal_speed = 100.0
; What to do when the goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop"
goal_behavior = 1
goal_behavior = 0
; Determines the target distance to the new goal in the case of goal_behavior = generate_new_goals.
; Large numbers will select a goal point further away from the agent's current position.
goal_target_distance = 30.0
Expand Down Expand Up @@ -112,16 +111,15 @@ discount_weight_ub = 0.80
[train]
seed=42
total_timesteps = 2_000_000_000
# learning_rate = 0.02
# gamma = 0.985
anneal_lr = True
; Needs to be: num_agents * num_workers * BPTT horizon
; Needs to be: num_agents * num_workers * horizon
batch_size = auto
minibatch_size = 32768
max_minibatch_size = 32768
; minibatch_size = 256
; max_minibatch_size = 256
bptt_horizon = 32
minibatch_size = 32760
; 360 * 91
max_minibatch_size = 32760
minibatch_multiplier = 360
; Sequence length for training - matches scenario_length for full episode context
horizon = 91
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_eps = 1e-8
Expand All @@ -130,17 +128,16 @@ ent_coef = 0.005
gae_lambda = 0.95
gamma = 0.98
learning_rate = 0.003
max_grad_norm = 1
prio_alpha = 0.8499999999999999
prio_beta0 = 0.8499999999999999
max_grad_norm = 1.0
prio_alpha = 0.85
prio_beta0 = 0.85
update_epochs = 1
vf_clip_coef = 0.1999999999999999
vf_coef = 2
vf_clip_coef = 0.2
vf_coef = 2.0
vtrace_c_clip = 1
vtrace_rho_clip = 1
checkpoint_interval = 100
use_transformer = True
context_window = 32
context_length = 32
# Rendering options
render = True
render_interval = 100
Expand All @@ -158,7 +155,7 @@ zoom_in = True
render_map = none

[eval]
eval_interval = 1000
eval_interval = 100
; Path to dataset used for evaluation
map_dir = "resources/drive/binaries/training"
; Evaluation will run on the first num_maps maps in the map_dir directory
Expand All @@ -183,9 +180,17 @@ wosac_sanity_check = False
; Only return aggregate results across all scenes
wosac_aggregate_results = True
; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
human_replay_eval = False
; Control only the self-driving car
human_replay_control_mode = "control_sdc_only"
human_replay_eval = True
; Control mode for human replay (control_vehicles with max_controlled_agents=1 controls one agent)
human_replay_control_mode = "control_vehicles"
; Number of agents in human replay evaluation environment
human_replay_num_agents = 32
; Number of rollouts for human replay evaluation
human_replay_num_rollouts = 100
; Number of maps to use for human replay evaluation
human_replay_num_maps = 100
; Number of maps to render for human replay (subset of eval maps)
human_replay_render_num_maps = 2

[sweep.train.learning_rate]
distribution = log_normal
Expand Down
Loading
Loading