Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
698eb4d
WIP guidance reward
nadarenator Nov 7, 2025
d266d2f
remove irrelevant const
nadarenator Nov 14, 2025
10cb5d0
heading fix
nadarenator Nov 14, 2025
19a5260
minor refactor
nadarenator Nov 14, 2025
41f2bdc
Merge branch 'gsp_dev' into kj/guidance_reward
nadarenator Nov 14, 2025
edc94f7
fix normalize function defn
nadarenator Nov 14, 2025
16e5cc0
removed master weight
nadarenator Nov 14, 2025
4bf7376
guidance reward sweeps
nadarenator Nov 14, 2025
5b1dbda
Merge branch 'gsp_dev' into kj/guidance_reward
nadarenator Nov 18, 2025
d0c4c07
Merge branch 'main' into kj/guidance_reward
nadarenator Dec 19, 2025
33da028
minor fixes
nadarenator Dec 19, 2025
a1a8f67
Merge branch 'main' into kj/guidance_reward
nadarenator Dec 27, 2025
298f848
merge fix
nadarenator Dec 27, 2025
1d2add0
precommit
nadarenator Dec 27, 2025
a6f81ec
Merge branch 'gsp_dev' into kj/guidance_reward
nadarenator Jan 22, 2026
3fe6cd7
fix merge conflict artifact
nadarenator Jan 22, 2026
17e2971
make log likelihood stuff optional
nadarenator Jan 22, 2026
65a3c4c
Merge branch 'gsp_dev' into kj/guidance_reward
nadarenator Jan 24, 2026
989a51c
add rl eval to pipeline
nadarenator Jan 24, 2026
392b55c
config parsing fixes and pipeline compatibility
nadarenator Jan 28, 2026
e2a6d5d
dropout still has memory leaks?
nadarenator Feb 4, 2026
c082807
unsure if this is needed
nadarenator Feb 4, 2026
ffc0def
type-awareness and conditional off-road
nadarenator Feb 9, 2026
1233341
Merge branch 'gsp_dev' into kj/guidance_reward
nadarenator Feb 9, 2026
cbed67d
pre-commit
nadarenator Feb 9, 2026
553a909
minor cleanup
nadarenator Feb 9, 2026
bcc0771
pre-commit
nadarenator Feb 9, 2026
787482c
rendering and snapshot
nadarenator Feb 9, 2026
4169872
trying to remove trajectory artifacts
nadarenator Feb 10, 2026
36e42d8
precommit
nadarenator Feb 10, 2026
5a3a354
Merge remote-tracking branch 'origin/gsp_dev' into kj/guidance_reward
daphne-cornelisse Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 139 additions & 56 deletions examples/eval_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import numpy as np
import pandas as pd
import torch
Expand All @@ -11,6 +12,35 @@

POLICY_DIR = "models"

# Registry of evaluable policies. Each entry maps a display name to a spec
# with three fields:
#   path     -- checkpoint file under POLICY_DIR
#   dynamics -- vehicle dynamics model the policy was trained with
#   type     -- "bc" (behavior cloning) or "rl" (reinforcement learning)
POLICY_CONFIGS = {
    "bc_policy": dict(
        path=POLICY_DIR + "/bc_policy.pt",
        dynamics="classic",
        type="bc",
    ),
    # Disabled entries below are kept for reference; uncomment to evaluate.
    # "self_play_rl (classic)": dict(
    #     path=POLICY_DIR + "/self_play_rl_simple_policy.pt",
    #     dynamics="classic",
    #     type="rl",
    # ),
    # "guided_self_play_rl (classic)": dict(
    #     path=POLICY_DIR + "/guided_self_play_classic_policy.pt",
    #     dynamics="classic",
    #     type="rl",
    # ),
    # "self_play_rl (jerk)": dict(
    #     path=POLICY_DIR + "/self_play_jerk_policy.pt",
    #     dynamics="jerk",
    #     type="rl",
    # ),
    # "guided_self_play_rl (jerk)": dict(
    #     path=POLICY_DIR + "/guided_self_play_jerk_policy.pt",
    #     dynamics="jerk",
    #     type="rl",
    # ),
}

COLUMN_ORDER = [
"policy",
"realism_meta_score",
Expand Down Expand Up @@ -222,13 +252,31 @@ def evaluate_bc_policy(config, vecenv, evaluator, policy_path):


def evaluate_rl_policy(config, vecenv, evaluator, policy_path):
"""Evaluate an RL policy using WOSAC metrics."""

# Use a copy to avoid mutating the original config
config = copy.deepcopy(config)

# Ensure evaluator is in RL mode
evaluator.mode = "rl"

# Enable RNN state initialization for LSTM-based policies
config["train"]["use_rnn"] = True
evaluator.mode = "rl_policy"

config["load_model_path"] = policy_path

# Load policy
policy = load_policy(config, vecenv, "puffer_drive")
policy.eval()

gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv)
simulated_trajectories = evaluator.collect_simulated_trajectories(config, vecenv, policy)

# Roll out trained policy in the simulator
simulated_trajectories = evaluator.collect_simulated_trajectories(
args=config,
puffer_env=vecenv,
policy=policy,
)

# Compute metrics
agent_state = vecenv.driver_env.get_global_agent_state()
Expand All @@ -242,18 +290,16 @@ def evaluate_rl_policy(config, vecenv, evaluator, policy_path):
return results


def pipeline(env_name="puffer_drive"):
"""Obtain WOSAC scores for various baselines and policies."""

def create_config_and_env(env_name, dynamics_model="classic"):
"""Create a config and vecenv for a specific dynamics model."""
config = load_config(env_name)

# Dataset configuration
config["env"]["map_dir"] = "pufferlib/resources/drive/binaries/training"
# Common WOSAC evaluation settings
config["env"]["num_maps"] = 100
config["env"]["map_dir"] = "pufferlib/resources/drive/binaries/validation"
config["eval"]["wosac_target_scenarios"] = 1000
config["eval"]["wosac_batch_size"] = 100
config["eval"]["wosac_scenario_pool_size"] = 10_0000

# WOSAC settings
config["eval"]["wosac_scenario_pool_size"] = 10_000
config["wosac"]["enabled"] = True
config["vec"]["backend"] = "PufferEnv"
config["vec"]["num_envs"] = 1
Expand All @@ -264,63 +310,100 @@ def pipeline(env_name="puffer_drive"):
config["env"]["goal_radius"] = 1.0
config["env"]["save_data_to_disk"] = False

# Make env
vecenv = load_env(env_name, config)
# Set dynamics model
config["env"]["dynamics_model"] = dynamics_model

# Disable human data preparation for jerk dynamics (not implemented)
if dynamics_model == "jerk":
config["env"]["prep_human_data"] = False

# Make evaluator
vecenv = load_env(env_name, config)
evaluator = WOSACEvaluator(config)

# Baseline: Ground truth
evaluator.eval_mode = "ground_truth"
df_results_gt = evaluator.evaluate(config, vecenv, policy=None)
df_results_gt["policy"] = "ground_truth"
return config, vecenv, evaluator

# Baseline: Agent with inferred human actions (using classic bicycle dynamics model)
# df_results_inferred_human = evaluate_human_inferred_actions(config, vecenv, evaluator)
# df_results_inferred_human["policy"] = "inferred_human_actions"

# Baseline: Imitation learning policy
# df_results_bc = evaluate_bc_policy(config, vecenv, evaluator, POLICY_DIR + "/bc_policy.pt")
# df_results_bc["policy"] = "bc_policy"

# Baseline: Self-play RL policy
# run: https://wandb.ai/emerge_/gsp/runs/qld2z6tn?nw=nwuserdaphnecor
# df_results_self_play = evaluate_rl_policy(
# config, vecenv, evaluator, "pufferlib/resources/drive/pufferdrive_weights.pt"
# ) # POLICY_DIR + "/puffer_drive_sp_qld2z6tn.pt")
# df_results_self_play["policy"] = "self_play_rl_base"

# TODO: Guided self-play policy (guidance in rewards)
# ...

# TODO: Guided self-play policy (regularization)
# ...

# Baseline: Random policy
# df_results_random = evaluate_random_policy(config, vecenv, evaluator)
# df_results_random["policy"] = "random"

# Combine
df = pd.concat(
[
df_results_gt,
# df_results_inferred_human,
# df_results_random,
# df_results_bc,
# df_results_self_play,
],
ignore_index=True,
)

def pipeline(env_name="puffer_drive"):
"""Obtain WOSAC scores for various baselines and policies across dynamics models."""

all_results = []

config_classic, vecenv_classic, evaluator_classic = create_config_and_env(env_name, "classic")

# Ground truth (dynamics-agnostic, only need to run once)
print("Evaluating: ground_truth")
evaluator_classic.eval_mode = "ground_truth"
df_results_gt = evaluator_classic.evaluate(config_classic, vecenv_classic, policy=None)
df_results_gt["policy"] = "ground_truth"
all_results.append(df_results_gt)

# Inferred human actions (classic only - not implemented for jerk)
print("Evaluating: inferred_human_actions (classic)")
df_results_inferred_human = evaluate_human_inferred_actions(config_classic, vecenv_classic, evaluator_classic)
df_results_inferred_human["policy"] = "inferred_human (classic)"
all_results.append(df_results_inferred_human)

# --- Classic dynamics evaluations ---
print("=" * 60)
print("Running evaluations with CLASSIC dynamics model...")
print("=" * 60)

# Random baseline for classic
print("Evaluating: random (classic)")
df_results_random_classic = evaluate_random_policy(config_classic, vecenv_classic, evaluator_classic)
df_results_random_classic["policy"] = "random (classic)"
all_results.append(df_results_random_classic)

# Evaluate classic dynamics policies
evaluator_classic.eval_mode = "policy"
for policy_name, policy_cfg in POLICY_CONFIGS.items():
if policy_cfg["dynamics"] != "classic":
continue
print(f"Evaluating: {policy_name}")
if policy_cfg["type"] == "bc":
df_result = evaluate_bc_policy(config_classic, vecenv_classic, evaluator_classic, policy_cfg["path"])
else:
df_result = evaluate_rl_policy(config_classic, vecenv_classic, evaluator_classic, policy_cfg["path"])
df_result["policy"] = policy_name
all_results.append(df_result)

# --- Jerk dynamics evaluations ---
# Check if any jerk policies are configured
jerk_policies = {k: v for k, v in POLICY_CONFIGS.items() if v["dynamics"] == "jerk"}

if jerk_policies:
print("=" * 60)
print("Running evaluations with JERK dynamics model...")
print("=" * 60)

config_jerk, vecenv_jerk, evaluator_jerk = create_config_and_env(env_name, "jerk")

# Random baseline for jerk (different action space, so separate baseline)
print("Evaluating: random (jerk)")
df_results_random_jerk = evaluate_random_policy(config_jerk, vecenv_jerk, evaluator_jerk)
df_results_random_jerk["policy"] = "random (jerk)"
all_results.append(df_results_random_jerk)

# Evaluate jerk dynamics policies
for policy_name, policy_cfg in jerk_policies.items():
print(f"Evaluating: {policy_name}")
df_result = evaluate_rl_policy(config_jerk, vecenv_jerk, evaluator_jerk, policy_cfg["path"])
df_result["policy"] = policy_name
all_results.append(df_result)

# Combine all results
df = pd.concat(all_results, ignore_index=True)
df = df[COLUMN_ORDER]

# Visualize
plot_wosac_results(df)
plot_realism_score_distributions(df)

print(f"total agents: {df_results_gt['num_agents_per_scene'].sum().item()}")

print(df.groupby("policy")["realism_meta_score"].mean())
# Print summary
print("\n" + "=" * 60)
print("RESULTS SUMMARY")
print("=" * 60)
print(df.groupby("policy")["realism_meta_score"].mean().sort_values(ascending=False))
print("---")
print(df.groupby("policy")["kinematic_metrics"].mean())
print("---")
Expand Down
2 changes: 1 addition & 1 deletion greene/a100_baseline_sweep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,4 @@ puffer sweep puffer_drive \

# Print completion info
echo "Sweep completed"
date
date
6 changes: 3 additions & 3 deletions greene/h100_gsp_rew_sweep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ source .venv/bin/activate
NUM_WORKERS=48
NUM_ENVS=48
VEC_BATCH_SIZE=6
NUM_AGENTS=1024
BPTT_HORIZON=32
NUM_AGENTS=1024
BPTT_HORIZON=32
HUMAN_REG_COEF=0.0
USE_GUIDANCE_REWARDS=0

Expand Down Expand Up @@ -79,4 +79,4 @@ puffer sweep puffer_drive \

# Print completion info
echo "Sweep completed"
date
date
6 changes: 3 additions & 3 deletions greene/rtx8000_sweep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ fi
NUM_WORKERS=32
NUM_ENVS=32
VEC_BATCH_SIZE=4
NUM_AGENTS=1024
BPTT_HORIZON=32
NUM_AGENTS=1024
BPTT_HORIZON=32
HUMAN_REG_COEF=0.0
USE_GUIDANCE_REWARDS=1

Expand Down Expand Up @@ -91,4 +91,4 @@ puffer sweep puffer_drive \

# Print completion info
echo "Sweep completed"
date
date
Binary file added models/guided_self_play_jerk_policy.pt
Binary file not shown.
Loading