From 53f474f86c215bf45cf47dea5e32b4dbdffc4c27 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Thu, 20 Nov 2025 10:19:48 -0500 Subject: [PATCH 01/13] fixing population with new config --- pufferlib/config/ocean/adaptive.ini | 16 +++++---- pufferlib/ocean/drive/drive.py | 56 ++++++++++------------------- pufferlib/vector.py | 34 ++++++++++-------- 3 files changed, 49 insertions(+), 57 deletions(-) diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 616da426f4..51b71a0013 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -51,8 +51,11 @@ init_steps = 0 control_mode = "control_vehicles" ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" +; train with co players +co_player_enabled = False -[policy.conditioning] + +[env.policy.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" type = "none" collision_weight_lb = -1.0 @@ -66,7 +69,7 @@ entropy_weight_ub = 0.001 discount_weight_lb = 0.98 discount_weight_ub = 0.80 -[co_player_policy] +[env.co_player_policy] enabled = True num_ego_agents = 512 policy_name = Drive @@ -75,7 +78,11 @@ policy_path = "resources/drive/policies/varied_discount.pt" input_size = 64 hidden_size = 256 -[co_player_policy.conditioning] +[env.co_player_policy.rnn] +input_size = 256 +hidden_size = 256 + +[env.co_player_policy.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" type = "all" collision_weight_lb = -1.0 @@ -89,9 +96,6 @@ entropy_weight_ub = 0.001 discount_weight_lb = 0.98 discount_weight_ub = 0.80 -[co_player_rnn] -input_size = 256 -hidden_size = 256 [train] total_timesteps = 2_000_000_000 diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 856c577774..720428ccd2 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -41,6 +41,7 @@ def __init__( k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", + policy= None, # Main policy conditioning (from [policy.conditioning]) policy_cond_type="none", policy_cond_collision_lb=-0.5, @@ -56,28 +57,8 @@ def __init__( # Co-player policy settings (from [co_player_policy]) co_player_enabled=False, co_player_num_ego=512, - co_player_policy_name=None, - co_player_rnn_name=None, - co_player_policy_path=None, - co_player_input_size=64, - co_player_hidden_size=256, co_player_policy=None, # Co-player RNN settings (from [co_player_rnn]) - co_player_rnn_input_size=256, - co_player_rnn_hidden_size=256, - co_player_rnn=None, - # Co-player conditioning (from [co_player_policy.conditioning]) - co_player_cond_type="none", - co_player_cond_collision_lb=-0.5, - co_player_cond_collision_ub=-0.5, - co_player_cond_offroad_lb=-0.2, - co_player_cond_offroad_ub=-0.2, - co_player_cond_goal_lb=1.0, - co_player_cond_goal_ub=1.0, - co_player_cond_entropy_lb=0.001, - co_player_cond_entropy_ub=0.001, - co_player_cond_discount_lb=0.98, - co_player_cond_discount_ub=0.98, ): # env self.dt = dt @@ -148,22 +129,24 @@ def __init__( self.num_ego_agents = co_player_num_ego if self.population_play else num_agents # Co-player conditioning setup - self.co_player_condition_type = co_player_cond_type + self.co_player_conditioning = co_player_policy.get("conditioning") + self.co_player_condition_type = self.co_player_conditioning.get("type") + self.co_player_reward_conditioned = self.co_player_condition_type in ("reward", "all") self.co_player_entropy_conditioned = self.co_player_condition_type in ("entropy", "all") self.co_player_discount_conditioned = self.co_player_condition_type in ("discount", "all") - # Co-player conditioning weights - self.co_player_collision_weight_lb = co_player_cond_collision_lb - self.co_player_collision_weight_ub = co_player_cond_collision_ub - self.co_player_offroad_weight_lb = co_player_cond_offroad_lb - self.co_player_offroad_weight_ub = co_player_cond_offroad_ub - self.co_player_goal_weight_lb = co_player_cond_goal_lb - self.co_player_goal_weight_ub = co_player_cond_goal_ub - self.co_player_entropy_weight_lb = co_player_cond_entropy_lb - self.co_player_entropy_weight_ub = co_player_cond_entropy_ub - self.co_player_discount_weight_lb = co_player_cond_discount_lb - self.co_player_discount_weight_ub = co_player_cond_discount_ub + + self.co_player_collision_weight_lb = self.co_player_conditioning.get("collision_weight_lb", -0.5) + self.co_player_collision_weight_ub = self.co_player_conditioning.get("collision_weight_ub", -0.5) + self.co_player_offroad_weight_lb = self.co_player_conditioning.get("offroad_weight_lb", -0.2) + self.co_player_offroad_weight_ub = self.co_player_conditioning.get("offroad_weight_ub", -0.2) + self.co_player_goal_weight_lb = self.co_player_conditioning.get("goal_weight_lb", 1.0) + self.co_player_goal_weight_ub = self.co_player_conditioning.get("goal_weight_ub", 1.0) + self.co_player_entropy_weight_lb = self.co_player_conditioning.get("entropy_weight_lb", 0.001) + self.co_player_entropy_weight_ub = self.co_player_conditioning.get("entropy_weight_ub", 0.001) + self.co_player_discount_weight_lb = self.co_player_conditioning.get("discount_weight_lb", 0.98) + self.co_player_discount_weight_ub = self.co_player_conditioning.get("discount_weight_ub", 0.98) self.init_steps = init_steps self.init_mode_str = init_mode self.control_mode_str = control_mode @@ -230,9 +213,9 @@ def __init__( self._set_env_variables() if self.population_play: - self.co_player_policy_name = co_player_policy_name - self.co_player_rnn_name = co_player_rnn_name - self.co_player_policy = co_player_policy + self.co_player_policy_name = co_player_policy.get("policy_name") + self.co_player_rnn_name = co_player_policy.get("rnn_name") + self.co_player_policy = co_player_policy.get("co_player_policy_func") self._set_co_player_state() super().__init__(buf=buf) @@ -335,7 +318,6 @@ def _set_env_variables(self): self.num_ego_agents = len(self.ego_ids) self.num_co_players = len(self.co_player_ids) - print(self.num_co_players, self.num_ego_agents) if ego_set & co_player_set: raise ValueError("Overlap between ego ids and co player ids") @@ -402,7 +384,7 @@ def get_co_player_actions(self): co_player_action = co_player_action.cpu().numpy().reshape(self.co_player_actions.shape) return co_player_action - def _set_co_player_state(self): ## set in init (state doesnt get updated anywhere else) + def _set_co_player_state(self): with torch.no_grad(): self.state = dict( lstm_h=torch.zeros(self.num_co_players, self.co_player_policy.hidden_size), diff --git a/pufferlib/vector.py b/pufferlib/vector.py index adfca605ec..d1456faf4d 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -831,7 +831,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer # TODO: First step action space check env_k = env_kwargs[0] - if env_k.get("population_play", False): + if env_k.get("co_player_enabled", False): import torch import os from types import SimpleNamespace @@ -851,12 +851,16 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer input_size = co_player_policy.get("input_size", 256) hidden_size = co_player_policy.get("hidden_size", 256) + co_player_rnn = co_player_policy.get("rnn",None) + + # Get conditioning type from env_k - condition_type = env_k.get("co_player_condition_type", "none") + co_player_conditioning = co_player_policy.get("conditioning") + condition_type = co_player_conditioning.get("type", "none") reward_conditioned = condition_type in ("reward", "all") entropy_conditioned = condition_type in ("entropy", "all") discount_conditioned = condition_type in ("discount", "all") - print(f"DEBUG: condition type: {condition_type}", flush=True) + print(f"DEBUG: Co player condition type: {condition_type}", flush=True) # Calculate conditioning dimensions conditioning_dims = ( (3 if reward_conditioned else 0) + (1 if entropy_conditioned else 0) + (1 if discount_conditioned else 0) @@ -866,7 +870,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer num_obs = ego_features + conditioning_dims + 63 * 7 + 200 * 7 temp_env = SimpleNamespace( - single_action_space=gymnasium.spaces.MultiDiscrete([7, 13]), + single_action_space= gymnasium.spaces.MultiDiscrete([7 * 13]), single_observation_space=gymnasium.spaces.Box(low=-1, high=1, shape=(num_obs,), dtype=np.float32), reward_conditioned=reward_conditioned, entropy_conditioned=entropy_conditioned, @@ -876,14 +880,15 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer base_policy = Drive(temp_env, input_size=input_size, hidden_size=hidden_size) - policy = pufferlib.models.LSTMWrapper( - temp_env, - base_policy, - input_size=hidden_size, - hidden_size=hidden_size, - ) + if co_player_rnn: + policy = pufferlib.models.LSTMWrapper( + temp_env, + base_policy, + input_size=co_player_rnn.get("input_size"), + hidden_size=co_player_rnn.get("hidden_size"), + ) - checkpoint_path = env_k["co_player_policy_path"] + checkpoint_path = co_player_policy.get("policy_path") if not os.path.exists(checkpoint_path): raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}") @@ -898,8 +903,9 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer ) # Store policy and conditioning info in env_k - env_k["co_player_policy"] = policy - env_k["co_player_condition_type"] = condition_type + env_k["co_player_policy"]["co_player_policy_func"] = policy + + torch.set_num_threads( 1 ) # NOTE this is the only way I could get co-player policies to work inside environment evaluation @@ -917,7 +923,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer pass for i in range(len(env_kwargs)): - env_kwargs[i]["co_player_policy"] = policy + env_kwargs[i]["co_player_policy"]["co_player_policy_func"] = policy return backend(env_creators, env_args, env_kwargs, num_envs, **kwargs) From 4d65bad6dd62b9d448f243c1356d031736d21bf4 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Thu, 20 Nov 2025 10:21:28 -0500 Subject: [PATCH 02/13] running pre-commit --- pufferlib/ocean/drive/drive.py | 6 ++---- pufferlib/vector.py | 8 +++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 720428ccd2..93a7ff2df1 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -41,7 +41,7 @@ def __init__( k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", - policy= None, + policy=None, # Main policy conditioning (from [policy.conditioning]) policy_cond_type="none", policy_cond_collision_lb=-0.5, @@ -136,7 +136,6 @@ def __init__( self.co_player_entropy_conditioned = self.co_player_condition_type in ("entropy", "all") self.co_player_discount_conditioned = self.co_player_condition_type in ("discount", "all") - self.co_player_collision_weight_lb = self.co_player_conditioning.get("collision_weight_lb", -0.5) self.co_player_collision_weight_ub = self.co_player_conditioning.get("collision_weight_ub", -0.5) self.co_player_offroad_weight_lb = self.co_player_conditioning.get("offroad_weight_lb", -0.2) @@ -318,7 +317,6 @@ def _set_env_variables(self): self.num_ego_agents = len(self.ego_ids) self.num_co_players = len(self.co_player_ids) - if ego_set & co_player_set: raise ValueError("Overlap between ego ids and co player ids") @@ -384,7 +382,7 @@ def get_co_player_actions(self): co_player_action = co_player_action.cpu().numpy().reshape(self.co_player_actions.shape) return co_player_action - def _set_co_player_state(self): + def _set_co_player_state(self): with torch.no_grad(): self.state = dict( lstm_h=torch.zeros(self.num_co_players, self.co_player_policy.hidden_size), diff --git a/pufferlib/vector.py b/pufferlib/vector.py index d1456faf4d..fae61ab5d9 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -851,8 +851,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer input_size = co_player_policy.get("input_size", 256) hidden_size = co_player_policy.get("hidden_size", 256) - co_player_rnn = co_player_policy.get("rnn",None) - + co_player_rnn = co_player_policy.get("rnn", None) # Get conditioning type from env_k co_player_conditioning = co_player_policy.get("conditioning") @@ -870,7 +869,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer num_obs = ego_features + conditioning_dims + 63 * 7 + 200 * 7 temp_env = SimpleNamespace( - single_action_space= gymnasium.spaces.MultiDiscrete([7 * 13]), + single_action_space=gymnasium.spaces.MultiDiscrete([7 * 13]), single_observation_space=gymnasium.spaces.Box(low=-1, high=1, shape=(num_obs,), dtype=np.float32), reward_conditioned=reward_conditioned, entropy_conditioned=entropy_conditioned, @@ -905,7 +904,6 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer # Store policy and conditioning info in env_k env_k["co_player_policy"]["co_player_policy_func"] = policy - torch.set_num_threads( 1 ) # NOTE this is the only way I could get co-player policies to work inside environment evaluation @@ -923,7 +921,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer pass for i in range(len(env_kwargs)): - env_kwargs[i]["co_player_policy"]["co_player_policy_func"] = policy + env_kwargs[i]["co_player_policy"]["co_player_policy_func"] = policy return backend(env_creators, env_args, env_kwargs, num_envs, **kwargs) From d457778f721e44a15da119f86067947fd55c9106 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Thu, 20 Nov 2025 10:27:28 -0500 Subject: [PATCH 03/13] running pre-commit --- pufferlib/vector.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pufferlib/vector.py b/pufferlib/vector.py index fae61ab5d9..4c3c12dc77 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -850,7 +850,6 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer input_size = co_player_policy.get("input_size", 256) hidden_size = co_player_policy.get("hidden_size", 256) - co_player_rnn = co_player_policy.get("rnn", None) # Get conditioning type from env_k @@ -859,12 +858,10 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer reward_conditioned = condition_type in ("reward", "all") entropy_conditioned = condition_type in ("entropy", "all") discount_conditioned = condition_type in ("discount", "all") - print(f"DEBUG: Co player condition type: {condition_type}", flush=True) # Calculate conditioning dimensions conditioning_dims = ( (3 if reward_conditioned else 0) + (1 if entropy_conditioned else 0) + (1 if discount_conditioned else 0) ) - print(f"DEBUG: condition dims {conditioning_dims}", flush=True) # Base observations + conditioning observations num_obs = ego_features + conditioning_dims + 63 * 7 + 200 * 7 @@ -900,7 +897,6 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer f"Co player policy loaded with {conditioning_dims} conditioning dims (condition_type={condition_type})", flush=True, ) - # Store policy and conditioning info in env_k env_k["co_player_policy"]["co_player_policy_func"] = policy From b73327f6917c2c11c3ce2e4ff6fcab7fe5fb97e9 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Thu, 20 Nov 2025 15:23:17 -0500 Subject: [PATCH 04/13] Making it a little prettier --- pufferlib/config/ocean/adaptive.ini | 4 +-- pufferlib/config/ocean/drive.ini | 2 +- pufferlib/ocean/drive/drive.py | 56 +++++++++++++++++------------ pufferlib/vector.py | 2 ++ 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 51b71a0013..67ff3edb3c 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -55,7 +55,7 @@ init_mode = "create_all_valid" co_player_enabled = False -[env.policy.conditioning] +[env.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" type = "none" collision_weight_lb = -1.0 @@ -105,7 +105,7 @@ anneal_lr = True ; Needs to be: num_agents * num_workers * BPTT horizon batch_size = auto minibatch_size = 372736 -minibatch_multiplier = 512 +minibatch_multiplier = 256 max_minibatch_size = 372736 ; BPTT horizon (overridden by pufferl.py for adaptive agents to k_scenarios * scenario_length) bptt_horizon = 32 diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 0365251243..45d6061af7 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -49,7 +49,7 @@ control_mode = "control_vehicles" ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" -[policy.conditioning] +[env.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" type = "none" collision_weight_lb = -1.0 diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 93a7ff2df1..58ee4d0b72 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -41,19 +41,8 @@ def __init__( k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", - policy=None, + conditioning=None, ## ego conditioning # Main policy conditioning (from [policy.conditioning]) - policy_cond_type="none", - policy_cond_collision_lb=-0.5, - policy_cond_collision_ub=-0.5, - policy_cond_offroad_lb=-0.2, - policy_cond_offroad_ub=-0.2, - policy_cond_goal_lb=1.0, - policy_cond_goal_ub=1.0, - policy_cond_entropy_lb=0.001, - policy_cond_entropy_ub=0.001, - policy_cond_discount_lb=0.98, - policy_cond_discount_ub=0.98, # Co-player policy settings (from [co_player_policy]) co_player_enabled=False, co_player_num_ego=512, @@ -87,21 +76,40 @@ def __init__( self.current_scenario_infos = [] # Accumulate infos for current scenario # Main policy conditioning setup - self.condition_type = policy_cond_type + + self.conditioning = conditioning + + self.condition_type = self.conditioning.get("type", "none") self.reward_conditioned = self.condition_type in ("reward", "all") self.entropy_conditioned = self.condition_type in ("entropy", "all") self.discount_conditioned = self.condition_type in ("discount", "all") - self.collision_weight_lb = policy_cond_collision_lb if self.reward_conditioned else reward_vehicle_collision - self.collision_weight_ub = policy_cond_collision_ub if self.reward_conditioned else reward_vehicle_collision - self.offroad_weight_lb = policy_cond_offroad_lb if self.reward_conditioned else reward_offroad_collision - self.offroad_weight_ub = policy_cond_offroad_ub if self.reward_conditioned else reward_offroad_collision - self.goal_weight_lb = policy_cond_goal_lb if self.reward_conditioned else 1.0 - self.goal_weight_ub = policy_cond_goal_ub if self.reward_conditioned else 1.0 - self.entropy_weight_lb = policy_cond_entropy_lb - self.entropy_weight_ub = policy_cond_entropy_ub - self.discount_weight_lb = policy_cond_discount_lb - self.discount_weight_ub = policy_cond_discount_ub + self.collision_weight_lb = ( + self.conditioning.get("collision_weight_lb", reward_vehicle_collision) + if self.reward_conditioned + else reward_vehicle_collision + ) + self.collision_weight_ub = ( + self.conditioning.get("collision_weight_ub", reward_vehicle_collision) + if self.reward_conditioned + else reward_vehicle_collision + ) + self.offroad_weight_lb = ( + self.conditioning.get("offroad_weight_lb", reward_offroad_collision) + if self.reward_conditioned + else reward_offroad_collision + ) + self.offroad_weight_ub = ( + self.conditioning.get("offroad_weight_ub", reward_offroad_collision) + if self.reward_conditioned + else reward_offroad_collision + ) + self.goal_weight_lb = self.conditioning.get("goal_weight_lb", 1.0) if self.reward_conditioned else 1.0 + self.goal_weight_ub = self.conditioning.get("goal_weight_ub", 1.0) if self.reward_conditioned else 1.0 + self.entropy_weight_lb = self.conditioning.get("entropy_weight_lb", 0.001) + self.entropy_weight_ub = self.conditioning.get("entropy_weight_ub", 0.001) + self.discount_weight_lb = self.conditioning.get("discount_weight_lb", 0.98) + self.discount_weight_ub = self.conditioning.get("discount_weight_ub", 0.98) conditioning_dims = ( (3 if self.reward_conditioned else 0) @@ -206,6 +214,8 @@ def __init__( raise ValueError( f"num ego agents ({self.num_ego_agents}) exceeds the number of total agents ({num_agents}))" ) + if self.condition_type != "none" and self.co_player_condition_type != "none": + raise NotImplementedError("Only one agent can be conditioned at once") self.max_controlled_agents = int(max_controlled_agents) diff --git a/pufferlib/vector.py b/pufferlib/vector.py index 4c3c12dc77..24ac492405 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -883,6 +883,8 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer input_size=co_player_rnn.get("input_size"), hidden_size=co_player_rnn.get("hidden_size"), ) + else: + policy = base_policy checkpoint_path = co_player_policy.get("policy_path") From cbd5da995a6da7e864e0373a315096989d0de448 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Thu, 20 Nov 2025 15:35:02 -0500 Subject: [PATCH 05/13] fixing the tests --- pufferlib/ocean/drive/drive.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 58ee4d0b72..b8330a0949 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -41,7 +41,7 @@ def __init__( k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", - conditioning=None, ## ego conditioning + conditioning={},# ego conditioning # Main policy conditioning (from [policy.conditioning]) # Co-player policy settings (from [co_player_policy]) co_player_enabled=False, @@ -76,34 +76,18 @@ def __init__( self.current_scenario_infos = [] # Accumulate infos for current scenario # Main policy conditioning setup - self.conditioning = conditioning - + self.condition_type = self.conditioning.get("type", "none") self.reward_conditioned = self.condition_type in ("reward", "all") self.entropy_conditioned = self.condition_type in ("entropy", "all") self.discount_conditioned = self.condition_type in ("discount", "all") - self.collision_weight_lb = ( - self.conditioning.get("collision_weight_lb", reward_vehicle_collision) - if self.reward_conditioned - else reward_vehicle_collision - ) - self.collision_weight_ub = ( - self.conditioning.get("collision_weight_ub", reward_vehicle_collision) - if self.reward_conditioned - else reward_vehicle_collision - ) - self.offroad_weight_lb = ( - self.conditioning.get("offroad_weight_lb", reward_offroad_collision) - if self.reward_conditioned - else reward_offroad_collision - ) - self.offroad_weight_ub = ( - self.conditioning.get("offroad_weight_ub", reward_offroad_collision) - if self.reward_conditioned - else reward_offroad_collision - ) + + self.collision_weight_lb = self.conditioning.get("collision_weight_lb", reward_vehicle_collision) if self.reward_conditioned else reward_vehicle_collision + self.collision_weight_ub = self.conditioning.get("collision_weight_ub", reward_vehicle_collision) if self.reward_conditioned else reward_vehicle_collision + self.offroad_weight_lb = self.conditioning.get("offroad_weight_lb", reward_offroad_collision) if self.reward_conditioned else reward_offroad_collision + self.offroad_weight_ub = self.conditioning.get("offroad_weight_ub", reward_offroad_collision) if self.reward_conditioned else reward_offroad_collision self.goal_weight_lb = self.conditioning.get("goal_weight_lb", 1.0) if self.reward_conditioned else 1.0 self.goal_weight_ub = self.conditioning.get("goal_weight_ub", 1.0) if self.reward_conditioned else 1.0 self.entropy_weight_lb = self.conditioning.get("entropy_weight_lb", 0.001) From 8632ab355af547a0f0153c0d00f86b8970e43d1f Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Thu, 20 Nov 2025 15:38:32 -0500 Subject: [PATCH 06/13] Fixing tests --- pufferlib/ocean/drive/drive.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index b8330a0949..195a9e639a 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -41,7 +41,7 @@ def __init__( k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", - conditioning={},# ego conditioning + conditioning={}, # ego conditioning # Main policy conditioning (from [policy.conditioning]) # Co-player policy settings (from [co_player_policy]) co_player_enabled=False, @@ -77,17 +77,32 @@ def __init__( # Main policy conditioning setup self.conditioning = conditioning - + self.condition_type = self.conditioning.get("type", "none") self.reward_conditioned = self.condition_type in ("reward", "all") self.entropy_conditioned = self.condition_type in ("entropy", "all") self.discount_conditioned = self.condition_type in ("discount", "all") - - self.collision_weight_lb = self.conditioning.get("collision_weight_lb", reward_vehicle_collision) if self.reward_conditioned else reward_vehicle_collision - self.collision_weight_ub = self.conditioning.get("collision_weight_ub", reward_vehicle_collision) if self.reward_conditioned else reward_vehicle_collision - self.offroad_weight_lb = self.conditioning.get("offroad_weight_lb", reward_offroad_collision) if self.reward_conditioned else reward_offroad_collision - self.offroad_weight_ub = self.conditioning.get("offroad_weight_ub", reward_offroad_collision) if self.reward_conditioned else reward_offroad_collision + self.collision_weight_lb = ( + self.conditioning.get("collision_weight_lb", reward_vehicle_collision) + if self.reward_conditioned + else reward_vehicle_collision + ) + self.collision_weight_ub = ( + self.conditioning.get("collision_weight_ub", reward_vehicle_collision) + if self.reward_conditioned + else reward_vehicle_collision + ) + self.offroad_weight_lb = ( + self.conditioning.get("offroad_weight_lb", reward_offroad_collision) + if self.reward_conditioned + else reward_offroad_collision + ) + self.offroad_weight_ub = ( + self.conditioning.get("offroad_weight_ub", reward_offroad_collision) + if self.reward_conditioned + else reward_offroad_collision + ) self.goal_weight_lb = self.conditioning.get("goal_weight_lb", 1.0) if self.reward_conditioned else 1.0 self.goal_weight_ub = self.conditioning.get("goal_weight_ub", 1.0) if self.reward_conditioned else 1.0 self.entropy_weight_lb = self.conditioning.get("entropy_weight_lb", 0.001) From 9f56d0ee47aba61e9f5a791914cf7a92999694e2 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Fri, 21 Nov 2025 05:22:22 -0500 Subject: [PATCH 07/13] fixing tests (again) --- pufferlib/ocean/drive/drive.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 195a9e639a..d1e0b1a3ce 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -42,12 +42,9 @@ def __init__( adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", conditioning={}, # ego conditioning - # Main policy conditioning (from [policy.conditioning]) - # Co-player policy settings (from [co_player_policy]) co_player_enabled=False, co_player_num_ego=512, - co_player_policy=None, - # Co-player RNN settings (from [co_player_rnn]) + co_player_policy={}, ): # env self.dt = dt From d00a10572137176ab3eb9fb2393fc5cb1e3b85b3 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Fri, 21 Nov 2025 06:53:22 -0500 Subject: [PATCH 08/13] lets try one more time --- pufferlib/ocean/drive/drive.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index d1e0b1a3ce..a3621fa27c 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -134,22 +134,24 @@ def __init__( # Co-player conditioning setup self.co_player_conditioning = co_player_policy.get("conditioning") - self.co_player_condition_type = self.co_player_conditioning.get("type") - - self.co_player_reward_conditioned = self.co_player_condition_type in ("reward", "all") - self.co_player_entropy_conditioned = self.co_player_condition_type in ("entropy", "all") - self.co_player_discount_conditioned = self.co_player_condition_type in ("discount", "all") - - self.co_player_collision_weight_lb = self.co_player_conditioning.get("collision_weight_lb", -0.5) - self.co_player_collision_weight_ub = self.co_player_conditioning.get("collision_weight_ub", -0.5) - self.co_player_offroad_weight_lb = self.co_player_conditioning.get("offroad_weight_lb", -0.2) - self.co_player_offroad_weight_ub = self.co_player_conditioning.get("offroad_weight_ub", -0.2) - self.co_player_goal_weight_lb = self.co_player_conditioning.get("goal_weight_lb", 1.0) - self.co_player_goal_weight_ub = self.co_player_conditioning.get("goal_weight_ub", 1.0) - self.co_player_entropy_weight_lb = self.co_player_conditioning.get("entropy_weight_lb", 0.001) - self.co_player_entropy_weight_ub = self.co_player_conditioning.get("entropy_weight_ub", 0.001) - self.co_player_discount_weight_lb = self.co_player_conditioning.get("discount_weight_lb", 0.98) - self.co_player_discount_weight_ub = self.co_player_conditioning.get("discount_weight_ub", 0.98) + if self.co_player_conditioning: + self.co_player_condition_type = self.co_player_conditioning.get("type") + + self.co_player_reward_conditioned = self.co_player_condition_type in ("reward", "all") + self.co_player_entropy_conditioned = self.co_player_condition_type in ("entropy", "all") + self.co_player_discount_conditioned = self.co_player_condition_type in ("discount", "all") + + self.co_player_collision_weight_lb = self.co_player_conditioning.get("collision_weight_lb", -0.5) + self.co_player_collision_weight_ub = self.co_player_conditioning.get("collision_weight_ub", -0.5) + self.co_player_offroad_weight_lb = self.co_player_conditioning.get("offroad_weight_lb", -0.2) + self.co_player_offroad_weight_ub = self.co_player_conditioning.get("offroad_weight_ub", -0.2) + self.co_player_goal_weight_lb = self.co_player_conditioning.get("goal_weight_lb", 1.0) + self.co_player_goal_weight_ub = self.co_player_conditioning.get("goal_weight_ub", 1.0) + self.co_player_entropy_weight_lb = self.co_player_conditioning.get("entropy_weight_lb", 0.001) + self.co_player_entropy_weight_ub = self.co_player_conditioning.get("entropy_weight_ub", 0.001) + self.co_player_discount_weight_lb = self.co_player_conditioning.get("discount_weight_lb", 0.98) + self.co_player_discount_weight_ub = self.co_player_conditioning.get("discount_weight_ub", 0.98) + self.init_steps = init_steps self.init_mode_str = init_mode self.control_mode_str = control_mode From 5f3578de3eb245ad5774e439a6337e85da397c7a Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Fri, 21 Nov 2025 06:54:15 -0500 Subject: [PATCH 09/13] lets try one more time --- pufferlib/ocean/drive/drive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index a3621fa27c..a3b3b0bba1 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -151,7 +151,7 @@ def __init__( self.co_player_entropy_weight_ub = self.co_player_conditioning.get("entropy_weight_ub", 0.001) self.co_player_discount_weight_lb = self.co_player_conditioning.get("discount_weight_lb", 0.98) self.co_player_discount_weight_ub = self.co_player_conditioning.get("discount_weight_ub", 0.98) - + self.init_steps = init_steps self.init_mode_str = init_mode self.control_mode_str = control_mode From 16b82c2064908dbd6907c78d09b1fb531024a32e Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Fri, 21 Nov 2025 12:14:19 -0500 Subject: [PATCH 10/13] beginning implementation of 1 ego per sceneeee --- pufferlib/config/ocean/adaptive.ini | 12 +- pufferlib/models.py | 2 +- pufferlib/ocean/drive/binding.c | 29 ++- pufferlib/ocean/drive/binding.h | 265 +++++++++++++++++++++++++++- pufferlib/ocean/drive/drive.h | 70 +++++++- pufferlib/ocean/drive/drive.py | 172 ++++++++++++------ pufferlib/pufferl.py | 1 - pufferlib/pufferlib.py | 2 +- pufferlib/vector.py | 2 +- 9 files changed, 481 insertions(+), 74 deletions(-) diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 67ff3edb3c..5592fe1cb1 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -49,11 +49,18 @@ num_maps = 1000 init_steps = 0 ; Options: "control_vehicles", "control_agents", "control_tracks_to_predict" control_mode = "control_vehicles" +; +max_controlled_agents = -1 ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" +; +create_expert_overflow = True ; train with co players -co_player_enabled = False - +co_player_enabled = True +; +one_ego_per_scene = False +; +num_ego_agents = 512 [env.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" @@ -71,7 +78,6 @@ discount_weight_ub = 0.80 [env.co_player_policy] enabled = True -num_ego_agents = 512 policy_name = Drive rnn_name = Recurrent policy_path = "resources/drive/policies/varied_discount.pt" diff --git a/pufferlib/models.py b/pufferlib/models.py index 0893a9db47..9e5b47791c 100644 --- a/pufferlib/models.py +++ b/pufferlib/models.py @@ -134,7 +134,7 @@ def forward_eval(self, observations, state): # TODO: Don't break compile if h is not None: - assert h.shape[0] == c.shape[0] == observations.shape[0], "LSTM state must be (h, c)" + assert h.shape[0] == c.shape[0] == observations.shape[0], f"LSTM state must be (h, c), h shape {h.shape[0]}, observations shape: {observations.shape[0]}" lstm_state = (h, c) else: lstm_state = None diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 60c03814b7..fa46a0ecc8 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -167,15 +167,42 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) { env->ego_agent_ids = NULL; env->num_ego_agents = 0; } + + // Handle placeholder agents + env->num_place_holders = unpack(kwargs, "num_place_holders"); + if (env->num_place_holders > 0) { + double* place_holder_ids_d = unpack_float_array(kwargs, "place_holder_ids", &env->num_place_holders); + if (place_holder_ids_d != NULL) { + env->place_holder_ids = (int*)malloc(env->num_place_holders * sizeof(int)); + if (env->place_holder_ids == NULL) { + fprintf(stderr, "Error: Failed to allocate memory for place_holder_ids\n"); + free(place_holder_ids_d); + env->num_place_holders = 0; + } else { + for (int i = 0; i < env->num_place_holders; i++) { + env->place_holder_ids[i] = (int)place_holder_ids_d[i]; + } + free(place_holder_ids_d); + } + } else { + env->place_holder_ids = NULL; + env->num_place_holders = 0; + } + } else { + env->place_holder_ids = NULL; + env->num_place_holders = 0; + } } else { // Non-population play mode - set defaults env->num_ego_agents = 0; env->ego_agent_ids = NULL; + env->num_place_holders = 0; + env->place_holder_ids = NULL; } - env->init_mode = (int)unpack(kwargs, "init_mode"); env->control_mode = (int)unpack(kwargs, "control_mode"); + env->create_expert_overflow = (int)unpack(kwargs, "create_expert_overflow"); env->goal_behavior = (int)unpack(kwargs, "goal_behavior"); env->goal_radius = (float)unpack(kwargs, "goal_radius"); int map_id = unpack(kwargs, "map_id"); diff --git a/pufferlib/ocean/drive/binding.h b/pufferlib/ocean/drive/binding.h index b5ec9ed65d..56eb5d29b6 100644 --- a/pufferlib/ocean/drive/binding.h +++ b/pufferlib/ocean/drive/binding.h @@ -169,7 +169,256 @@ static double* unpack_float_array(PyObject* kwargs, char* key, Py_ssize_t* out_s return array; } -static PyObject* my_shared_population_play(PyObject* self, PyObject* args, PyObject* kwargs) { +static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyObject* kwargs){ + // Extract parameters + int num_agents = unpack(kwargs, "num_agents"); // total agents across all environments + int num_environments = unpack(kwargs, "num_ego_agents"); // number of environments to create + int num_maps = unpack(kwargs, "num_maps"); + int init_mode = unpack(kwargs, "init_mode"); + int population_play = unpack(kwargs, "population_play"); + int control_mode = unpack(kwargs, "control_mode"); + int init_steps = unpack(kwargs, "init_steps"); + int max_controlled_agents = unpack(kwargs, "max_controlled_agents"); + + // Verify that num_agents is divisible by num_environments + if (num_agents % num_environments != 0) { + PyErr_Format(PyExc_ValueError, + "num_agents (%d) must be divisible by num_ego_agents (%d)", + num_agents, num_environments); + return NULL; + } + + int agents_per_env = num_agents / num_environments; + + printf("Creating %d worlds with 1 ego + up to %d co-players per world (max_controlled: %d)\n", + num_environments, agents_per_env - 1, max_controlled_agents); + + // Use current time for randomness + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + srand(ts.tv_nsec); + + // Prepare return lists + PyObject* agent_offsets = PyList_New(num_environments + 1); + PyObject* map_ids = PyList_New(num_environments); + PyObject* ego_agent_ids = PyList_New(num_environments); + PyObject* coplayer_ids = PyList_New(num_environments); + + if (!agent_offsets || !map_ids || !ego_agent_ids || !coplayer_ids) { + Py_XDECREF(agent_offsets); + Py_XDECREF(map_ids); + Py_XDECREF(ego_agent_ids); + Py_XDECREF(coplayer_ids); + return PyErr_NoMemory(); + } + + int total_agent_count = 0; + int env_count = 0; + int max_retries = num_maps * 3; // Prevent infinite loops + int retry_count = 0; + int total_egos_assigned = 0; + int total_coplayers_assigned = 0; + + // Create exactly num_environments worlds + while (env_count < num_environments && retry_count < max_retries) { + char map_file[100]; + int map_id = rand() % num_maps; + Drive* env = calloc(1, sizeof(Drive)); + if (!env) { + Py_DECREF(agent_offsets); + Py_DECREF(map_ids); + Py_DECREF(ego_agent_ids); + Py_DECREF(coplayer_ids); + return PyErr_NoMemory(); + } + + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); + env->entities = load_map_binary(map_file, env); + + env->num_agents = agents_per_env; + env->population_play = population_play; + env->init_mode = init_mode; + env->control_mode = control_mode; + env->init_steps = init_steps; + env->max_controlled_agents = max_controlled_agents; + + set_active_agents(env); + + // Check if this map has at least one active agent (for the ego) + if (env->active_agent_count == 0) { + fprintf(stderr, + "[one_ego_per_scene] WARNING: Map %d has no active agents. Skipping.\n", + map_id); + + for(int j=0; jnum_entities; j++) { + free_entity(&env->entities[j]); + } + free(env->entities); + free(env->active_agent_indices); + free(env->static_agent_indices); + free(env->expert_static_agent_indices); + free(env); + retry_count++; + continue; // Try another map + } + + // Store map_id + PyObject* map_id_obj = PyLong_FromLong(map_id); + PyList_SetItem(map_ids, env_count, map_id_obj); + + // Store agent offset + PyObject* offset = PyLong_FromLong(total_agent_count); + PyList_SetItem(agent_offsets, env_count, offset); + + // Calculate number of co-players based on active_agent_count + // We want 1 ego + as many co-players as the map supports (up to agents_per_env - 1) + int num_coplayers = (env->active_agent_count < agents_per_env) + ? (env->active_agent_count - 1) + : (agents_per_env - 1); + + int actual_agents = 1 + num_coplayers; // 1 ego + co-players + + // Create agent role array for this world (0 = coplayer, 1 = ego) + // Only for the actual agents (not placeholders) + int* agent_roles = malloc(actual_agents * sizeof(int)); + if (!agent_roles) { + for(int j=0; jnum_entities; j++) { + free_entity(&env->entities[j]); + } + free(env->entities); + free(env->active_agent_indices); + free(env->static_agent_indices); + free(env->expert_static_agent_indices); + free(env); + Py_DECREF(agent_offsets); + Py_DECREF(map_ids); + Py_DECREF(ego_agent_ids); + Py_DECREF(coplayer_ids); + return PyErr_NoMemory(); + } + + agent_roles[0] = 1; // First is ego + for (int i = 1; i < actual_agents; i++) { + agent_roles[i] = 0; // Rest are co-players + } + + // Fisher-Yates shuffle to randomize ego and co-player positions + for (int i = actual_agents - 1; i > 0; i--) { + int j = rand() % (i + 1); + int temp = agent_roles[i]; + agent_roles[i] = agent_roles[j]; + agent_roles[j] = temp; + } + + // Create ego and coplayer lists for this world + PyObject* ego_list = PyList_New(0); + PyObject* coplayer_list = PyList_New(0); + + if (!ego_list || !coplayer_list) { + Py_XDECREF(ego_list); + Py_XDECREF(coplayer_list); + free(agent_roles); + for(int j=0; jnum_entities; j++) { + free_entity(&env->entities[j]); + } + free(env->entities); + free(env->active_agent_indices); + free(env->static_agent_indices); + free(env->expert_static_agent_indices); + free(env); + Py_DECREF(agent_offsets); + Py_DECREF(map_ids); + Py_DECREF(ego_agent_ids); + Py_DECREF(coplayer_ids); + return PyErr_NoMemory(); + } + + // Assign agents based on shuffled roles + for (int a = 0; a < actual_agents; a++) { + PyObject* agent_id = PyLong_FromLong(total_agent_count); + + if (agent_roles[a] == 1) { + // This agent is the ego + PyList_Append(ego_list, agent_id); + total_egos_assigned++; + } else { + // This agent is a co-player + PyList_Append(coplayer_list, agent_id); + total_coplayers_assigned++; + } + + Py_DECREF(agent_id); + total_agent_count++; + } + + // Skip ahead for placeholder slots (not assigned to any agent) + int num_placeholders = agents_per_env - actual_agents; + total_agent_count += num_placeholders; + + PyList_SetItem(ego_agent_ids, env_count, ego_list); + PyList_SetItem(coplayer_ids, env_count, coplayer_list); + + printf("World %d (map %d): 1 ego, %d co-players, %d placeholder slots (active_agent_count: %d)\n", + env_count, map_id, num_coplayers, num_placeholders, env->active_agent_count); + + // Free the agent roles array + free(agent_roles); + + env_count++; + + // Free environment resources + for(int j=0; jnum_entities; j++) { + free_entity(&env->entities[j]); + } + free(env->entities); + free(env->active_agent_indices); + free(env->static_agent_indices); + free(env->expert_static_agent_indices); + free(env); + } + + // Check if we successfully created all environments + if (env_count < num_environments) { + Py_DECREF(agent_offsets); + Py_DECREF(map_ids); + Py_DECREF(ego_agent_ids); + Py_DECREF(coplayer_ids); + PyErr_Format(PyExc_RuntimeError, + "Could not create %d environments (only created %d). Not enough valid maps.", + num_environments, env_count); + return NULL; + } + + // Store final agent offset + PyObject* final_total_agent_count = PyLong_FromLong(total_agent_count); + PyList_SetItem(agent_offsets, num_environments, final_total_agent_count); + PyObject* final_env_count = PyLong_FromLong(env_count); + + // Create return tuple with 5 elements (same as my_shared_split_numerically) + PyObject* tuple = PyTuple_New(5); + if (!tuple) { + Py_DECREF(agent_offsets); + Py_DECREF(map_ids); + Py_DECREF(final_env_count); + Py_DECREF(ego_agent_ids); + Py_DECREF(coplayer_ids); + return PyErr_NoMemory(); + } + + PyTuple_SetItem(tuple, 0, agent_offsets); + PyTuple_SetItem(tuple, 1, map_ids); + PyTuple_SetItem(tuple, 2, final_env_count); + PyTuple_SetItem(tuple, 3, ego_agent_ids); + PyTuple_SetItem(tuple, 4, coplayer_ids); + + int total_placeholders = total_agent_count - total_egos_assigned - total_coplayers_assigned; + printf("Total: %d agents across %d worlds (%d egos, %d co-players, %d placeholders)\n", + total_agent_count, env_count, total_egos_assigned, total_coplayers_assigned, total_placeholders); + + return tuple; +} + +static PyObject* my_shared_split_numerically(PyObject* self, PyObject* args, PyObject* kwargs) { int num_agents = unpack(kwargs, "num_agents"); int num_maps = unpack(kwargs, "num_maps"); int num_ego_agents = unpack(kwargs, "num_ego_agents"); @@ -343,8 +592,8 @@ static PyObject* my_shared_population_play(PyObject* self, PyObject* args, PyObj PyList_SetItem(ego_agent_ids, env_count, ego_list); PyList_SetItem(coplayer_ids, env_count, coplayer_list); - printf("World %d (map %d): %d agents (%d egos, %d co-players)\n", - env_count, map_id, env->active_agent_count, world_egos, world_coplayers); + // printf("World %d (map %d): %d agents (%d egos, %d co-players)\n", + // env_count, map_id, env->active_agent_count, world_egos, world_coplayers); env_count++; @@ -393,3 +642,13 @@ static PyObject* my_shared_population_play(PyObject* self, PyObject* args, PyObj return tuple; } + +static PyObject* my_shared_population_play(PyObject* self, PyObject* args, PyObject* kwargs){ + int one_ego_per_scene = unpack(kwargs, "one_ego_per_scene"); + if (one_ego_per_scene){ + return my_shared_one_ego_per_scene(self, args, kwargs); + } + else{ + return my_shared_split_numerically(self, args, kwargs); + } +} diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 7dbb2c205b..10d9231503 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -368,6 +368,7 @@ struct Drive { int* tracks_to_predict_indices; int init_mode; int control_mode; + int create_expert_overflow ; // Reward conditioning bool use_rc; @@ -397,6 +398,8 @@ struct Drive { int num_ego_agents; int* co_player_ids; int* ego_agent_ids; + int num_place_holders; + int* place_holder_ids; bool population_play; @@ -1446,6 +1449,8 @@ void set_active_agents(Drive* env){ if(env->num_agents == 0){ env->num_agents = MAX_AGENTS; } + int skipped_overflow_count = 0; // Track agents skipped due to overflow + int skipped_noncontrolled_count = 0; // Track non-controlled agents skipped // Iterate through entities to find agents to create and/or control for(int i = 0; i < env->num_objects && env->num_actors < MAX_AGENTS; i++){ @@ -1457,10 +1462,12 @@ void set_active_agents(Drive* env){ continue; } - // Determine if entity should be created + // Determine if entity should be created based on init_mode and control_mode bool should_create = false; if (env->init_mode == INIT_ALL_VALID) { should_create = true; // All valid entities + } else if (env->init_mode == INIT_ONLY_CONTROLLABLE_AGENTS) { + should_create = should_control_agent(env, i); } else if (env->control_mode == CONTROL_VEHICLES) { should_create = (entity->type == VEHICLE); } else { // Control all agents @@ -1469,17 +1476,57 @@ void set_active_agents(Drive* env){ if (!should_create) continue; - env->num_actors++; - // Determine if this agent should be policy-controlled - bool is_controlled = false; - is_controlled = should_control_agent(env, i); + bool is_controlled = should_control_agent(env, i); + + // NEW: When create_expert_overflow is disabled, skip vehicles/agents that won't be controlled + if (!env->create_expert_overflow && !is_controlled) { + if (env->control_mode == CONTROL_VEHICLES && entity->type == VEHICLE) { + // Skip non-controlled vehicles + skipped_noncontrolled_count++; + continue; + } else if (env->control_mode != CONTROL_VEHICLES && + (entity->type == VEHICLE || entity->type == PEDESTRIAN || entity->type == CYCLIST)) { + // Skip all non-controlled agents when controlling all + skipped_noncontrolled_count++; + continue; + } + } - if (is_controlled && env->active_agent_count >= env->max_controlled_agents && env->max_controlled_agents != -1) { + // Check if we should skip overflow agents when create_expert_overflow is disabled + if (is_controlled && + env->active_agent_count >= env->max_controlled_agents && + env->max_controlled_agents != -1 && + !env->create_expert_overflow) { + + // Determine what to skip based on control mode + bool should_skip = false; + if (env->control_mode == CONTROL_VEHICLES) { + // Only skip overflow vehicles + should_skip = (entity->type == VEHICLE); + } else { + // Skip all overflow controllable agents + should_skip = true; + } + + if (should_skip) { + skipped_overflow_count++; + continue; // Don't create this agent at all + } + } + + env->num_actors++; + + // NEW: Handle max_controlled_agents (creates experts ONLY if create_expert_overflow is enabled) + if (is_controlled && + env->active_agent_count >= env->max_controlled_agents && + env->max_controlled_agents != -1 && + env->create_expert_overflow) { is_controlled = false; entity->mark_as_expert = 1; } + // Assign agent to appropriate category if(is_controlled){ active_agent_indices[env->active_agent_count] = i; env->active_agent_count++; @@ -1500,13 +1547,14 @@ void set_active_agents(Drive* env){ env->active_agent_indices = (int*)malloc(env->active_agent_count * sizeof(int)); env->static_agent_indices = (int*)malloc(env->static_agent_count * sizeof(int)); env->expert_static_agent_indices = (int*)malloc(env->expert_static_agent_count * sizeof(int)); - for(int i=0;iactive_agent_count;i++){ + + for(int i=0; iactive_agent_count; i++){ env->active_agent_indices[i] = active_agent_indices[i]; } - for(int i=0;istatic_agent_count;i++){ + for(int i=0; istatic_agent_count; i++){ env->static_agent_indices[i] = static_agent_indices[i]; } - for(int i=0;iexpert_static_agent_count;i++){ + for(int i=0; iexpert_static_agent_count; i++){ env->expert_static_agent_indices[i] = expert_static_agent_indices[i]; } return; @@ -1708,6 +1756,10 @@ void c_close(Drive* env){ freeTopologyGraph(env->topology_graph); // free(env->map_name); free(env->ini_file); + if (env->place_holder_ids != NULL) { + free(env->place_holder_ids); + env->place_holder_ids = NULL; + } } diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index a3b3b0bba1..4d1d1eb00a 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -38,13 +38,15 @@ def __init__( init_steps=0, init_mode="create_all_valid", control_mode="control_vehicles", + create_expert_overflow = True, k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", conditioning={}, # ego conditioning co_player_enabled=False, - co_player_num_ego=512, + num_ego_agents=512, co_player_policy={}, + one_ego_per_scene = False, ): # env self.dt = dt @@ -64,6 +66,7 @@ def __init__( self.scenario_length = scenario_length self.resample_frequency = resample_frequency self.ini_file = ini_file + self.one_ego_per_scene = one_ego_per_scene # Adaptive driving agent setup self.adaptive_driving_agent = int(adaptive_driving_agent) @@ -130,10 +133,11 @@ def __init__( # Co-player policy setup self.population_play = co_player_enabled self.num_agents = num_agents - self.num_ego_agents = co_player_num_ego if self.population_play else num_agents + self.num_ego_agents = num_ego_agents if self.population_play else num_agents # Co-player conditioning setup self.co_player_conditioning = co_player_policy.get("conditioning") + self.co_player_policy_info = co_player_policy if self.co_player_conditioning: self.co_player_condition_type = self.co_player_conditioning.get("type") @@ -155,6 +159,7 @@ def __init__( self.init_steps = init_steps self.init_mode_str = init_mode self.control_mode_str = control_mode + self.create_expert_overflow = create_expert_overflow if self.control_mode_str == "control_vehicles": self.control_mode = 0 @@ -214,25 +219,22 @@ def __init__( ) if self.condition_type != "none" and self.co_player_condition_type != "none": raise NotImplementedError("Only one agent can be conditioned at once") + + if self.one_ego_per_scene: + assert (max_controlled_agents * self.num_ego_agents == self.num_agents) + + if not self.population_play and self.one_ego_per_scene: + raise NotImplementedError("Must be population play if only one ego per scene") + + self.max_controlled_agents = int(max_controlled_agents) self._set_env_variables() - if self.population_play: - self.co_player_policy_name = co_player_policy.get("policy_name") - self.co_player_rnn_name = co_player_policy.get("rnn_name") - self.co_player_policy = co_player_policy.get("co_player_policy_func") - self._set_co_player_state() - super().__init__(buf=buf) if self.population_play: self.action_space = pufferlib.spaces.joint_space(self.single_action_space, self.num_ego_agents) - co_player_atn_space = pufferlib.spaces.joint_space(self.single_action_space, self.num_co_players) - if isinstance(self.single_action_space, pufferlib.spaces.Box): - self.co_player_actions = np.zeros(co_player_atn_space.shape, dtype=co_player_atn_space.dtype) - else: - self.co_player_actions = np.zeros(co_player_atn_space.shape, dtype=np.int32) env_ids = [] for i in range(self.num_envs): @@ -268,6 +270,8 @@ def __init__( co_player_ids=self.local_co_player_ids[i], ego_agent_ids=self.local_ego_ids[i], num_ego_agents=len(self.local_ego_ids[i]), + place_holder_ids = self.local_place_holder_ids[i], + num_place_holders = len(self.local_place_holder_ids[i]), init_steps=init_steps, use_rc=self.reward_conditioned, use_ec=self.entropy_conditioned, @@ -284,6 +288,7 @@ def __init__( discount_weight_ub=self.discount_weight_ub, init_mode=self.init_mode, control_mode=self.control_mode, + create_expert_overflow = self.create_expert_overflow ) env_ids.append(env_id) @@ -294,10 +299,10 @@ def reset(self, seed=0): info = [] if self.population_play: info.append(self.ego_ids) - self._reset_co_player_state() self.tick = 0 return self.observations, info + def _set_env_variables(self): my_shared_tuple = binding.shared( num_agents=self.num_agents, @@ -309,33 +314,59 @@ def _set_env_variables(self): goal_behavior=self.goal_behavior, population_play=self.population_play, num_ego_agents=self.num_ego_agents, + one_ego_per_scene=self.one_ego_per_scene, ) if self.population_play: + # Both return 5 elements now self.agent_offsets, self.map_ids, num_envs, ego_ids, co_player_ids = my_shared_tuple - + self.num_envs = num_envs + # Flatten the lists self.ego_ids = [item for sublist in ego_ids for item in sublist] self.co_player_ids = [item for sublist in co_player_ids for item in sublist] - + + # Compute placeholders on Python side all_agents = set(range(self.num_agents)) ego_set = set(self.ego_ids) co_player_set = set(self.co_player_ids) + self.place_holder_ids = sorted(list(all_agents - ego_set - co_player_set)) + + # print(f"DEBUG: Ego Ids: {self.ego_ids}", flush=True) + # print(f"DEBUG: co player ids {self.co_player_ids}", flush=True) + # print(f"DEBUG: placeholder ids {self.place_holder_ids}", flush=True) + + # Store counts self.num_ego_agents = len(self.ego_ids) self.num_co_players = len(self.co_player_ids) + self.num_place_holders = len(self.place_holder_ids) + # Validation checks if ego_set & co_player_set: raise ValueError("Overlap between ego ids and co player ids") - if ego_set | co_player_set != all_agents: - raise ValueError("Missing agent ids") + # Check total count matches + if self.num_ego_agents + self.num_co_players + self.num_place_holders != self.num_agents: + raise ValueError( + f"Agent count mismatch: {self.num_ego_agents} egos + {self.num_co_players} co-players + " + f"{self.num_place_holders} placeholders = " + f"{self.num_ego_agents + self.num_co_players + self.num_place_holders}, expected {self.num_agents}" + ) - if self.num_ego_agents + self.num_co_players != self.num_agents: - raise ValueError("Mismatch between number of ego/co players and number of agents") + # For one_ego_per_scene mode, verify we have the expected number of egos + if self.one_ego_per_scene: + expected_egos = self.num_envs + if self.num_ego_agents != expected_egos: + raise ValueError( + f"one_ego_per_scene mode: expected {expected_egos} egos (1 per env), got {self.num_ego_agents}" + ) - self.total_agents = self.num_co_players + self.num_ego_agents - self.num_agents = self.total_agents + # total_agents includes ALL agent slots (ego + co-player + placeholder) + self.total_agents = self.num_agents # This includes placeholders + + print(f"DEBUG: Total agents (all slots): {self.total_agents}", flush=True) + print(f"DEBUG: Breakdown - Egos: {self.num_ego_agents}, Co-players: {self.num_co_players}, Placeholders: {self.num_place_holders}", flush=True) # Build per-environment ID lists local_ego_ids = [] @@ -364,11 +395,48 @@ def _set_env_variables(self): local_co_player_ids.append([cid - min_id_in_world for cid in co_player_ids[i]]) - self.local_co_player_ids = local_co_player_ids + local_place_holder_ids = [] + for i in range(num_envs): + if len(ego_ids[i]) > 0 and len(co_player_ids[i]) > 0: + min_id_in_world = min(ego_ids[i] + co_player_ids[i]) + elif len(ego_ids[i]) > 0: + min_id_in_world = min(ego_ids[i]) + elif len(co_player_ids[i]) > 0: + min_id_in_world = min(co_player_ids[i]) + else: + min_id_in_world = 0 + + # Compute placeholder IDs for this environment + env_start = self.agent_offsets[i] + env_end = self.agent_offsets[i + 1] + env_agent_ids = set(range(env_start, env_end)) + env_ego_set = set(ego_ids[i]) + env_co_player_set = set(co_player_ids[i]) + env_placeholder_ids = sorted(list(env_agent_ids - env_ego_set - env_co_player_set)) + + # Convert to local IDs + local_place_holder_ids.append([pid - min_id_in_world for pid in env_placeholder_ids]) + + self.local_ego_ids = local_ego_ids + self.local_co_player_ids = local_co_player_ids + self.local_place_holder_ids = local_place_holder_ids + if self.co_player_condition_type is not None and self.co_player_condition_type != "none": self._set_co_player_conditioning() + if self.population_play: + self.co_player_policy_name = self.co_player_policy_info.get("policy_name") + self.co_player_rnn_name = self.co_player_policy_info.get("rnn_name") + self.co_player_policy = self.co_player_policy_info.get("co_player_policy_func") + self._set_co_player_state() + + + co_player_atn_space = pufferlib.spaces.joint_space(self.single_action_space, self.num_co_players) + if isinstance(self.single_action_space, pufferlib.spaces.Box): + self.co_player_actions = np.zeros(co_player_atn_space.shape, dtype=co_player_atn_space.dtype) + else: + self.co_player_actions = np.zeros(co_player_atn_space.shape, dtype=np.int32) else: self.agent_offsets, self.map_ids, self.num_envs = my_shared_tuple self.ego_ids = [i for i in range(self.agent_offsets[-1])] @@ -397,17 +465,6 @@ def _set_co_player_state(self): lstm_c=torch.zeros(self.num_co_players, self.co_player_policy.hidden_size), ) - def _reset_co_player_state(self, done_indices=None): - """Reset LSTM state for co-players whose episodes ended""" - with torch.no_grad(): - if done_indices is None: - # Reset all - self._set_co_player_state() - else: - # Reset only specific co-players - device = self.state["lstm_h"].device - self.state["lstm_h"][done_indices] = 0 - self.state["lstm_c"][done_indices] = 0 def _add_co_player_conditioning(self, observations): """Add pre-sampled conditioning variables to co-player observations""" @@ -501,21 +558,16 @@ def _aggregate_scenario_metrics(self, scenario_infos): return aggregated def _compute_delta_metrics(self): - """Compute delta metrics between first and last scenario.""" + """Compute absolute delta metrics between first and last scenario.""" if len(self.scenario_metrics) < 2: return {} first_metrics = self.scenario_metrics[0] last_metrics = self.scenario_metrics[-1] - def compute_delta_percent(first_val, last_val): - if abs(first_val) < 0.0001: - return 0.0 - return (last_val - first_val) / first_val * 100.0 - delta_metrics = {} - # Compute deltas for key metrics + # Compute absolute deltas for key metrics (last - first) metrics_to_track = [ "score", "collision_rate", @@ -532,7 +584,7 @@ def compute_delta_percent(first_val, last_val): for metric in metrics_to_track: if metric in first_metrics and metric in last_metrics: delta_key = f"ada_delta_{metric}" - delta_metrics[delta_key] = compute_delta_percent(first_metrics[metric], last_metrics[metric]) + delta_metrics[delta_key] = last_metrics[metric] - first_metrics[metric] # Add a count of how many agents this represents if "n" in last_metrics: @@ -557,28 +609,38 @@ def step(self, actions): if self.tick % self.report_interval == 0: log = binding.vec_log(self.c_envs) if log: + print(f"log is {log}", flush=True) + # Accumulate infos for adaptive agent tracking if self.adaptive_driving_agent: self.current_scenario_infos.append(log) + + # Only append log to info if it's the 0th scenario + if self.current_scenario == 0: + info.append(log) - info.append(log) - + # Check if we've completed a scenario if self.tick % self.scenario_length == 0: + # Before incrementing scenario, process the completed scenario if self.adaptive_driving_agent and self.current_scenario_infos: + # Aggregate metrics for the completed scenario scenario_log = self._aggregate_scenario_metrics(self.current_scenario_infos) - scenario_log["scenario_id"] = self.current_scenario + scenario_log['scenario_id'] = self.current_scenario self.scenario_metrics.append(scenario_log) - + + # Check if this is the last scenario if self.current_scenario == self.k_scenarios - 1: + # Compute and append delta metrics delta_metrics = self._compute_delta_metrics() - if delta_metrics and info: - info[-1].update(delta_metrics) - elif delta_metrics: + if delta_metrics: info.append(delta_metrics) - + + # Reset for next cycle self.scenario_metrics = [] - + + # Clear accumulated infos for next scenario self.current_scenario_infos = [] - + + # Increment scenario counter self.current_scenario = (self.current_scenario + 1) % self.k_scenarios if self.tick > 0 and self.resample_frequency > 0 and self.tick % self.resample_frequency == 0: @@ -593,7 +655,7 @@ def step(self, actions): self.scenario_metrics = [] self.current_scenario_infos = [] self.current_scenario = 0 - + binding.vec_close(self.c_envs) self._set_env_variables() env_ids = [] @@ -644,9 +706,12 @@ def step(self, actions): co_player_ids=self.local_co_player_ids[i], ego_agent_ids=self.local_ego_ids[i], num_ego_agents=len(self.local_ego_ids[i]), + place_holder_ids = self.local_place_holder_ids[i], + num_place_holders = len(self.local_place_holder_ids[i]), init_steps=self.init_steps, init_mode=self.init_mode, control_mode=self.control_mode, + create_expert_overflow = self.create_expert_overflow ) env_ids.append(env_id) self.c_envs = binding.vectorize(*env_ids) @@ -658,7 +723,6 @@ def step(self, actions): info.append(self.ego_ids) return (self.observations, self.rewards, self.terminals, self.truncations, info) - def get_global_agent_state(self): """Get current global state of all active agents. diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 125c4c9fd5..ffc9657b21 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -293,7 +293,6 @@ def evaluate(self): while self.full_rows < self.segments: profile("env", epoch) o, r, d, t, info, env_id, mask = self.vecenv.recv() - # print(f"o shape is {o.shape}", flush = True) if self.population_play: batch_size = self.vecenv.batch_size ego_ids = info[-1] diff --git a/pufferlib/pufferlib.py b/pufferlib/pufferlib.py index e718241a05..60cc2a9f68 100644 --- a/pufferlib/pufferlib.py +++ b/pufferlib/pufferlib.py @@ -38,7 +38,7 @@ def set_buffers(env, buf=None): env.truncations = buf["truncations"] env.masks = buf["masks"] env.actions = buf["actions"] - + class PufferEnv: def __init__(self, buf=None): diff --git a/pufferlib/vector.py b/pufferlib/vector.py index 24ac492405..bdd3d11b9b 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -408,7 +408,7 @@ def __init__( notify=np.ndarray(num_workers, dtype=bool, buffer=self.shm["notify"]), ) self.buf["semaphores"][:] = MAIN - + print(f"observation shape after buffer created {self.buf["observations"].shape}", flush = True) from multiprocessing import Pipe, Process self.send_pipes, w_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) From 284c9d866940d6b8b0cd936ce7149b244fca6e69 Mon Sep 17 00:00:00 2001 From: "charliemolony59@gmail.com" Date: Mon, 24 Nov 2025 09:47:05 -0500 Subject: [PATCH 11/13] Training with one ego per world working --- pufferlib/config/ocean/adaptive.ini | 22 +++--- pufferlib/models.py | 4 +- pufferlib/ocean/drive/binding.h | 38 ++++++----- pufferlib/ocean/drive/drive.h | 16 ++--- pufferlib/ocean/drive/drive.py | 101 ++++++++++++---------------- pufferlib/pufferlib.py | 2 +- pufferlib/vector.py | 2 +- 7 files changed, 87 insertions(+), 98 deletions(-) diff --git a/pufferlib/config/ocean/adaptive.ini b/pufferlib/config/ocean/adaptive.ini index 5592fe1cb1..8773a0703c 100644 --- a/pufferlib/config/ocean/adaptive.ini +++ b/pufferlib/config/ocean/adaptive.ini @@ -19,7 +19,7 @@ input_size = 256 hidden_size = 256 [env] -num_agents = 1024 +num_agents = 8192 ; Options: discrete, continuous action_type = discrete ; Options: classic, jerk @@ -50,17 +50,19 @@ init_steps = 0 ; Options: "control_vehicles", "control_agents", "control_tracks_to_predict" control_mode = "control_vehicles" ; -max_controlled_agents = -1 +max_controlled_agents = 16 ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" ; -create_expert_overflow = True -; train with co players -co_player_enabled = True -; +create_expert_overflow = False +; train with co players, co players must be defined below if this is true +co_player_enabled = False +; Train with only one ego agent per world, If trye then num_agents must be equal to max_controlled agents * num_ego agents one_ego_per_scene = False -; +; Total number of agents training across all worlds num_ego_agents = 512 +; + [env.conditioning] ; Options: "none", "reward", "entropy", "discount", "all" @@ -99,12 +101,12 @@ goal_weight_lb = 0.0 goal_weight_ub = 1.0 entropy_weight_lb = 0.0 entropy_weight_ub = 0.001 -discount_weight_lb = 0.98 -discount_weight_ub = 0.80 +discount_weight_lb = 0.80 +discount_weight_ub = 0.98 [train] -total_timesteps = 2_000_000_000 +total_timesteps = 3_000_000_000 # learning_rate = 0.02 # gamma = 0.985 anneal_lr = True diff --git a/pufferlib/models.py b/pufferlib/models.py index 9e5b47791c..bbb55fdc03 100644 --- a/pufferlib/models.py +++ b/pufferlib/models.py @@ -134,7 +134,9 @@ def forward_eval(self, observations, state): # TODO: Don't break compile if h is not None: - assert h.shape[0] == c.shape[0] == observations.shape[0], f"LSTM state must be (h, c), h shape {h.shape[0]}, observations shape: {observations.shape[0]}" + assert h.shape[0] == c.shape[0] == observations.shape[0], ( + f"LSTM state must be (h, c), h shape {h.shape[0]}, observations shape: {observations.shape[0]}" + ) lstm_state = (h, c) else: lstm_state = None diff --git a/pufferlib/ocean/drive/binding.h b/pufferlib/ocean/drive/binding.h index 56eb5d29b6..807576ec09 100644 --- a/pufferlib/ocean/drive/binding.h +++ b/pufferlib/ocean/drive/binding.h @@ -168,7 +168,6 @@ static double* unpack_float_array(PyObject* kwargs, char* key, Py_ssize_t* out_s return array; } - static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyObject* kwargs){ // Extract parameters int num_agents = unpack(kwargs, "num_agents"); // total agents across all environments @@ -189,7 +188,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO } int agents_per_env = num_agents / num_environments; - + printf("Creating %d worlds with 1 ego + up to %d co-players per world (max_controlled: %d)\n", num_environments, agents_per_env - 1, max_controlled_agents); @@ -231,7 +230,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO Py_DECREF(coplayer_ids); return PyErr_NoMemory(); } - + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); env->entities = load_map_binary(map_file, env); @@ -249,7 +248,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO fprintf(stderr, "[one_ego_per_scene] WARNING: Map %d has no active agents. Skipping.\n", map_id); - + for(int j=0; jnum_entities; j++) { free_entity(&env->entities[j]); } @@ -272,12 +271,12 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO // Calculate number of co-players based on active_agent_count // We want 1 ego + as many co-players as the map supports (up to agents_per_env - 1) - int num_coplayers = (env->active_agent_count < agents_per_env) - ? (env->active_agent_count - 1) + int num_coplayers = (env->active_agent_count < agents_per_env) + ? (env->active_agent_count - 1) : (agents_per_env - 1); - + int actual_agents = 1 + num_coplayers; // 1 ego + co-players - + // Create agent role array for this world (0 = coplayer, 1 = ego) // Only for the actual agents (not placeholders) int* agent_roles = malloc(actual_agents * sizeof(int)); @@ -296,12 +295,16 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO Py_DECREF(coplayer_ids); return PyErr_NoMemory(); } - - agent_roles[0] = 1; // First is ego - for (int i = 1; i < actual_agents; i++) { - agent_roles[i] = 0; // Rest are co-players + + // Initialize all as co-players + for (int i = 0; i < actual_agents; i++) { + agent_roles[i] = 0; } - + + // Randomly select one agent to be the ego + int ego_idx = rand() % actual_agents; + agent_roles[ego_idx] = 1; + // Fisher-Yates shuffle to randomize ego and co-player positions for (int i = actual_agents - 1; i > 0; i--) { int j = rand() % (i + 1); @@ -336,7 +339,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO // Assign agents based on shuffled roles for (int a = 0; a < actual_agents; a++) { PyObject* agent_id = PyLong_FromLong(total_agent_count); - + if (agent_roles[a] == 1) { // This agent is the ego PyList_Append(ego_list, agent_id); @@ -346,7 +349,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO PyList_Append(coplayer_list, agent_id); total_coplayers_assigned++; } - + Py_DECREF(agent_id); total_agent_count++; } @@ -363,7 +366,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO // Free the agent roles array free(agent_roles); - + env_count++; // Free environment resources @@ -404,7 +407,7 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO Py_DECREF(coplayer_ids); return PyErr_NoMemory(); } - + PyTuple_SetItem(tuple, 0, agent_offsets); PyTuple_SetItem(tuple, 1, map_ids); PyTuple_SetItem(tuple, 2, final_env_count); @@ -417,7 +420,6 @@ static PyObject* my_shared_one_ego_per_scene(PyObject* self, PyObject* args, PyO return tuple; } - static PyObject* my_shared_split_numerically(PyObject* self, PyObject* args, PyObject* kwargs) { int num_agents = unpack(kwargs, "num_agents"); int num_maps = unpack(kwargs, "num_maps"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 10d9231503..e245700efa 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -1485,7 +1485,7 @@ void set_active_agents(Drive* env){ // Skip non-controlled vehicles skipped_noncontrolled_count++; continue; - } else if (env->control_mode != CONTROL_VEHICLES && + } else if (env->control_mode != CONTROL_VEHICLES && (entity->type == VEHICLE || entity->type == PEDESTRIAN || entity->type == CYCLIST)) { // Skip all non-controlled agents when controlling all skipped_noncontrolled_count++; @@ -1494,11 +1494,11 @@ void set_active_agents(Drive* env){ } // Check if we should skip overflow agents when create_expert_overflow is disabled - if (is_controlled && - env->active_agent_count >= env->max_controlled_agents && + if (is_controlled && + env->active_agent_count >= env->max_controlled_agents && env->max_controlled_agents != -1 && !env->create_expert_overflow) { - + // Determine what to skip based on control mode bool should_skip = false; if (env->control_mode == CONTROL_VEHICLES) { @@ -1508,7 +1508,7 @@ void set_active_agents(Drive* env){ // Skip all overflow controllable agents should_skip = true; } - + if (should_skip) { skipped_overflow_count++; continue; // Don't create this agent at all @@ -1518,8 +1518,8 @@ void set_active_agents(Drive* env){ env->num_actors++; // NEW: Handle max_controlled_agents (creates experts ONLY if create_expert_overflow is enabled) - if (is_controlled && - env->active_agent_count >= env->max_controlled_agents && + if (is_controlled && + env->active_agent_count >= env->max_controlled_agents && env->max_controlled_agents != -1 && env->create_expert_overflow) { is_controlled = false; @@ -1547,7 +1547,7 @@ void set_active_agents(Drive* env){ env->active_agent_indices = (int*)malloc(env->active_agent_count * sizeof(int)); env->static_agent_indices = (int*)malloc(env->static_agent_count * sizeof(int)); env->expert_static_agent_indices = (int*)malloc(env->expert_static_agent_count * sizeof(int)); - + for(int i=0; iactive_agent_count; i++){ env->active_agent_indices[i] = active_agent_indices[i]; } diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 4d1d1eb00a..b9caeb0a3d 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -38,7 +38,7 @@ def __init__( init_steps=0, init_mode="create_all_valid", control_mode="control_vehicles", - create_expert_overflow = True, + create_expert_overflow=True, k_scenarios=1, adaptive_driving_agent=False, ini_file="pufferlib/config/ocean/drive.ini", @@ -46,7 +46,7 @@ def __init__( co_player_enabled=False, num_ego_agents=512, co_player_policy={}, - one_ego_per_scene = False, + one_ego_per_scene=False, ): # env self.dt = dt @@ -219,14 +219,12 @@ def __init__( ) if self.condition_type != "none" and self.co_player_condition_type != "none": raise NotImplementedError("Only one agent can be conditioned at once") - + if self.one_ego_per_scene: - assert (max_controlled_agents * self.num_ego_agents == self.num_agents) - + assert max_controlled_agents * self.num_ego_agents == self.num_agents + if not self.population_play and self.one_ego_per_scene: raise NotImplementedError("Must be population play if only one ego per scene") - - self.max_controlled_agents = int(max_controlled_agents) @@ -270,8 +268,8 @@ def __init__( co_player_ids=self.local_co_player_ids[i], ego_agent_ids=self.local_ego_ids[i], num_ego_agents=len(self.local_ego_ids[i]), - place_holder_ids = self.local_place_holder_ids[i], - num_place_holders = len(self.local_place_holder_ids[i]), + place_holder_ids=self.local_place_holder_ids[i], + num_place_holders=len(self.local_place_holder_ids[i]), init_steps=init_steps, use_rc=self.reward_conditioned, use_ec=self.entropy_conditioned, @@ -288,7 +286,7 @@ def __init__( discount_weight_ub=self.discount_weight_ub, init_mode=self.init_mode, control_mode=self.control_mode, - create_expert_overflow = self.create_expert_overflow + create_expert_overflow=self.create_expert_overflow, ) env_ids.append(env_id) @@ -302,7 +300,6 @@ def reset(self, seed=0): self.tick = 0 return self.observations, info - def _set_env_variables(self): my_shared_tuple = binding.shared( num_agents=self.num_agents, @@ -320,23 +317,19 @@ def _set_env_variables(self): if self.population_play: # Both return 5 elements now self.agent_offsets, self.map_ids, num_envs, ego_ids, co_player_ids = my_shared_tuple - + self.num_envs = num_envs # Flatten the lists self.ego_ids = [item for sublist in ego_ids for item in sublist] self.co_player_ids = [item for sublist in co_player_ids for item in sublist] - + # Compute placeholders on Python side all_agents = set(range(self.num_agents)) ego_set = set(self.ego_ids) co_player_set = set(self.co_player_ids) self.place_holder_ids = sorted(list(all_agents - ego_set - co_player_set)) - - # print(f"DEBUG: Ego Ids: {self.ego_ids}", flush=True) - # print(f"DEBUG: co player ids {self.co_player_ids}", flush=True) - # print(f"DEBUG: placeholder ids {self.place_holder_ids}", flush=True) - + # Store counts self.num_ego_agents = len(self.ego_ids) self.num_co_players = len(self.co_player_ids) @@ -364,9 +357,12 @@ def _set_env_variables(self): # total_agents includes ALL agent slots (ego + co-player + placeholder) self.total_agents = self.num_agents # This includes placeholders - + print(f"DEBUG: Total agents (all slots): {self.total_agents}", flush=True) - print(f"DEBUG: Breakdown - Egos: {self.num_ego_agents}, Co-players: {self.num_co_players}, Placeholders: {self.num_place_holders}", flush=True) + print( + f"DEBUG: Breakdown - Egos: {self.num_ego_agents}, Co-players: {self.num_co_players}, Placeholders: {self.num_place_holders}", + flush=True, + ) # Build per-environment ID lists local_ego_ids = [] @@ -405,7 +401,7 @@ def _set_env_variables(self): min_id_in_world = min(co_player_ids[i]) else: min_id_in_world = 0 - + # Compute placeholder IDs for this environment env_start = self.agent_offsets[i] env_end = self.agent_offsets[i + 1] @@ -413,15 +409,14 @@ def _set_env_variables(self): env_ego_set = set(ego_ids[i]) env_co_player_set = set(co_player_ids[i]) env_placeholder_ids = sorted(list(env_agent_ids - env_ego_set - env_co_player_set)) - + # Convert to local IDs local_place_holder_ids.append([pid - min_id_in_world for pid in env_placeholder_ids]) - self.local_ego_ids = local_ego_ids self.local_co_player_ids = local_co_player_ids self.local_place_holder_ids = local_place_holder_ids - + if self.co_player_condition_type is not None and self.co_player_condition_type != "none": self._set_co_player_conditioning() @@ -431,7 +426,6 @@ def _set_env_variables(self): self.co_player_policy = self.co_player_policy_info.get("co_player_policy_func") self._set_co_player_state() - co_player_atn_space = pufferlib.spaces.joint_space(self.single_action_space, self.num_co_players) if isinstance(self.single_action_space, pufferlib.spaces.Box): self.co_player_actions = np.zeros(co_player_atn_space.shape, dtype=co_player_atn_space.dtype) @@ -444,6 +438,7 @@ def _set_env_variables(self): raise ValueError("mismatch between number of ego agents and number of agents") self.local_co_player_ids = [[] for i in range(self.num_envs)] self.local_ego_ids = [[0] for i in range(self.num_envs)] + self.local_place_holder_ids = [[] for i in range(self.num_envs)] def get_co_player_actions(self): with torch.no_grad(): @@ -465,7 +460,6 @@ def _set_co_player_state(self): lstm_c=torch.zeros(self.num_co_players, self.co_player_policy.hidden_size), ) - def _add_co_player_conditioning(self, observations): """Add pre-sampled conditioning variables to co-player observations""" if self.cached_conditioning_array.shape[1] == 0: # No conditioning @@ -544,21 +538,14 @@ def _aggregate_scenario_metrics(self, scenario_infos): if isinstance(value, (int, float)): aggregated[key] = aggregated.get(key, 0.0) + value - # Average by number of logs (note: 'n' is already a count, don't average it) - if "n" in aggregated: - n = aggregated["n"] - for key in aggregated: - if key != "n": - aggregated[key] = aggregated[key] / n if n > 0 else 0.0 - else: - # If no 'n', just average by count of infos - for key in aggregated: - aggregated[key] = aggregated[key] / count if count > 0 else 0.0 + # Average by number of logs (metrics are already per-episode averages from vec_log) + for key in aggregated: + aggregated[key] = aggregated[key] / count if count > 0 else 0.0 return aggregated def _compute_delta_metrics(self): - """Compute absolute delta metrics between first and last scenario.""" + """Compute delta metrics between first and last scenario.""" if len(self.scenario_metrics) < 2: return {} @@ -567,7 +554,7 @@ def _compute_delta_metrics(self): delta_metrics = {} - # Compute absolute deltas for key metrics (last - first) + # Compute deltas for key metrics metrics_to_track = [ "score", "collision_rate", @@ -609,38 +596,33 @@ def step(self, actions): if self.tick % self.report_interval == 0: log = binding.vec_log(self.c_envs) if log: - print(f"log is {log}", flush=True) - # Accumulate infos for adaptive agent tracking if self.adaptive_driving_agent: self.current_scenario_infos.append(log) - - # Only append log to info if it's the 0th scenario - if self.current_scenario == 0: + + # Only append to info if we're in the 0th scenario + if self.current_scenario == 0: + info.append(log) + + else: + # Non-adaptive mode: always append info.append(log) # Check if we've completed a scenario if self.tick % self.scenario_length == 0: - # Before incrementing scenario, process the completed scenario if self.adaptive_driving_agent and self.current_scenario_infos: - # Aggregate metrics for the completed scenario scenario_log = self._aggregate_scenario_metrics(self.current_scenario_infos) - scenario_log['scenario_id'] = self.current_scenario + scenario_log["scenario_id"] = self.current_scenario self.scenario_metrics.append(scenario_log) - - # Check if this is the last scenario + if self.current_scenario == self.k_scenarios - 1: - # Compute and append delta metrics delta_metrics = self._compute_delta_metrics() if delta_metrics: info.append(delta_metrics) - - # Reset for next cycle + self.scenario_metrics = [] - - # Clear accumulated infos for next scenario + self.current_scenario_infos = [] - - # Increment scenario counter + self.current_scenario = (self.current_scenario + 1) % self.k_scenarios if self.tick > 0 and self.resample_frequency > 0 and self.tick % self.resample_frequency == 0: @@ -655,7 +637,7 @@ def step(self, actions): self.scenario_metrics = [] self.current_scenario_infos = [] self.current_scenario = 0 - + binding.vec_close(self.c_envs) self._set_env_variables() env_ids = [] @@ -706,12 +688,12 @@ def step(self, actions): co_player_ids=self.local_co_player_ids[i], ego_agent_ids=self.local_ego_ids[i], num_ego_agents=len(self.local_ego_ids[i]), - place_holder_ids = self.local_place_holder_ids[i], - num_place_holders = len(self.local_place_holder_ids[i]), + place_holder_ids=self.local_place_holder_ids[i], + num_place_holders=len(self.local_place_holder_ids[i]), init_steps=self.init_steps, init_mode=self.init_mode, control_mode=self.control_mode, - create_expert_overflow = self.create_expert_overflow + create_expert_overflow=self.create_expert_overflow, ) env_ids.append(env_id) self.c_envs = binding.vectorize(*env_ids) @@ -723,6 +705,7 @@ def step(self, actions): info.append(self.ego_ids) return (self.observations, self.rewards, self.terminals, self.truncations, info) + def get_global_agent_state(self): """Get current global state of all active agents. diff --git a/pufferlib/pufferlib.py b/pufferlib/pufferlib.py index 60cc2a9f68..e718241a05 100644 --- a/pufferlib/pufferlib.py +++ b/pufferlib/pufferlib.py @@ -38,7 +38,7 @@ def set_buffers(env, buf=None): env.truncations = buf["truncations"] env.masks = buf["masks"] env.actions = buf["actions"] - + class PufferEnv: def __init__(self, buf=None): diff --git a/pufferlib/vector.py b/pufferlib/vector.py index bdd3d11b9b..d4be2134b0 100644 --- a/pufferlib/vector.py +++ b/pufferlib/vector.py @@ -408,7 +408,7 @@ def __init__( notify=np.ndarray(num_workers, dtype=bool, buffer=self.shm["notify"]), ) self.buf["semaphores"][:] = MAIN - print(f"observation shape after buffer created {self.buf["observations"].shape}", flush = True) + print(f"observation shape after buffer created {self.buf['observations'].shape}", flush=True) from multiprocessing import Pipe, Process self.send_pipes, w_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) From 52f3884b586306e1ae9a5b41f587f59eb3c391ea Mon Sep 17 00:00:00 2001 From: Charlie Molony <73535968+charliemolony@users.noreply.github.com> Date: Mon, 24 Nov 2025 11:18:32 -0500 Subject: [PATCH 12/13] Update pufferlib/ocean/drive/drive.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- pufferlib/ocean/drive/drive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index b9caeb0a3d..88b7b58b2b 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -217,7 +217,7 @@ def __init__( raise ValueError( f"num ego agents ({self.num_ego_agents}) exceeds the number of total agents ({num_agents}))" ) - if self.condition_type != "none" and self.co_player_condition_type != "none": + if self.condition_type != "none" and hasattr(self, 'co_player_condition_type') and self.co_player_condition_type != "none": raise NotImplementedError("Only one agent can be conditioned at once") if self.one_ego_per_scene: From bc52eb72734158852ebcc3a230ba54147ccac253 Mon Sep 17 00:00:00 2001 From: Charlie Molony <73535968+charliemolony@users.noreply.github.com> Date: Mon, 24 Nov 2025 11:18:48 -0500 Subject: [PATCH 13/13] Update pufferlib/ocean/drive/drive.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- pufferlib/ocean/drive/drive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 88b7b58b2b..c011152c50 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -417,7 +417,7 @@ def _set_env_variables(self): self.local_co_player_ids = local_co_player_ids self.local_place_holder_ids = local_place_holder_ids - if self.co_player_condition_type is not None and self.co_player_condition_type != "none": + if hasattr(self, 'co_player_condition_type') and self.co_player_condition_type is not None and self.co_player_condition_type != "none": self._set_co_player_conditioning() if self.population_play: