
Commit d1bdc97

Retrain a new model; this change adds a turn-speed penalty
1 parent d52dbce commit d1bdc97

6 files changed

Lines changed: 242 additions & 48 deletions


baselines/ppo/config/ppo_base_puffer.yaml

Lines changed: 62 additions & 47 deletions
@@ -4,106 +4,121 @@ eval_model_path: null
 baseline: false
 data_dir: data/processed/training
 continue_training: true
-model_cpt: compatible_huggingface_model.pt
-environment: # Overrides default environment configs (see pygpudrive/env/config.py)
+model_cpt: /home/wbk/gpudrive/runs/PPO__C__S_64__01_19_09_41_46_795_提高达成率/model_PPO__C__S_64__01_19_09_41_46_795_025817.pt
+environment: # Overrides default environment configs (see gpudrive/env/config.py)
   name: "gpudrive"
-  num_worlds: 25 # Number of parallel environments
-  k_unique_scenes: 25 # Number of scenes to sample
-  max_controlled_agents: 64 # Maximum number of controlled agents; keep consistent with kMaxAgentCount in src/consts.hpp
+  num_worlds: 18 # Number of parallel environments (lowered further to cut the GPU-memory peak during resampling)
+  k_unique_scenes: 72 # Number of scenes to sample (lightens each sampling pass)
+  max_controlled_agents: 64 # Maximum number of controlled agents (must match the environment mask dimension)
   ego_state: true
   road_map_obs: true
   partner_obs: true
   norm_obs: true
   remove_non_vehicles: true # If false, all agents are included (vehicles, pedestrians, cyclists)
   lidar_obs: false # Note: setting this to true turns off the other observation types
   reward_type: "weighted_combination"
-  collision_weight: -0.75
-  off_road_weight: -0.75
-  goal_achieved_weight: 1.0
+  collision_weight: -3.0 # Raised collision penalty: fewer collisions while turning
+  off_road_weight: -1.5 # Lowered: allow moderate risk-taking
+  goal_achieved_weight: 1.0 # Make "reaching the goal" more attractive than "playing it safe"
+  # Shaping terms against the "move a little, then stop" local optimum (weighted_combination only)
+  time_penalty: 0.005 # Raised: stronger pressure to keep making progress
+  idle_speed_threshold: 0.5
+  idle_penalty: 0.02 # Lowered: avoid over-penalizing
+  # Progress reward: the closer to the goal, the larger the per-step reward (dense positive signal)
+  progress_reward_weight: 0.1 # Lowered: avoid shifting the reward scale too much
+  progress_reward_scale: 20.0
+  # Turn-speed penalty: penalize excessive speed while turning, to reduce collisions
+  turn_speed_penalty_weight: 0.05 # Turn-speed penalty weight
+  turn_speed_threshold: 8.0 # Speed threshold (the penalty kicks in above this speed)
   dynamics_model: "classic"
-  collision_behavior: "ignore" # Options: "remove", "stop", "ignore"
+  collision_behavior: "remove" # Options: "remove", "stop", "ignore"
   dist_to_goal_threshold: 2.0
   polyline_reduction_threshold: 0.1 # Point-sampling rate (0 keeps all nearest points, 1 is maximum sparsity); balance against kMaxAgentMapObservationsCount
   sampling_seed: 42 # If set, scene sampling is deterministic and reproducible; if None, scenes are drawn at random
   obs_radius: 50.0 # Visibility radius of the agents
   action_space_steer_disc: 13
   action_space_accel_disc: 7
   # Versatile Behavior Diffusion (VBD): enabling this slows down training
   use_vbd: false
   vbd_model_path: "gpudrive/integrations/vbd/weights/epoch=18.ckpt"
   init_steps: 11
   vbd_trajectory_weight: 0.1 # Weight of the distance-to-VBD-trajectories term in the reward
   vbd_in_obs: false

 wandb:
   entity: ""
   project: "gpudrive"
   group: "test"
   mode: "online" # Options: online, offline, disabled
   tags: ["ppo", "ff"]

 train:
   exp_id: PPO # Can be set dynamically in the script if needed
   seed: 42
   cpu_offload: false
   device: "cuda" # Set to cuda if available, else cpu
   bptt_horizon: 1
   compile: false
   compile_mode: "reduce-overhead"

   # # # Data sampling # # #
-  resample_scenes: false
-  resample_dataset_size: 10_000 # Number of unique scenes to sample from
-  resample_interval: 2_000_000
+  resample_scenes: false # Enable resampling to improve generalization
+  resample_dataset_size: 10_000
+  resample_interval: 10_000_000 # About 5 resamples per 50M training steps; balances stability and generalization
   sample_with_replacement: true
   shuffle_dataset: false

   # # # PPO # # #
   torch_deterministic: false
-  total_timesteps: 100_000_000
-  batch_size: 32_768
-  minibatch_size: 2048
-  learning_rate: 3e-4
-  anneal_lr: false
+  total_timesteps: 450_000_000
+  batch_size: 18432
+  minibatch_size: 3072
+  # Lower learning rate: the policy has already learned the task; now fine-tune it
+  learning_rate: 1e-4 # Lowered for smoother updates
+  anneal_lr: false # Keep annealing off!
   gamma: 0.99
   gae_lambda: 0.95
-  update_epochs: 4
+  # Tighter updates: avoid large policy shifts
+  update_epochs: 3 # Fewer update epochs
   norm_adv: true
-  clip_coef: 0.2
-  clip_vloss: false
+  clip_coef: 0.15 # Tighter clipping to limit the size of each policy update
+  # Steadier value function
+  clip_vloss: true
   vf_clip_coef: 0.2
-  ent_coef: 0.0001
+  # Less exploration: the policy already drives; now it needs to be steadier (less weaving)
+  ent_coef: 0.0003 # Lowered from 0.001 to 0.0003
   vf_coef: 0.3
   max_grad_norm: 0.5
-  target_kl: null
+  # KL early stopping: avoid one overly aggressive update (and the resulting oscillation) after resampling
+  target_kl: 0.02
   log_window: 1000

   # # # Network # # #
   network:
     input_dim: 64 # Input-feature embedding dimension
     hidden_dim: 128 # Latent dimension
     dropout: 0.01
     class_name: "NeuralNet"
     num_parameters: 0 # Total trainable parameters, filled at runtime
     # New: observation-fusion network config
     fusion_type: "attention" # Options: "simple", "attention", "adaptive"
     num_attention_heads: 4 # Number of attention heads (only used when fusion_type="attention")

   # # # Checkpointing # # #
-  checkpoint_interval: 100 # Save the policy every k iterations
+  checkpoint_interval: 200 # Save the policy every k iterations
   checkpoint_path: "./runs"

   # # # Rendering # # #
   render: false # Whether to render the environment (rendering slows down training)
   render_3d: true # Render the simulator state in 3D or 2D
   render_interval: 1 # Render every k iterations
-  render_k_scenarios: 10 # Number of scenarios to render
+  render_k_scenarios: 0 # Recommended 0 during training to avoid extra IO and nondeterminism
   render_format: "mp4" # Options: gif, mp4
   render_fps: 15 # Frames per second
   zoom_radius: 50

 vec:
   backend: "native" # Only native is currently supported
   num_workers: 1
   env_batch_size: 1
   zero_copy: false
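
To see how the new shaping weights interact, here is a worked per-step sketch for a single still-active agent under weighted_combination. This is plain arithmetic using the values above; the scenario (speed 10.0, 30 m from the goal, no collision) is invented for illustration:

import math

# Hypothetical active agent: speed 10.0, 30.0 from its goal, no collision
reward = 0.0
reward -= 0.005                          # time_penalty: constant pressure to make progress
reward += 0.1 * math.exp(-30.0 / 20.0)   # progress reward ~= 0.022, grows as the goal nears
reward -= 0.05 * max(0.0, 10.0 - 8.0)    # turn-speed penalty = 0.10 for exceeding the 8.0 threshold
print(round(reward, 4))                  # ~= -0.0827: driving fast while far from the goal is a net loss

The turn-speed penalty dominates here, which is the intended pressure: slow down, and the per-step sum turns positive as the goal gets close.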

gpudrive/env/config.py

Lines changed: 21 additions & 0 deletions
@@ -101,6 +101,27 @@ class EnvConfig:
     reward_type: str = "sparse_on_goal_achieved"
     # Alternatively, "weighted_combination", "distance_to_logs", "distance_to_vdb_trajs", "reward_conditioned"

+    # --- Extra dense terms for weighted_combination (avoid the "move a little, then stop" local optimum) ---
+    # Per-step time cost (only active when reward_type == "weighted_combination")
+    # Suggested starting range: 0.001-0.005; too large may cause recklessness and more collisions
+    time_penalty: float = 0.0
+
+    # Low-speed / idle penalty (only active when reward_type == "weighted_combination")
+    # When speed < idle_speed_threshold and the agent is neither finished nor terminated, subtract idle_penalty
+    idle_speed_threshold: float = 0.5
+    idle_penalty: float = 0.0
+
+    # Progress reward: the closer to the goal, the larger the per-step reward (dense positive signal)
+    # reward += progress_reward_weight * exp(-dist_to_goal / progress_reward_scale)
+    # Suggested progress_reward_weight: 0.1~0.3, progress_reward_scale: 15~30
+    progress_reward_weight: float = 0.0  # Off by default
+    progress_reward_scale: float = 20.0  # Distance decay factor
+
+    # Turn-speed penalty: penalize excessive speed while turning, to reduce collisions
+    # Applied when speed exceeds the threshold (agents should slow down in turns)
+    turn_speed_penalty_weight: float = 0.0  # Off by default; suggested: 0.05~0.15
+    turn_speed_threshold: float = 8.0  # Speed threshold (the penalty kicks in above this speed)
+
     condition_mode: str = "random"  # Options: "random", "fixed", "preset"

     # Define upper and lower bounds for reward components if using reward_conditioned
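
A minimal construction sketch for the new fields, mirroring the YAML values above. It assumes EnvConfig accepts these as keyword arguments, as the dataclass-style field defaults in this diff suggest; all other fields are left at their defaults:

from gpudrive.env.config import EnvConfig

config = EnvConfig(
    reward_type="weighted_combination",  # the shaping terms are ignored otherwise
    time_penalty=0.005,
    idle_speed_threshold=0.5,
    idle_penalty=0.02,
    progress_reward_weight=0.1,
    progress_reward_scale=20.0,
    turn_speed_penalty_weight=0.05,
    turn_speed_threshold=8.0,
)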

gpudrive/env/env_puffer.py

Lines changed: 7 additions & 0 deletions
@@ -239,6 +239,13 @@ def step(self, action):
         Args:
             action: A numpy array of actions for the controlled agents. Shape:
                 (num_worlds, max_cont_agents_per_env)
+        One environment step:
+            1. Apply the actions
+            2. Run the physics simulation
+            3. Compute rewards
+            4. Handle terminal states
+            5. Asynchronously reset finished environments
+            6. Return the new observations
         """

         # Set the action for the controlled agents
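
A hedged rollout sketch against this step() contract. The env construction is elided; the attribute names (num_worlds, max_cont_agents), the reset signature, the 5-tuple return, and the flat 13 * 7 = 91 action count (steer x accel grids from the config above) are assumptions based on the PufferLib convention, not confirmed by this diff:

import numpy as np

obs, _ = env.reset()  # assumed reset signature
for t in range(91):   # a Waymo scenario spans 91 timesteps at 10 Hz
    # One discrete action per controlled agent in every world,
    # shaped (num_worlds, max_cont_agents_per_env) as the docstring requires
    actions = np.random.randint(0, 13 * 7, size=(env.num_worlds, env.max_cont_agents))
    obs, rewards, terminals, truncations, infos = env.step(actions)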

gpudrive/env/env_torch.py

Lines changed: 57 additions & 0 deletions
@@ -491,6 +491,63 @@ def get_rewards(
                 + off_road_weight * off_road
             )

+            # Dense shaping: avoid the "standing still is optimal" local optimum.
+            # Only applies to agents that are neither done nor at the goal
+            # (no extra penalties after done/goal).
+            needs_shaping = (
+                self.config.time_penalty != 0.0
+                or self.config.idle_penalty != 0.0
+                or self.config.progress_reward_weight != 0.0
+            )
+            if needs_shaping or self.config.turn_speed_penalty_weight != 0.0:
+                done = (
+                    self.sim.done_tensor()
+                    .to_torch()
+                    .clone()
+                    .squeeze(dim=2)
+                    .to(weighted_rewards.device)
+                    .to(torch.float)
+                )
+                active = (1.0 - done) * (1.0 - goal_achieved)
+
+                if self.config.time_penalty != 0.0:
+                    weighted_rewards = weighted_rewards - self.config.time_penalty * active
+
+                # Fetch speed once (used by several penalty terms)
+                speed = None
+                if self.config.idle_penalty != 0.0 or self.config.turn_speed_penalty_weight != 0.0:
+                    speed = (
+                        self.sim.self_observation_tensor()
+                        .to_torch()
+                        .clone()[:, :, 0]
+                        .to(weighted_rewards.device)
+                        .to(torch.float)
+                    )
+
+                if self.config.idle_penalty != 0.0:
+                    is_idle = (speed < self.config.idle_speed_threshold).to(torch.float)
+                    weighted_rewards = weighted_rewards - self.config.idle_penalty * is_idle * active
+
+                # Progress reward: the closer to the goal, the larger the per-step reward (dense guidance)
+                if self.config.progress_reward_weight != 0.0:
+                    self_obs = self.sim.self_observation_tensor().to_torch().clone()
+                    rel_goal_x = self_obs[:, :, 4].to(weighted_rewards.device)
+                    rel_goal_y = self_obs[:, :, 5].to(weighted_rewards.device)
+                    dist_to_goal = torch.sqrt(rel_goal_x ** 2 + rel_goal_y ** 2 + 1e-6)
+                    progress_reward = self.config.progress_reward_weight * torch.exp(
+                        -dist_to_goal / self.config.progress_reward_scale
+                    )
+                    # Only reward agents that are still driving
+                    weighted_rewards = weighted_rewards + progress_reward * active
+
+                # Turn-speed penalty: discourage excessive speed to reduce collisions while turning
+                if self.config.turn_speed_penalty_weight != 0.0:
+                    # Penalize any speed above the threshold (encourages slowing down for turns)
+                    speed_penalty = torch.clamp(
+                        speed - self.config.turn_speed_threshold,
+                        min=0.0
+                    ) * self.config.turn_speed_penalty_weight
+                    weighted_rewards = weighted_rewards - speed_penalty * active
+
             return weighted_rewards

         elif self.config.reward_type == "reward_conditioned":
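
A standalone sketch of the turn-speed penalty on dummy tensors, to make the masking concrete. Shapes follow the code above ((num_worlds, max_agents)); the speed values and mask are invented:

import torch

turn_speed_penalty_weight, turn_speed_threshold = 0.05, 8.0
speed  = torch.tensor([[4.0, 9.0, 12.0]])  # (num_worlds, max_agents)
active = torch.tensor([[1.0, 1.0, 0.0]])   # third agent is done or at its goal

speed_penalty = torch.clamp(speed - turn_speed_threshold, min=0.0) * turn_speed_penalty_weight
print(speed_penalty * active)  # tensor([[0.0000, 0.0500, 0.0000]]) -> subtracted from the reward

Note that the penalty is linear in the speed excess and masked by active, so finished agents and agents below the threshold are untouched.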

gpudrive/integrations/puffer/ppo.py

Lines changed: 6 additions & 1 deletion
@@ -361,7 +361,12 @@ def train(data):

    with profile.train_misc:
        if config.anneal_lr:
-           frac = 1.0 - data.global_step / config.total_timesteps
+           # When training resumes, start the decay from the configured learning rate
+           lr_start_step = getattr(data, 'lr_start_step', 0)
+           lr_total_steps = config.total_timesteps - lr_start_step
+           steps_since_start = data.global_step - lr_start_step
+           frac = 1.0 - steps_since_start / lr_total_steps
+           frac = max(0.0, frac)  # Guard against going negative
            lrnow = float(frac) * float(config.learning_rate)
            data.optimizer.param_groups[0]["lr"] = lrnow
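
A worked sketch of the resumed schedule (pure arithmetic; the resume point lr_start_step = 100M is invented):

total_timesteps = 450_000_000
learning_rate = 1e-4
lr_start_step = 100_000_000   # hypothetical resume point
global_step = 275_000_000     # halfway through the remaining 350M steps

frac = 1.0 - (global_step - lr_start_step) / (total_timesteps - lr_start_step)
print(frac * learning_rate)   # 5e-05: decays linearly to 0 at step 450M

Without the offset, resuming at 100M steps would immediately drop the learning rate to 78% of its configured value instead of starting the decay fresh.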

gpudrive/visualize/core.py

Lines changed: 89 additions & 0 deletions
@@ -100,6 +100,7 @@ def plot_simulator_state(
         zoom_radius: int = 100,
         plot_log_replay_trajectory: bool = False,
         agent_positions: Optional[torch.Tensor] = None,
+        predicted_trajectories: Optional[torch.Tensor] = None,
         backward_goals: bool = False,
         policy_masks: Optional[Dict[int, Dict[str, torch.Tensor]]] = None,
     ):
@@ -432,6 +433,15 @@ def plot_simulator_state(
             except Exception as e:
                 print(f"Warning: Could not add colorbar: {e}")

+            # Plot predicted (future) trajectories
+            if predicted_trajectories is not None:
+                self._plot_predicted_trajectories(
+                    ax=ax,
+                    env_idx=env_idx,
+                    predicted_trajectories=predicted_trajectories,
+                    controlled_live=controlled_live,
+                )
+
             # Determine center point for zooming
             if center_agent_idx is not None:
                 center_x = global_agent_states.pos_x[
@@ -1574,3 +1584,82 @@
         ax.set_yticks([])

         return fig
+
+    def _plot_predicted_trajectories(
+        self,
+        ax: matplotlib.axes.Axes,
+        env_idx: int,
+        predicted_trajectories: torch.Tensor,
+        controlled_live: torch.Tensor,
+    ) -> None:
+        """
+        Plot predicted future trajectories.
+
+        Args:
+            ax: Matplotlib axis
+            env_idx: Environment index
+            predicted_trajectories: [num_worlds, max_agents, horizon, 2] predicted trajectories
+            controlled_live: [max_agents] mask of controlled agents that are still alive
+        """
+        if predicted_trajectories is None:
+            return
+
+        # Styling for predictions (dashed lines mark predicted motion)
+        pred_color = "#FF6B6B"  # Red marks predictions
+        pred_alpha = 0.6
+        pred_linewidth = 2.0
+
+        for agent_idx in range(predicted_trajectories.shape[1]):
+            if controlled_live[agent_idx]:
+                trajectory = predicted_trajectories[env_idx, agent_idx, :, :]  # [horizon, 2]
+
+                # Filter out invalid points
+                valid_mask = (
+                    (trajectory[:, 0] != 0)
+                    & (trajectory[:, 1] != 0)
+                    & (torch.abs(trajectory[:, 0]) < OUT_OF_BOUNDS)
+                    & (torch.abs(trajectory[:, 1]) < OUT_OF_BOUNDS)
+                )
+                valid_trajectory = trajectory[valid_mask]
+
+                if len(valid_trajectory) > 1:
+                    points = valid_trajectory.cpu().numpy()
+
+                    if self.render_3d:
+                        # 3D plot
+                        trajectory_height = 0.1  # Slightly raised to set predictions apart
+                        ax.plot(
+                            points[:, 0],
+                            points[:, 1],
+                            trajectory_height,
+                            color=pred_color,
+                            linestyle="--",
+                            linewidth=pred_linewidth,
+                            alpha=pred_alpha,
+                            zorder=2,
+                            label="Predicted" if agent_idx == 0 else "",
+                        )
+                    else:
+                        # 2D plot
+                        ax.plot(
+                            points[:, 0],
+                            points[:, 1],
+                            color=pred_color,
+                            linestyle="--",
+                            linewidth=pred_linewidth,
+                            alpha=pred_alpha,
+                            zorder=2,
+                            label="Predicted" if agent_idx == 0 else "",
+                        )
+
+                    # Mark the endpoint of the predicted trajectory
+                    if len(points) > 0:
+                        ax.scatter(
+                            points[-1, 0],
+                            points[-1, 1],
+                            color=pred_color,
+                            marker="x",
+                            s=50,
+                            alpha=pred_alpha,
+                            zorder=3,
+                        )
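
A hedged call sketch for the new argument. Only parameters visible in this diff are used; the visualizer construction and the other required arguments of plot_simulator_state are elided, and the trajectory tensor is random placeholder data shaped per the docstring:

import torch

# [num_worlds, max_agents, horizon, 2]
predicted = torch.randn(18, 64, 10, 2)

fig = visualizer.plot_simulator_state(
    # ... the usual scene/state arguments go here ...
    zoom_radius=50,
    predicted_trajectories=predicted,  # drawn as dashed red lines with an "x" at each endpoint
)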
