
Commit d1bdc97

Retrain a new model; this change adds a turn-speed penalty
1 parent d52dbce commit d1bdc97

6 files changed

Lines changed: 242 additions & 48 deletions


baselines/ppo/config/ppo_base_puffer.yaml

Lines changed: 62 additions & 47 deletions
@@ -4,106 +4,121 @@ eval_model_path: null
 baseline: false
 data_dir: data/processed/training
 continue_training: true
-model_cpt: compatible_huggingface_model.pt
-environment: # Overrides default environment configs (see pygpudrive/env/config.py)
+model_cpt: /home/wbk/gpudrive/runs/PPO__C__S_64__01_19_09_41_46_795_提高达成率/model_PPO__C__S_64__01_19_09_41_46_795_025817.pt
+environment: # Overrides default environment configs (see gpudrive/env/config.py)
   name: "gpudrive"
-  num_worlds: 25 # Number of parallel environments
-  k_unique_scenes: 25 # Number of scenes to sample
-  max_controlled_agents: 64 # Maximum number of controlled agents; keep consistent with kMaxAgentCount in src/consts.hpp
+  num_worlds: 18 # Number of parallel environments (lowered further to cut the GPU-memory peak during resampling)
+  k_unique_scenes: 72 # Number of scenes to sample (lightens each sampling pass)
+  max_controlled_agents: 64 # Maximum number of controlled agents (must match the environment mask dimension)
   ego_state: true
   road_map_obs: true
   partner_obs: true
   norm_obs: true
   remove_non_vehicles: true # If false, all agents are included (vehicles, pedestrians, cyclists)
   lidar_obs: false # Note: setting this to true turns off the other observation types
   reward_type: "weighted_combination"
-  collision_weight: -0.75
-  off_road_weight: -0.75
-  goal_achieved_weight: 1.0
+  collision_weight: -3.0 # Raised collision penalty: fewer collisions while turning
+  off_road_weight: -1.5 # Lowered: allow moderate risk-taking
+  goal_achieved_weight: 1.0 # Make "reaching the goal" more attractive than "playing it safe"
+  # Shaping terms against the "move a little, then stop" local optimum (weighted_combination only)
+  time_penalty: 0.005 # Raised: stronger pressure to keep making progress
+  idle_speed_threshold: 0.5
+  idle_penalty: 0.02 # Lowered: avoid over-penalizing
+  # Progress reward: the closer to the goal, the larger the per-step reward (dense positive signal)
+  progress_reward_weight: 0.1 # Lowered: avoid shifting the reward scale too much
+  progress_reward_scale: 20.0
+  # Turn-speed penalty: penalize excessive speed while turning, to reduce collisions
+  turn_speed_penalty_weight: 0.05 # Turn-speed penalty weight
+  turn_speed_threshold: 8.0 # Speed threshold (the penalty kicks in above this speed)
   dynamics_model: "classic"
-  collision_behavior: "ignore" # Options: "remove", "stop", "ignore"
+  collision_behavior: "remove" # Options: "remove", "stop", "ignore"
   dist_to_goal_threshold: 2.0
   polyline_reduction_threshold: 0.1 # Point-sampling rate (0 keeps all nearest points, 1 is maximum sparsity); balance against kMaxAgentMapObservationsCount
   sampling_seed: 42 # If set, scene sampling is deterministic and reproducible; if None, scenes are drawn at random
   obs_radius: 50.0 # Visibility radius of the agents
   action_space_steer_disc: 13
   action_space_accel_disc: 7
   # Versatile Behavior Diffusion (VBD): enabling this slows down training
   use_vbd: false
   vbd_model_path: "gpudrive/integrations/vbd/weights/epoch=18.ckpt"
   init_steps: 11
   vbd_trajectory_weight: 0.1 # Weight of the distance-to-VBD-trajectories term in the reward
   vbd_in_obs: false

 wandb:
   entity: ""
   project: "gpudrive"
   group: "test"
   mode: "online" # Options: online, offline, disabled
   tags: ["ppo", "ff"]

 train:
   exp_id: PPO # Can be set dynamically in the script if needed
   seed: 42
   cpu_offload: false
   device: "cuda" # Set to cuda if available, else cpu
   bptt_horizon: 1
   compile: false
   compile_mode: "reduce-overhead"

   # # # Data sampling # # #
-  resample_scenes: false
-  resample_dataset_size: 10_000 # Number of unique scenes to sample from
-  resample_interval: 2_000_000
+  resample_scenes: false # Enable resampling to improve generalization
+  resample_dataset_size: 10_000
+  resample_interval: 10_000_000 # About 5 resamples per 50M training steps; balances stability and generalization
   sample_with_replacement: true
   shuffle_dataset: false

   # # # PPO # # #
   torch_deterministic: false
-  total_timesteps: 100_000_000
-  batch_size: 32_768
-  minibatch_size: 2048
-  learning_rate: 3e-4
-  anneal_lr: false
+  total_timesteps: 450_000_000
+  batch_size: 18432
+  minibatch_size: 3072
+  # Lower learning rate: the policy has already learned the task; now fine-tune it
+  learning_rate: 1e-4 # Lowered for smoother updates
+  anneal_lr: false # Keep annealing off!
   gamma: 0.99
   gae_lambda: 0.95
-  update_epochs: 4
+  # Tighter updates: avoid large policy shifts
+  update_epochs: 3 # Fewer update epochs
   norm_adv: true
-  clip_coef: 0.2
-  clip_vloss: false
+  clip_coef: 0.15 # Tighter clipping to limit the size of each policy update
+  # Steadier value function
+  clip_vloss: true
   vf_clip_coef: 0.2
-  ent_coef: 0.0001
+  # Less exploration: the policy already drives; now it needs to be steadier (less weaving)
+  ent_coef: 0.0003 # Lowered from 0.001 to 0.0003
   vf_coef: 0.3
   max_grad_norm: 0.5
-  target_kl: null
+  # KL early stopping: avoid one overly aggressive update (and the resulting oscillation) after resampling
+  target_kl: 0.02
   log_window: 1000

   # # # Network # # #
   network:
     input_dim: 64 # Input-feature embedding dimension
     hidden_dim: 128 # Latent dimension
     dropout: 0.01
     class_name: "NeuralNet"
     num_parameters: 0 # Total trainable parameters, filled at runtime
     # New: observation-fusion network config
     fusion_type: "attention" # Options: "simple", "attention", "adaptive"
     num_attention_heads: 4 # Number of attention heads (only used when fusion_type="attention")

   # # # Checkpointing # # #
-  checkpoint_interval: 100 # Save the policy every k iterations
+  checkpoint_interval: 200 # Save the policy every k iterations
   checkpoint_path: "./runs"

   # # # Rendering # # #
   render: false # Whether to render the environment (rendering slows down training)
   render_3d: true # Render the simulator state in 3D or 2D
   render_interval: 1 # Render every k iterations
-  render_k_scenarios: 10 # Number of scenarios to render
+  render_k_scenarios: 0 # Recommended 0 during training to avoid extra IO and nondeterminism
   render_format: "mp4" # Options: gif, mp4
   render_fps: 15 # Frames per second
   zoom_radius: 50

 vec:
   backend: "native" # Only native is currently supported
   num_workers: 1
   env_batch_size: 1
   zero_copy: false
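
To see how the new shaping weights interact, here is a worked per-step sketch for a single still-active agent under weighted_combination. This is plain arithmetic using the values above; the scenario (speed 10.0, 30 m from the goal, no collision) is invented for illustration:

import math

# Hypothetical active agent: speed 10.0, 30.0 from its goal, no collision
reward = 0.0
reward -= 0.005                          # time_penalty: constant pressure to make progress
reward += 0.1 * math.exp(-30.0 / 20.0)   # progress reward ~= 0.022, grows as the goal nears
reward -= 0.05 * max(0.0, 10.0 - 8.0)    # turn-speed penalty = 0.10 for exceeding the 8.0 threshold
print(round(reward, 4))                  # ~= -0.0827: driving fast while far from the goal is a net loss

The turn-speed penalty dominates here, which is the intended pressure: slow down, and the per-step sum turns positive as the goal gets close.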

gpudrive/env/config.py

Lines changed: 21 additions & 0 deletions
@@ -101,6 +101,27 @@ class EnvConfig:
     reward_type: str = "sparse_on_goal_achieved"
     # Alternatively, "weighted_combination", "distance_to_logs", "distance_to_vdb_trajs", "reward_conditioned"

+    # --- Extra dense terms for weighted_combination (avoid the "move a little, then stop" local optimum) ---
+    # Per-step time cost (only active when reward_type == "weighted_combination")
+    # Suggested starting range: 0.001-0.005; too large may cause recklessness and more collisions
+    time_penalty: float = 0.0
+
+    # Low-speed / idle penalty (only active when reward_type == "weighted_combination")
+    # When speed < idle_speed_threshold and the agent is neither finished nor terminated, subtract idle_penalty
+    idle_speed_threshold: float = 0.5
+    idle_penalty: float = 0.0
+
+    # Progress reward: the closer to the goal, the larger the per-step reward (dense positive signal)
+    # reward += progress_reward_weight * exp(-dist_to_goal / progress_reward_scale)
+    # Suggested progress_reward_weight: 0.1~0.3, progress_reward_scale: 15~30
+    progress_reward_weight: float = 0.0  # Off by default
+    progress_reward_scale: float = 20.0  # Distance decay factor
+
+    # Turn-speed penalty: penalize excessive speed while turning, to reduce collisions
+    # Applied when speed exceeds the threshold (agents should slow down in turns)
+    turn_speed_penalty_weight: float = 0.0  # Off by default; suggested: 0.05~0.15
+    turn_speed_threshold: float = 8.0  # Speed threshold (the penalty kicks in above this speed)
+
     condition_mode: str = "random"  # Options: "random", "fixed", "preset"

     # Define upper and lower bounds for reward components if using reward_conditioned
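
A minimal construction sketch for the new fields, mirroring the YAML values above. It assumes EnvConfig accepts these as keyword arguments, as the dataclass-style field defaults in this diff suggest; all other fields are left at their defaults:

from gpudrive.env.config import EnvConfig

config = EnvConfig(
    reward_type="weighted_combination",  # the shaping terms are ignored otherwise
    time_penalty=0.005,
    idle_speed_threshold=0.5,
    idle_penalty=0.02,
    progress_reward_weight=0.1,
    progress_reward_scale=20.0,
    turn_speed_penalty_weight=0.05,
    turn_speed_threshold=8.0,
)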

gpudrive/env/env_puffer.py

Lines changed: 7 additions & 0 deletions
@@ -239,6 +239,13 @@ def step(self, action):
         Args:
             action: A numpy array of actions for the controlled agents. Shape:
                 (num_worlds, max_cont_agents_per_env)
+        One environment step:
+            1. Apply the actions
+            2. Run the physics simulation
+            3. Compute rewards
+            4. Handle terminal states
+            5. Asynchronously reset finished environments
+            6. Return the new observations
         """

         # Set the action for the controlled agents
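
A hedged rollout sketch against this step() contract. The env construction is elided; the attribute names (num_worlds, max_cont_agents), the reset signature, the 5-tuple return, and the flat 13 * 7 = 91 action count (steer x accel grids from the config above) are assumptions based on the PufferLib convention, not confirmed by this diff:

import numpy as np

obs, _ = env.reset()  # assumed reset signature
for t in range(91):   # a Waymo scenario spans 91 timesteps at 10 Hz
    # One discrete action per controlled agent in every world,
    # shaped (num_worlds, max_cont_agents_per_env) as the docstring requires
    actions = np.random.randint(0, 13 * 7, size=(env.num_worlds, env.max_cont_agents))
    obs, rewards, terminals, truncations, infos = env.step(actions)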

gpudrive/env/env_torch.py

Lines changed: 57 additions & 0 deletions
@@ -491,6 +491,63 @@ def get_rewards(
                 + off_road_weight * off_road
             )

+            # Dense shaping: avoid the "standing still is optimal" local optimum.
+            # Only applies to agents that are neither done nor at the goal
+            # (no extra penalties after done/goal).
+            needs_shaping = (
+                self.config.time_penalty != 0.0
+                or self.config.idle_penalty != 0.0
+                or self.config.progress_reward_weight != 0.0
+            )
+            if needs_shaping or self.config.turn_speed_penalty_weight != 0.0:
+                done = (
+                    self.sim.done_tensor()
+                    .to_torch()
+                    .clone()
+                    .squeeze(dim=2)
+                    .to(weighted_rewards.device)
+                    .to(torch.float)
+                )
+                active = (1.0 - done) * (1.0 - goal_achieved)
+
+                if self.config.time_penalty != 0.0:
+                    weighted_rewards = weighted_rewards - self.config.time_penalty * active
+
+                # Fetch speed once (used by several penalty terms)
+                speed = None
+                if self.config.idle_penalty != 0.0 or self.config.turn_speed_penalty_weight != 0.0:
+                    speed = (
+                        self.sim.self_observation_tensor()
+                        .to_torch()
+                        .clone()[:, :, 0]
+                        .to(weighted_rewards.device)
+                        .to(torch.float)
+                    )
+
+                if self.config.idle_penalty != 0.0:
+                    is_idle = (speed < self.config.idle_speed_threshold).to(torch.float)
+                    weighted_rewards = weighted_rewards - self.config.idle_penalty * is_idle * active
+
+                # Progress reward: the closer to the goal, the larger the per-step reward (dense guidance)
+                if self.config.progress_reward_weight != 0.0:
+                    self_obs = self.sim.self_observation_tensor().to_torch().clone()
+                    rel_goal_x = self_obs[:, :, 4].to(weighted_rewards.device)
+                    rel_goal_y = self_obs[:, :, 5].to(weighted_rewards.device)
+                    dist_to_goal = torch.sqrt(rel_goal_x ** 2 + rel_goal_y ** 2 + 1e-6)
+                    progress_reward = self.config.progress_reward_weight * torch.exp(
+                        -dist_to_goal / self.config.progress_reward_scale
+                    )
+                    # Only reward agents that are still driving
+                    weighted_rewards = weighted_rewards + progress_reward * active
+
+                # Turn-speed penalty: discourage excessive speed to reduce collisions while turning
+                if self.config.turn_speed_penalty_weight != 0.0:
+                    # Penalize any speed above the threshold (encourages slowing down for turns)
+                    speed_penalty = torch.clamp(
+                        speed - self.config.turn_speed_threshold,
+                        min=0.0
+                    ) * self.config.turn_speed_penalty_weight
+                    weighted_rewards = weighted_rewards - speed_penalty * active
+
             return weighted_rewards

         elif self.config.reward_type == "reward_conditioned":
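
A standalone sketch of the turn-speed penalty on dummy tensors, to make the masking concrete. Shapes follow the code above ((num_worlds, max_agents)); the speed values and mask are invented:

import torch

turn_speed_penalty_weight, turn_speed_threshold = 0.05, 8.0
speed  = torch.tensor([[4.0, 9.0, 12.0]])  # (num_worlds, max_agents)
active = torch.tensor([[1.0, 1.0, 0.0]])   # third agent is done or at its goal

speed_penalty = torch.clamp(speed - turn_speed_threshold, min=0.0) * turn_speed_penalty_weight
print(speed_penalty * active)  # tensor([[0.0000, 0.0500, 0.0000]]) -> subtracted from the reward

Note that the penalty is linear in the speed excess and masked by active, so finished agents and agents below the threshold are untouched.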

gpudrive/integrations/puffer/ppo.py

Lines changed: 6 additions & 1 deletion
@@ -361,7 +361,12 @@ def train(data):

    with profile.train_misc:
        if config.anneal_lr:
-           frac = 1.0 - data.global_step / config.total_timesteps
+           # When training resumes, start the decay from the configured learning rate
+           lr_start_step = getattr(data, 'lr_start_step', 0)
+           lr_total_steps = config.total_timesteps - lr_start_step
+           steps_since_start = data.global_step - lr_start_step
+           frac = 1.0 - steps_since_start / lr_total_steps
+           frac = max(0.0, frac)  # Guard against going negative
            lrnow = float(frac) * float(config.learning_rate)
            data.optimizer.param_groups[0]["lr"] = lrnow
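
A worked sketch of the resumed schedule (pure arithmetic; the resume point lr_start_step = 100M is invented):

total_timesteps = 450_000_000
learning_rate = 1e-4
lr_start_step = 100_000_000   # hypothetical resume point
global_step = 275_000_000     # halfway through the remaining 350M steps

frac = 1.0 - (global_step - lr_start_step) / (total_timesteps - lr_start_step)
print(frac * learning_rate)   # 5e-05: decays linearly to 0 at step 450M

Without the offset, resuming at 100M steps would immediately drop the learning rate to 78% of its configured value instead of starting the decay fresh.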

gpudrive/visualize/core.py

Lines changed: 89 additions & 0 deletions
@@ -100,6 +100,7 @@ def plot_simulator_state(
         zoom_radius: int = 100,
         plot_log_replay_trajectory: bool = False,
         agent_positions: Optional[torch.Tensor] = None,
+        predicted_trajectories: Optional[torch.Tensor] = None,
         backward_goals: bool = False,
         policy_masks: Optional[Dict[int, Dict[str, torch.Tensor]]] = None,
     ):
@@ -432,6 +433,15 @@ def plot_simulator_state(
             except Exception as e:
                 print(f"Warning: Could not add colorbar: {e}")

+            # Plot predicted (future) trajectories
+            if predicted_trajectories is not None:
+                self._plot_predicted_trajectories(
+                    ax=ax,
+                    env_idx=env_idx,
+                    predicted_trajectories=predicted_trajectories,
+                    controlled_live=controlled_live,
+                )
+
             # Determine center point for zooming
             if center_agent_idx is not None:
                 center_x = global_agent_states.pos_x[
@@ -1574,3 +1584,82 @@
         ax.set_yticks([])

         return fig
+
+    def _plot_predicted_trajectories(
+        self,
+        ax: matplotlib.axes.Axes,
+        env_idx: int,
+        predicted_trajectories: torch.Tensor,
+        controlled_live: torch.Tensor,
+    ) -> None:
+        """
+        Plot predicted future trajectories.
+
+        Args:
+            ax: Matplotlib axis
+            env_idx: Environment index
+            predicted_trajectories: [num_worlds, max_agents, horizon, 2] predicted trajectories
+            controlled_live: [max_agents] mask of controlled agents that are still alive
+        """
+        if predicted_trajectories is None:
+            return
+
+        # Styling for predictions (dashed lines mark predicted motion)
+        pred_color = "#FF6B6B"  # Red marks predictions
+        pred_alpha = 0.6
+        pred_linewidth = 2.0
+
+        for agent_idx in range(predicted_trajectories.shape[1]):
+            if controlled_live[agent_idx]:
+                trajectory = predicted_trajectories[env_idx, agent_idx, :, :]  # [horizon, 2]
+
+                # Filter out invalid points
+                valid_mask = (
+                    (trajectory[:, 0] != 0)
+                    & (trajectory[:, 1] != 0)
+                    & (torch.abs(trajectory[:, 0]) < OUT_OF_BOUNDS)
+                    & (torch.abs(trajectory[:, 1]) < OUT_OF_BOUNDS)
+                )
+                valid_trajectory = trajectory[valid_mask]
+
+                if len(valid_trajectory) > 1:
+                    points = valid_trajectory.cpu().numpy()
+
+                    if self.render_3d:
+                        # 3D plot
+                        trajectory_height = 0.1  # Slightly raised to set predictions apart
+                        ax.plot(
+                            points[:, 0],
+                            points[:, 1],
+                            trajectory_height,
+                            color=pred_color,
+                            linestyle="--",
+                            linewidth=pred_linewidth,
+                            alpha=pred_alpha,
+                            zorder=2,
+                            label="Predicted" if agent_idx == 0 else "",
+                        )
+                    else:
+                        # 2D plot
+                        ax.plot(
+                            points[:, 0],
+                            points[:, 1],
+                            color=pred_color,
+                            linestyle="--",
+                            linewidth=pred_linewidth,
+                            alpha=pred_alpha,
+                            zorder=2,
+                            label="Predicted" if agent_idx == 0 else "",
+                        )
+
+                    # Mark the endpoint of the predicted trajectory
+                    if len(points) > 0:
+                        ax.scatter(
+                            points[-1, 0],
+                            points[-1, 1],
+                            color=pred_color,
+                            marker="x",
+                            s=50,
+                            alpha=pred_alpha,
+                            zorder=3,
+                        )
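
A hedged call sketch for the new argument. Only parameters visible in this diff are used; the visualizer construction and the other required arguments of plot_simulator_state are elided, and the trajectory tensor is random placeholder data shaped per the docstring:

import torch

# [num_worlds, max_agents, horizon, 2]
predicted = torch.randn(18, 64, 10, 2)

fig = visualizer.plot_simulator_state(
    # ... the usual scene/state arguments go here ...
    zoom_radius=50,
    predicted_trajectories=predicted,  # drawn as dashed red lines with an "x" at each endpoint
)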
