Skip to content

How to make GDPO training faster #2153

@firmmm

Description

@firmmm
grpo:
  # Rollout sizing: 32 prompts x 16 generations = 512 sequences per step.
  num_prompts_per_step: 32
  num_generations_per_prompt: 16
  max_rollout_turns: 1
  max_num_epochs: 1
  # NOTE(review): with max_num_epochs: 1 this step cap is unlikely to bind;
  # training ends when the single epoch completes.
  max_num_steps: 1000000

  # ── FIX 1: set top-level baseline flags to false when using GDPO ────────
  # adv_estimator (below) owns normalization and baseline for GDPO.
  # Keeping these true here would conflict with the GDPO path.
  normalize_rewards: false
  use_leave_one_out_baseline: false

  # Validation every 10 steps; skipped at start and end of the run.
  val_period: 10
  val_at_start: false
  val_at_end: false
  overlong_filtering: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42
  # Dynamic sampling off: every generated batch is used as-is.
  use_dynamic_sampling: false
  dynamic_sampling_max_gen_batches: 10
  batch_multiplier: 1

  # Overlong-response shaping is disabled; remaining keys are inert.
  reward_shaping:
    enabled: false
    overlong_buffer_length: 128
    overlong_buffer_penalty: 1
    max_response_length: ${policy.max_total_sequence_length}
    stop_properly_penalty_coef: null

  # ── FIX 2: GDPO advantage estimator ─────────────────────────────────────
  # normalize_rewards here applies per reward dimension independently.
  # use_leave_one_out_baseline: false is correct for GDPO.
  # minus_baseline: true subtracts per-prompt mean — keep for stability.
  adv_estimator:
    name: "gdpo"
    normalize_rewards: true
    use_leave_one_out_baseline: false
    minus_baseline: true

  # Enabled but an identity map as configured ([0,1] -> [0,1]) — a no-op.
  reward_scaling:
    enabled: true
    source_min: 0.0
    source_max: 1.0
    target_min: 0.0
    target_max: 1.0

  seq_logprob_error_threshold: null

  # Synchronous rollouts (async disabled); trajectories are generated and
  # trained on within the same step.
  async_grpo:
    enabled: false
    max_trajectory_age_steps: 1
    in_flight_weight_updates: false
    recompute_kv_cache_after_weight_updates: false

  # async_gdpo:
  #   enabled: true
  #   max_trajectory_age_steps: 1
  #   in_flight_weight_updates: false
  #   recompute_kv_cache_after_weight_updates: true
# ── Loss Function ─────────────────────────────────────────────────────────
loss_fn:
  # KL penalty vs. the frozen reference policy, k3 estimator; inputs and
  # outputs of the KL term are clamped to bound outlier log-ratios.
  reference_policy_kl_penalty: 0.01
  reference_policy_kl_type: "k3"
  kl_input_clamp_value: 20.0
  kl_output_clamp_value: 10.0
  # Clipping epsilons — presumably the ratio is clipped to
  # [1 - 0.2, 1 + 0.2]; equal min/max is the symmetric PPO-style setting.
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  truncated_importance_sampling_ratio: null
  truncated_importance_sampling_ratio_min: null
  truncated_importance_sampling_type: tis
  # Loss averaged per token rather than per sequence.
  sequence_level_importance_ratios: false
  token_level_loss: true
  force_on_policy_ratio: false
  use_kl_in_reward: false

# ── Checkpointing ─────────────────────────────────────────────────────────
checkpointing:
  enabled: true
  checkpoint_dir: "/real-all-data/ckpt/gdpo"
  # Keep the 3 best checkpoints ranked by validation accuracy (higher wins).
  metric_name: "val:accuracy"
  higher_is_better: true
  keep_top_k: 3
  # Save every 100 steps; no hard deadline for a forced save.
  save_period: 100
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false
  save_optimizer: true

# ── Policy Model ──────────────────────────────────────────────────────────
policy:
  # Local merged checkpoint used as the starting policy; tokenizer is
  # loaded from the same path.
  model_name: "/home/local/training/ckpts/run2_3/merged_model"
  tokenizer:
    name: ${policy.model_name}
    chat_template_kwargs: null
  hf_config_overrides: {}
  # 128 global / 16 micro => gradient accumulation over 8 micro-batches
  # per data-parallel rank (before DP division).
  train_global_batch_size: 128
  train_micro_batch_size: 16
  generation_batch_size: 32
  logprob_batch_size: ${policy.train_micro_batch_size}
  # NOTE(review): env.thai_reward_env.config.max_response_length is 2048,
  # which exceeds this 1024 budget — confirm the mismatch is intentional.
  max_total_sequence_length: 1024
  precision: "bfloat16"
  logprob_chunk_size: null
  offload_optimizer_for_logprob: false

  # Active training backend (megatron_cfg below is disabled).
  dtensor_cfg:
    _v2: true
    enabled: true
    cpu_offload: false
    # NOTE(review): sequence_parallel with tensor_parallel_size: 1 may be a
    # no-op — confirm against the framework's dtensor implementation.
    sequence_parallel: true
    # Activation checkpointing trades extra recompute for lower memory.
    activation_checkpointing: true
    tensor_parallel_size: 1
    context_parallel_size: 1
    custom_parallel_plan: null
    # LoRA (rank 8, alpha 32) applied to all linear layers via
    # match_all_linear; target/exclude lists are therefore empty.
    lora_cfg:
      enabled: true
      target_modules: []
      exclude_modules: []
      match_all_linear: true
      dim: 8
      alpha: 32
      dropout: 0.0
      dropout_position: "post"
      lora_A_init: "xavier"
      use_triton: true

  # Megatron backend — entire section is inactive (enabled: false); the
  # dtensor path above is used instead. Kept for easy switching.
  megatron_cfg:
    enabled: false
    empty_unused_memory_level: 1
    activation_checkpointing: false
    converter_type: "Qwen3ForCausalLM"
    tensor_model_parallel_size: 1
    expert_tensor_parallel_size: 1
    expert_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    sequence_parallel: false
    freeze_moe_router: true
    moe_router_dtype: "fp64"
    moe_router_load_balancing_type: "none"
    moe_router_bias_update_rate: 0.0
    moe_permute_fusion: false
    apply_rope_fusion: true
    bias_activation_fusion: true
    defer_fp32_logits: false
    moe_per_layer_logging: false
    moe_enable_deepep: false
    moe_token_dispatcher_type: "allgather"
    moe_shared_expert_overlap: false
    optimizer:
      optimizer: "adam"
      lr: 5.0e-6
      min_lr: 5.0e-7
      weight_decay: 0.01
      bf16: true
      fp16: false
      params_dtype: "float32"
      adam_beta1: 0.9
      adam_beta2: 0.999
      # FIX: written with a decimal point so YAML 1.1 parsers (PyYAML)
      # resolve a float; bare "1e-8" matches the 1.1 *string* rules and
      # would be loaded as the string "1e-8". Also matches the 5.0e-6 /
      # 5.0e-7 style used elsewhere in this file.
      adam_eps: 1.0e-8
      sgd_momentum: 0.9
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true
      clip_grad: ${policy.max_grad_norm}
      optimizer_cpu_offload: false
      optimizer_offload_fraction: 0.0
    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: 1000
      lr_warmup_iters: 13
      lr_warmup_init: 5.0e-7
    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: true
      use_custom_fsdp: false
      data_parallel_sharding_strategy: "optim_grads_params"
    fp8_cfg:
      enabled: false
      fp8: "e4m3"
      fp8_recipe: "blockwise"
      fp8_param: false
    env_vars: null

  # Token-budget batching (disabled): micro-batch budgets derived from
  # max_total_sequence_length x micro-batch size.
  dynamic_batching:
    enabled: false
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64

  # Sequence packing is enabled instead — multiple short samples share one
  # packed sequence, reducing padding waste during training/logprob passes.
  sequence_packing:
    enabled: true
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  # Resolves to 1 here (tensor_parallel_size: 1) — no padding constraint.
  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  # AdamW for the active dtensor training path.
  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 5.0e-6
      weight_decay: 0.01
      betas: [0.9, 0.999]
      # FIX: written with a decimal point so YAML 1.1 parsers (PyYAML)
      # resolve a float; bare "1e-8" matches the 1.1 *string* rules and
      # torch.optim.AdamW would receive the string "1e-8". Also matches
      # the 5.0e-6 float style used elsewhere in this file.
      eps: 1.0e-8
      foreach: false
      fused: false

  # Linear warmup for 50 steps (10% -> 100% of lr), then constant lr.
  # NOTE(review): the trailing "milestones: [50]" entry is presumably the
  # SequentialLR switch point between the two schedulers above — confirm
  # against the framework's scheduler-list convention.
  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 50
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [50]

  # Rollout generation via vLLM; sampling is untempered (T=1.0, top_p=1.0).
  generation:
    backend: "vllm"
    # Up to the full sequence budget (1024 tokens) per response.
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: true
      precision: ${policy.precision}
      kv_cache_dtype: "auto"
      tensor_parallel_size: 1
      pipeline_parallel_size: 1
      expert_parallel_size: 1
      gpu_memory_utilization: 0.9
      max_model_len: ${policy.max_total_sequence_length}
      enforce_eager: false
      use_deep_gemm: false
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      enable_vllm_metrics_logger: true
      vllm_metrics_logger_interval: 0.5
    vllm_kwargs: {}
    # Non-colocated: 4 GPUs on the node are reserved for vLLM rollout.
    # NOTE(review): presumably the remaining GPUs serve training — confirm
    # the cluster resource split.
    colocated:
      enabled: false
      resources:
        gpus_per_node: 4
        num_nodes: null

# ── Data ──────────────────────────────────────────────────────────────────
data:
  use_multiple_dataloader: false
  max_input_seq_length: ${policy.max_total_sequence_length}

  # ── FIX 3: shuffle should be true for training ─────────────────────────
  shuffle: true
  num_workers: 10

  train:
    dataset_name: ResponseDataset
    data_path: /real-all-data/grpo/merge/merge.jsonl
    # ── FIX 4: input_key must match your JSONL field that holds messages ──
    # If your JSONL has {"messages": [...]} use "messages".
    # If it has {"input": "..."} use "input". Check your actual data file.
    input_key: "messages"
    # 5% of the training file is held out — needed since validation: null.
    split_validation_size: 0.05
    seed: 42

  validation: null

  default:
    dataset_name: ResponseDataset
    # ── FIX 5: default input_key aligned with train ───────────────────────
    input_key: "messages"
    output_key: output
    prompt_file: null
    system_prompt_file: null
    processor: "thai_processor"
    # ── FIX 6: env_name must point to YOUR custom env, not "reward_model" ─
    # "reward_model" = Skywork only, returns shape (B,) — GDPO needs (B,2).
    # "thai_reward_env" = your ThaiRewardEnvironmentActor, returns (B,2).
    env_name: "thai_reward_env"

# ── Environment ───────────────────────────────────────────────────────────
env:
  # ── FIX 7: thai_reward_env replaces reward_model as the training env ───
  # Make sure thai_reward_env.py is on PYTHONPATH before launching.
  # The register_env() call inside thai_reward_env.py links this name
  # to ThaiRewardEnvironmentActor.
  thai_reward_env:
    # NOTE(review): this env configures two scoring models below, so it is
    # NOT CPU-only — the earlier "no GPUs needed" note was inaccurate.
    num_workers: 4
    config:
      required_format: "boxed"
      stop_strings: ["</s>", "<|end|>"]
      # NOTE(review): 2048 exceeds policy.max_total_sequence_length (1024)
      # — confirm which limit should govern response length.
      max_response_length: 2048
      correctness_model_name: "typhoon-ai/typhoon2.5-qwen3-4b"
      format_model_name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
    resources:
      gpus_per_node: 1     # one GPU for the scoring models configured above
      num_nodes: 1

  # ── Skywork reward model (optional — for eval / logging only) ──────────
  # If you want Skywork scores DURING training, move the logic inside
  # thai_reward_env.py and add it as rewards[:, 2] (3rd reward dim).
  # Keeping it here does NOT affect training — it is not referenced by
  # any env_name above.
  reward_model:
    enabled: false           # disabled — not used as training env
    model_name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
    tokenizer:
      name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
    precision: "bfloat16"
    batch_size: 32
    resources:
      gpus_per_node: 1
      num_nodes: 1
    reward_model_cfg:
      enabled: true
      reward_model_type: "bradley_terry"
    dynamic_batching:
      enabled: false
    sequence_packing:
      enabled: false
    dtensor_cfg:
      enabled: true
      _v2: true
      cpu_offload: false
      sequence_parallel: false
      activation_checkpointing: false
      tensor_parallel_size: 1
      context_parallel_size: 1
    max_grad_norm: null

With my GDPO config, training takes about 40 days (roughly one hour per step). How can I bring that down?
I have:
1 B200 node (8 GPU cards)
90,000 training samples

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions