# ── GRPO / GDPO algorithm section ─────────────────────────────────────────
# NOTE(review): the file's indentation was flattened (every key at column 0,
# which is invalid YAML). Nesting below is reconstructed from key semantics;
# verify it against a known-good config of the same framework before launch.
grpo:
  num_prompts_per_step: 32
  num_generations_per_prompt: 16
  max_rollout_turns: 1
  max_num_epochs: 1
  max_num_steps: 1000000
  # ── FIX 1: set top-level baseline flags to false when using GDPO ────────
  # adv_estimator (below) owns normalization and baseline for GDPO.
  # Keeping these true here would conflict with the GDPO path.
  normalize_rewards: false
  use_leave_one_out_baseline: false
  val_period: 10
  val_at_start: false
  val_at_end: false
  overlong_filtering: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42
  use_dynamic_sampling: false
  dynamic_sampling_max_gen_batches: 10
  batch_multiplier: 1
  reward_shaping:
    enabled: false
    overlong_buffer_length: 128
    overlong_buffer_penalty: 1
    max_response_length: ${policy.max_total_sequence_length}
    stop_properly_penalty_coef: null
  # ── FIX 2: GDPO advantage estimator ─────────────────────────────────────
  # normalize_rewards here applies per reward dimension independently.
  # use_leave_one_out_baseline: false is correct for GDPO.
  # minus_baseline: true subtracts per-prompt mean — keep for stability.
  adv_estimator:
    name: "gdpo"
    normalize_rewards: true
    use_leave_one_out_baseline: false
    minus_baseline: true
  reward_scaling:
    enabled: true
    source_min: 0.0
    source_max: 1.0
    target_min: 0.0
    target_max: 1.0
  seq_logprob_error_threshold: null
  async_grpo:
    enabled: false
    max_trajectory_age_steps: 1
    in_flight_weight_updates: false
    recompute_kv_cache_after_weight_updates: false
  # async_gdpo:
  #   enabled: true
  #   max_trajectory_age_steps: 1
  #   in_flight_weight_updates: false
  #   recompute_kv_cache_after_weight_updates: true
# ── Loss Function ─────────────────────────────────────────────────────────
# NOTE(review): indentation reconstructed — this whole map sits under the
# top-level `loss_fn` key.
loss_fn:
  reference_policy_kl_penalty: 0.01
  reference_policy_kl_type: "k3"
  kl_input_clamp_value: 20.0
  kl_output_clamp_value: 10.0
  # Symmetric PPO-style clip range: ratio clipped to [1 - 0.2, 1 + 0.2].
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  truncated_importance_sampling_ratio: null
  truncated_importance_sampling_ratio_min: null
  truncated_importance_sampling_type: tis
  sequence_level_importance_ratios: false
  token_level_loss: true
  force_on_policy_ratio: false
  use_kl_in_reward: false
# ── Checkpointing ─────────────────────────────────────────────────────────
# NOTE(review): indentation reconstructed — all keys nest under
# `checkpointing`.
checkpointing:
  enabled: true
  checkpoint_dir: "/real-all-data/ckpt/gdpo"
  # Ranks checkpoints by validation accuracy; keeps the best 3.
  metric_name: "val:accuracy"
  higher_is_better: true
  keep_top_k: 3
  save_period: 100
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false
  save_optimizer: true
# ── Policy Model ──────────────────────────────────────────────────────────
# NOTE(review): indentation reconstructed from key semantics. The sub-map
# boundaries (dtensor_cfg/lora_cfg, megatron_cfg/optimizer/scheduler,
# generation/vllm_cfg/colocated) follow the conventional layout for this
# config family — confirm against your framework's reference config.
policy:
  model_name: "/home/local/training/ckpts/run2_3/merged_model"
  tokenizer:
    name: ${policy.model_name}
    chat_template_kwargs: null
  hf_config_overrides: {}
  # 32 prompts × 16 generations = 512 samples/step → 4 gradient batches of 128.
  train_global_batch_size: 128
  train_micro_batch_size: 16
  generation_batch_size: 32
  logprob_batch_size: ${policy.train_micro_batch_size}
  max_total_sequence_length: 1024
  precision: "bfloat16"
  logprob_chunk_size: null
  offload_optimizer_for_logprob: false
  dtensor_cfg:
    _v2: true
    enabled: true
    cpu_offload: false
    sequence_parallel: true
    activation_checkpointing: true
    tensor_parallel_size: 1
    context_parallel_size: 1
    custom_parallel_plan: null
    # NOTE(review): lora_cfg nested under dtensor_cfg per newer config
    # layouts — verify placement if your version expects it under policy.
    lora_cfg:
      enabled: true
      target_modules: []
      exclude_modules: []
      match_all_linear: true
      dim: 8
      alpha: 32
      dropout: 0.0
      dropout_position: "post"
      lora_A_init: "xavier"
      use_triton: true
  megatron_cfg:
    enabled: false
    empty_unused_memory_level: 1
    activation_checkpointing: false
    converter_type: "Qwen3ForCausalLM"
    tensor_model_parallel_size: 1
    expert_tensor_parallel_size: 1
    expert_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    sequence_parallel: false
    freeze_moe_router: true
    moe_router_dtype: "fp64"
    moe_router_load_balancing_type: "none"
    moe_router_bias_update_rate: 0.0
    moe_permute_fusion: false
    apply_rope_fusion: true
    bias_activation_fusion: true
    defer_fp32_logits: false
    moe_per_layer_logging: false
    moe_enable_deepep: false
    moe_token_dispatcher_type: "allgather"
    moe_shared_expert_overlap: false
    optimizer:
      optimizer: "adam"
      lr: 5.0e-6
      min_lr: 5.0e-7
      weight_decay: 0.01
      bf16: true
      fp16: false
      params_dtype: "float32"
      adam_beta1: 0.9
      adam_beta2: 0.999
      # Written with a dot so YAML 1.1 resolvers parse a float, not the
      # string "1e-8".
      adam_eps: 1.0e-8
      sgd_momentum: 0.9
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true
      clip_grad: ${policy.max_grad_norm}
      optimizer_cpu_offload: false
      optimizer_offload_fraction: 0.0
    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: 1000
      lr_warmup_iters: 13
      lr_warmup_init: 5.0e-7
    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: true
      use_custom_fsdp: false
      data_parallel_sharding_strategy: "optim_grads_params"
    fp8_cfg:
      enabled: false
      fp8: "e4m3"
      fp8_recipe: "blockwise"
      fp8_param: false
    env_vars: null
  dynamic_batching:
    enabled: false
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64
  sequence_packing:
    enabled: true
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64
  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0
  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 5.0e-6
      weight_decay: 0.01
      betas: [0.9, 0.999]
      # Dot added so YAML 1.1 resolvers parse a float, not the string "1e-8".
      eps: 1.0e-8
      foreach: false
      fused: false
  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 50
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [50]
  generation:
    backend: "vllm"
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: true
      precision: ${policy.precision}
      kv_cache_dtype: "auto"
      tensor_parallel_size: 1
      pipeline_parallel_size: 1
      expert_parallel_size: 1
      gpu_memory_utilization: 0.9
      max_model_len: ${policy.max_total_sequence_length}
      enforce_eager: false
      use_deep_gemm: false
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      enable_vllm_metrics_logger: true
      vllm_metrics_logger_interval: 0.5
      vllm_kwargs: {}
    # NOTE(review): colocated.enabled: false dedicates separate GPUs to
    # generation (4/node here). On a single 8-GPU node, confirm all 8 GPUs
    # are actually in use — idle GPUs directly inflate the per-step time.
    colocated:
      enabled: false
      resources:
        gpus_per_node: 4
        num_nodes: null
# ── Data ──────────────────────────────────────────────────────────────────
# NOTE(review): indentation reconstructed. `processor` and `env_name` are
# placed inside `default` because the surrounding FIX comments read as one
# per-dataset spec — move them up to `data:` level if your schema expects
# them there.
data:
  use_multiple_dataloader: false
  max_input_seq_length: ${policy.max_total_sequence_length}
  # ── FIX 3: shuffle should be true for training ─────────────────────────
  shuffle: true
  num_workers: 10
  train:
    dataset_name: ResponseDataset
    data_path: /real-all-data/grpo/merge/merge.jsonl
    # ── FIX 4: input_key must match your JSONL field that holds messages ──
    # If your JSONL has {"messages": [...]} use "messages".
    # If it has {"input": "..."} use "input". Check your actual data file.
    input_key: "messages"
    # 5% of the train file is held out for validation.
    split_validation_size: 0.05
    seed: 42
  validation: null
  default:
    dataset_name: ResponseDataset
    # ── FIX 5: default input_key aligned with train ───────────────────────
    input_key: "messages"
    output_key: output
    prompt_file: null
    system_prompt_file: null
    processor: "thai_processor"
    # ── FIX 6: env_name must point to YOUR custom env, not "reward_model" ─
    # "reward_model" = Skywork only, returns shape (B,) — GDPO needs (B,2).
    # "thai_reward_env" = your ThaiRewardEnvironmentActor, returns (B,2).
    env_name: "thai_reward_env"
# ── Environment ───────────────────────────────────────────────────────────
# NOTE(review): indentation reconstructed — `thai_reward_env` and
# `reward_model` are both entries of the top-level `env` map.
env:
  # ── FIX 7: thai_reward_env replaces reward_model as the training env ───
  # Make sure thai_reward_env.py is on PYTHONPATH before launching.
  # The register_env() call inside thai_reward_env.py links this name
  # to ThaiRewardEnvironmentActor.
  thai_reward_env:
    num_workers: 4
    config:
      required_format: "boxed"
      stop_strings: ["</s>", "<|end|>"]
      # NOTE(review): 2048 exceeds policy.max_total_sequence_length (1024);
      # responses can never reach this length — confirm which value is
      # intended.
      max_response_length: 2048
      correctness_model_name: "typhoon-ai/typhoon2.5-qwen3-4b"
      format_model_name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
    resources:
      # NOTE(review): earlier comments claimed this env is CPU-only, yet it
      # names two scoring models above and allocates a GPU here — reconcile:
      # set gpus_per_node: 0 for pure rule-based scoring, or keep 1 if the
      # models are actually loaded.
      gpus_per_node: 1
      num_nodes: 1
  # ── Skywork reward model (optional — for eval / logging only) ──────────
  # If you want Skywork scores DURING training, move the logic inside
  # thai_reward_env.py and add it as rewards[:, 2] (3rd reward dim).
  # Keeping it here does NOT affect training — it is not referenced by
  # any env_name above.
  reward_model:
    enabled: false  # disabled — not used as training env
    model_name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
    tokenizer:
      name: "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
    precision: "bfloat16"
    batch_size: 32
    resources:
      gpus_per_node: 1
      num_nodes: 1
    reward_model_cfg:
      enabled: true
      reward_model_type: "bradley_terry"
    dynamic_batching:
      enabled: false
    sequence_packing:
      enabled: false
    dtensor_cfg:
      enabled: true
      _v2: true
      cpu_offload: false
      sequence_parallel: false
      activation_checkpointing: false
      tensor_parallel_size: 1
      context_parallel_size: 1
    max_grad_norm: null
# ── Open question (converted to a comment so the file stays valid YAML; ──
# ── the accidental duplicate paste of the same text was removed) ─────────
# With this config and GDPO, training takes ~40 days (about 1 hour per
# step). How can I bring that down?
# Hardware: 1 node of B200 (8 GPU cards).
# Dataset: 90,000 training samples.