From 2620c5ba319017ec5548ed97b4ed23833f38f25a Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Mon, 18 May 2026 18:44:25 +0800 Subject: [PATCH 01/10] adapt to new code --- autotest/config/rl_qwen3_8B_gsm8k_grpo.py | 167 ++++++++++++++++++++++ autotest/module/train.py | 2 +- 2 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 autotest/config/rl_qwen3_8B_gsm8k_grpo.py diff --git a/autotest/config/rl_qwen3_8B_gsm8k_grpo.py b/autotest/config/rl_qwen3_8B_gsm8k_grpo.py new file mode 100644 index 000000000..4dcae760f --- /dev/null +++ b/autotest/config/rl_qwen3_8B_gsm8k_grpo.py @@ -0,0 +1,167 @@ +import os +from copy import deepcopy +from pathlib import Path + +from transformers import AutoTokenizer +from xtuner.v1.config import ( + AdamWConfig, + FSDPConfig, + LRConfig, +) +from xtuner.v1.data_proto.rl_data import SampleParams +from xtuner.v1.datasets import RLTokenizeFnConfig +from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig +from xtuner.v1.model import get_model_config_from_hf +from xtuner.v1.ray.base import AcceleratorResourcesConfig +from xtuner.v1.ray.config.worker import RolloutConfig +from xtuner.v1.ray.dataflow import DataFlowConfig, ReplayBufferConfig +from xtuner.v1.ray.evaluator import EvaluatorConfig +from xtuner.v1.ray.judger.controller import JudgerConfig +from xtuner.v1.ray.judger.gsm8k import GSM8KJudgerConfig +from xtuner.v1.rl.base import WorkerConfig +from xtuner.v1.rl.grpo import GRPOLossConfig +from xtuner.v1.train.rl_trainer import RLTrainerConfig + + +work_dir = os.environ["WORK_DIR"] +model_path = os.environ["MODEL_PATH"] +data_path = os.environ["DATA_PATH"] +eval_data_path = os.environ["EVAL_DATA_PATH"] +enable_evaluate = True if eval_data_path != "" else False +enable_partial_rollout = int(os.environ.get("ENABLE_PARTIAL_ROLLOUT", "0")) + +# basic settings +experimental_name = "grpo_gsm8k_tiny" +total_epochs = 3 +global_batch_size = 64 +prompt_repeat_k = 5 +rollout_tp_size = 1 +rollout_ep_size = 1 +max_prompt_length = 512 +max_response_length = 1024 +pack_max_length = 32768 +train_optimizer_steps = 1 +hf_interval = 100 +enable_initial_evaluate = True +evaluate_step = 15 + +# 1. resources +resources = AcceleratorResourcesConfig( + accelerator="GPU", + num_workers=8, + num_cpus_per_worker=12, + cpu_memory_per_worker=16 * 1024**3, # 16 GB +) + +# 2. rollout +rollout_config = RolloutConfig( + env=experimental_name, + device=resources.accelerator, + model_path=model_path, + dtype="bfloat16", + tensor_parallel_size=rollout_tp_size, + expert_parallel_size=rollout_ep_size, + gpu_memory_utilization=0.85, + context_length=max_prompt_length + max_response_length, + rollout_max_batch_size_per_instance=1024, +) + +# sampling params +training_sample_params = SampleParams( + max_tokens=max_response_length, +) +evaluation_sample_params = deepcopy(training_sample_params) +evaluation_sample_params.top_p = 1.0 +evaluation_sample_params.temperature = 0.0 +evaluation_sample_params.top_k = 1 + +# dataset: 不需要修改 +train_dataset = DatasetConfig(name=experimental_name, anno_path=data_path) +eval_dataset = DatasetConfig(name=experimental_name, anno_path=eval_data_path) if enable_evaluate else None +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) +tokenizer_config = RLTokenizeFnConfig(max_length=max_prompt_length) + +train_dataset_cfg = [{"dataset": train_dataset, "tokenize_fn": tokenizer_config}] +eval_dataset_cfg = [{"dataset": eval_dataset, "tokenize_fn": tokenizer_config}] if enable_evaluate else [] + +dataloader_config = DataloaderConfig(pack_max_length=pack_max_length, collator="fake_collator", pack_level="none") + +# 3. judger +gsm8k_judger_config = GSM8KJudgerConfig(judger_name="openai/gsm8k") +judger_cfg = JudgerConfig(reward_judger_configs=[gsm8k_judger_config]) + +# 4. dataflow and evaluator +dataflow_config = DataFlowConfig( + env=experimental_name, + prompt_repeat_k=prompt_repeat_k, + global_batch_size=global_batch_size, + sample_params=training_sample_params, + enable_partial_rollout=enable_partial_rollout, +) + +evaluator_cfg = ( + EvaluatorConfig( + enable_evaluate=enable_evaluate, + enable_initial_evaluate=enable_initial_evaluate, + dataset_cfg=eval_dataset_cfg, + tokenizer=tokenizer, + evaluate_step=evaluate_step, + compute_metric_func=None, + sample_params=evaluation_sample_params, + ) + if enable_evaluate + else None +) + +# replay buffer config: : 不需要修改 +replay_buffer_cfg = ReplayBufferConfig( + dataset_cfg=train_dataset_cfg, dataloader_cfg=dataloader_config, tokenizer=tokenizer +) + +# 5. Train worker +model_cfg = get_model_config_from_hf(Path(model_path)) +model_cfg.compile_cfg = False +optim_cfg = AdamWConfig(lr=1e-6, foreach=False) +loss_cfg = GRPOLossConfig( + policy_loss_cfg=dict( + cliprange_high=0.2, + cliprange_low=0.2, + loss_type="vanilla", + ), + ignore_idx=-100, + use_kl_loss=True, + kl_loss_coef=0.001, + kl_loss_type="low_var_kl", + mode="chunk", + chunk_size=512, +) +lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6) +fsdp_cfg = FSDPConfig(cpu_offload=False, ep_size=1) +train_worker_cfg: WorkerConfig = WorkerConfig( + model_cfg=model_cfg, + load_from=model_path, + optim_cfg=optim_cfg, + loss_cfg=loss_cfg, + lr_cfg=lr_cfg, + fsdp_cfg=fsdp_cfg, + sp_size=1, + optimizer_steps=train_optimizer_steps, + pack_max_length=pack_max_length, +) + +# 6. RL Trainer +trainer = RLTrainerConfig( + load_from=model_path, + resources=resources, + rollout_config=rollout_config, + dataflow_config=dataflow_config, + judger_config=judger_cfg, + replay_buffer_config=replay_buffer_cfg, + evaluator_config=evaluator_cfg, + train_worker_config=train_worker_cfg, + tokenizer_path=model_path, + work_dir=work_dir, + total_epochs=total_epochs, + hf_interval=hf_interval, + exp_tracker="jsonl", +) diff --git a/autotest/module/train.py b/autotest/module/train.py index c813851d1..e9e6e88f0 100644 --- a/autotest/module/train.py +++ b/autotest/module/train.py @@ -77,7 +77,7 @@ def validate(config): check_metrics = config.get("assert_info", {}).get("check_metrics", {}) return check_result(config["case_name"], base_path, cur_path, check_metrics) elif train_type == "rl": - cur_path = os.path.join(get_latest_subdir(work_dir), "exp_tracking/tracker.jsonl") + cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/tracker.jsonl") check_metrics = config.get("assert_info", {}) return check_rl_result(config["case_name"], base_path, cur_path, check_metrics) else: From aa512a484adcc69855e10fe40d974a9f6b5098c7 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Tue, 19 May 2026 10:17:11 +0800 Subject: [PATCH 02/10] new config --- autotest/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 8f242a80d..e9fa27e1b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -709,7 +709,7 @@ case: - type: rl parameters: - config: autotest/config/rl_qwen3_8B_gsm8k_grpo.py + config: autotest/config/rl_qwen3_gsm8k_grpo.py infer_backend: lmdeploy output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output resource: From 71cdcefb97b922566de13bedaf9b2fe98eda7a0d Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Tue, 19 May 2026 14:50:54 +0800 Subject: [PATCH 03/10] output to jsonl --- autotest/config/rl_qwen3_gsm8k_grpo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autotest/config/rl_qwen3_gsm8k_grpo.py b/autotest/config/rl_qwen3_gsm8k_grpo.py index 4d6fefdb2..648a58d49 100644 --- a/autotest/config/rl_qwen3_gsm8k_grpo.py +++ b/autotest/config/rl_qwen3_gsm8k_grpo.py @@ -206,4 +206,5 @@ work_dir=work_dir, seed=123, debug_rollout=False, + exp_tracker="jsonl", ) From 21925513a2ad87189a6532e61056a9c8f3c0945b Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Wed, 20 May 2026 14:06:07 +0800 Subject: [PATCH 04/10] use new script --- autotest/module/train.py | 3 +- autotest/utils/ci_run_rl.sh | 208 ++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 autotest/utils/ci_run_rl.sh diff --git a/autotest/module/train.py b/autotest/module/train.py index e9e6e88f0..6fc3e4b70 100644 --- a/autotest/module/train.py +++ b/autotest/module/train.py @@ -57,10 +57,11 @@ def get_cmd(config): return command, config elif train_type == "rl": infer_type = config.get("parameters", {}).get("infer_backend", "lmdeploy") + acceleator = config.get("parameters", {}).get("acceleator", "GPU") command = ( f"cd {current_dir}; pwd; pip install -e .[all]; export GITHUB_RUN_ID={config.get('run_id')}; export WORK_DIR={work_dir}; " + cudnn_patch - + f"bash -x examples/v1/scripts/run_rl.sh {config_path} {infer_type} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}" + + f"bash -x autotest/utils/ci_run_rl.sh {acceleator} {infer_type} {config_path} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}" ) return command, config else: diff --git a/autotest/utils/ci_run_rl.sh b/autotest/utils/ci_run_rl.sh new file mode 100644 index 000000000..4ba764d12 --- /dev/null +++ b/autotest/utils/ci_run_rl.sh @@ -0,0 +1,208 @@ +set -ex +ray stop --force +# examples of usage: +# qwen3_8B_grpo_gsm8k training: +# bash autotest/utils/ci_run_rl.sh "GPU" "sglang" auotest/config/rl_qwen3_8B_grpo.py $MODEL_PATH $DATA_PATH $EVAL_DATA_PATH + +ACCELERATOR=$1 +INFER_BACKEND=$2 +CONFIG_PATH=$3 +MODEL_PATH=$4 +DATA_PATH=$5 +EVAL_DATA_PATH=${6:-""} +ACCELERATOR=$(echo "$ACCELERATOR" | tr '[:lower:]' '[:upper:]') +if [ $ACCELERATOR != "GPU" ] && [ $ACCELERATOR != "NPU" ]; then + echo "Error: ACCELERATOR must be either 'gpu' or 'npu'!" + exit 1 +fi + +ulimit -n 65536 + +if [ "$ACCELERATOR" = "NPU" ]; then + ACCELERATOR_PER_NODE=${7:-16} + yum install dnsutils -y + source /usr/local/Ascend/ascend-toolkit/set_env.sh + source /usr/local/Ascend/nnal/atb/set_env.sh --cxx_abi=1 + git config --global --add safe.directory "$PWD" + export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver + export CPU_AFFINITY_CONF=0 + export HF_HOME=/workspace + export HF_DATASETS_OFFLINE=1 + export TRANSFORMERS_OFFLINE=1 + export HF_EVALUATE_OFFLINE=1 + export HF_HUB_OFFLINE=1 + export UVICORN_LOG_LEVEL="CRITICAL" + export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" + export RAY_CGRAPH_get_timeout=3600 +else + ACCELERATOR_PER_NODE=${7:-8} +fi + +export PYTHONPATH=$(pwd):$PYTHONPATH + +# ray 环境变量 +export MASTER_PORT=6000 +export WORLD_SIZE=${NODE_COUNT:-"1"} +export RANK=${NODE_RANK:-"0"} +export RAY_MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +export RAY_RANK=${RANK:-0} # 0 代表主节点, >0 代表工作节点 +export RAY_HEAD_PORT=${RAY_HEAD_PORT:-"6379"} +export RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-"8265"} + +# xtuner 环境变量 +export MODEL_PATH=$MODEL_PATH +export DATA_PATH=$DATA_PATH +export EVAL_DATA_PATH=$EVAL_DATA_PATH +export XTUNER_USE_FA3=${XTUNER_USE_FA3:-1} +export XTUNER_LOG_LEVEL=${XTUNER_LOG_LEVEL:-"INFO"} +export PYTHONUNBUFFERED=1 + +infer_backend_lower=$(echo "$INFER_BACKEND" | tr '[:upper:]' '[:lower:]') +if [ "$infer_backend_lower" = "sglang" ]; then + export XTUNER_USE_SGLANG=1 + unset PYTORCH_CUDA_ALLOC_CONF + export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 + export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False +elif [ "$infer_backend_lower" = "lmdeploy" ]; then + export XTUNER_USE_LMDEPLOY=1 + export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' + export PYTHONPATH=$LMDEPLOY_PATH:$PYTHONPATH +elif [ "$infer_backend_lower" = "vllm" ]; then + export XTUNER_USE_VLLM=1 + export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' +else + echo "Error: INFER_BACKEND '$INFER_BACKEND' is not supported or not specified!" + exit 1 +fi + +current_time=$(date "+%m%d%H") +# 取模型路径的最后一级作为model_name,取数据路径的倒数第二级作为data_name +model_dir_name=$(basename "$MODEL_PATH") +data_dir_name=$(basename "$(dirname "$DATA_PATH")") + +if [ "x$WORK_DIR" = "x" ]; then + DIR=$(pwd) + export WORK_DIR="${DIR}/work_dirs/${model_dir_name}_${data_dir_name}_${infer_backend_lower}" +else + export WORK_DIR=$WORK_DIR +fi +echo "WORK_DIR: $WORK_DIR" +if [ ! -d "$WORK_DIR" ]; then + mkdir -p "$WORK_DIR" +fi + +export LMDEPLOY_LOG_FILE="${WORK_DIR}/lmdeploy_log_${current_time}.txt" +if [ "$ACCELERATOR" = "GPU" ]; then + # TODO: support NPU RL Memory Monitor + export XTUNER_RL_MEM_DIR="${WORK_DIR}/mem_${current_time}" +fi + +# 2. Launch Ray cluster +# 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM +node_count=${NODE_COUNT:-1} + +WORK_DIR=$(realpath "$WORK_DIR") +if [ "$ACCELERATOR" = "GPU" ]; then + if [ "$RAY_RANK" -eq 0 ]; then + rm -rf /tmp/ray_log + export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/" + mkdir -p ${RAY_LOG_DIR} + ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log + ray start --head \ + --node-ip-address="$RAY_MASTER_ADDR" \ + --port="$RAY_HEAD_PORT" \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=$RAY_DASHBOARD_PORT \ + --include-dashboard=true \ + --disable-usage-stats \ + --temp-dir="/tmp/ray_log/" + else + while true; do + if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then + echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" + break + else + echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..." + sleep 2 + fi + done + ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats + fi + + while true; do + result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2) + expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE})) + if [ "$result" = "$expected_accelerator_count.0" ]; then + break + else + echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result" + sleep 2 + fi + done + + SCRIPT_NAME=$(basename "$0") + cp "$0" "${WORK_DIR}/${SCRIPT_NAME}" + cp "$CONFIG_PATH" "${WORK_DIR}/config.py" + LOG_FILE="${WORK_DIR}/training_log_${current_time}.txt" + + python xtuner/v1/train/cli/rl.py \ + --config $CONFIG_PATH \ + 2>&1 | tee -a "${WORK_DIR}/training_log_${current_time}.txt" +elif [ "$ACCELERATOR" = "NPU" ]; then + if [ "$RAY_RANK" -eq 0 ]; then + RAY_memory_monitor_refresh_ms=0 ray start --head \ + --node-ip-address="$RAY_MASTER_ADDR" \ + --port="$RAY_HEAD_PORT" \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=$RAY_DASHBOARD_PORT \ + --include-dashboard=true \ + --disable-usage-stats \ + else + RAY_RESOLVED_IP=$(nslookup $MASTER_ADDR | awk '/^Address: / { addr=$2 } END { print addr }') + if [ -z "$RAY_RESOLVED_IP" ]; then + echo "Warning: nslookup failed, falling back to original MASTER_ADDR" + RAY_RESOLVED_IP=$MASTER_ADDR + fi + echo "Resolved Master IP from $MASTER_ADDR: $RAY_RESOLVED_IP" + WAIT_TIME=$(awk "BEGIN {print 10 + 1.5 * $RAY_RANK}") + echo "Rank $RAY_RANK is sleeping for $WAIT_TIME seconds before connecting to Head..." + sleep $WAIT_TIME + export RAY_gcs_rpc_server_reconnect_timeout_s=600 + export RAY_gcs_grpc_max_reconnect_attempts=100 + export RAY_rpc_grpc_timeout_ms=30000 + RAY_memory_monitor_refresh_ms=0 ray start --address="$RAY_RESOLVED_IP:$RAY_HEAD_PORT" \ + --node-ip-address="$(hostname -i)" \ + --block \ + --disable-usage-stats + fi + + sleep 10 + + SCRIPT_NAME=$(basename "$0") + cp "$0" "${WORK_DIR}/${SCRIPT_NAME}" + cp "$CONFIG_PATH" "${WORK_DIR}/config.py" + + echo OUPUT_DIR is ${WORK_DIR} + if [ "$RAY_RANK" -eq 0 ]; then + RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"XTUNER_LOG_LEVEL\": \"${XTUNER_LOG_LEVEL}\", + \"PYTHONPATH\": \"${PYTHONPATH}\", + \"MASTER_ADDR\": \"${RAY_MASTER_ADDR}\", + \"XTUNER_USE_SGLANG\": \"${XTUNER_USE_SGLANG:-}\", + \"XTUNER_USE_LMDEPLOY\": \"${XTUNER_USE_LMDEPLOY:-}\", + \"XTUNER_USE_VLLM\": \"${XTUNER_USE_VLLM:-}\", + \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF:-}\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", + \"RAY_CGRAPH_get_timeout\": \"3600\" + } + }" + + ray job submit --address="http://127.0.0.1:$RAY_DASHBOARD_PORT" \ + --runtime-env-json="$RUNTIME_ENV_JSON" \ + -- python xtuner/v1/train/cli/rl.py \ + --config $CONFIG_PATH \ + 2>&1 | tee -a "${WORK_DIR}/training_log.txt" + echo "训练任务提交完成。日志文件: ${WORK_DIR}/training_log.txt" + fi +fi From c357e11e26bae1cce154f987f2c01b24bbbfdea7 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Wed, 20 May 2026 14:54:20 +0800 Subject: [PATCH 05/10] try vl --- autotest/config.yaml | 3 +- autotest/config/rl_qwen3_8B_gsm8k_grpo.py | 167 ---------------------- 2 files changed, 2 insertions(+), 168 deletions(-) delete mode 100644 autotest/config/rl_qwen3_8B_gsm8k_grpo.py diff --git a/autotest/config.yaml b/autotest/config.yaml index e9fa27e1b..8c7fb655a 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -752,7 +752,8 @@ case: - type: rl parameters: - config: autotest/config/rl_qwen3_vl_geometry3k_grpo.py + config: examples/v1/config/rl_grpo_geo3k_judge.py + #config: autotest/config/rl_qwen3_vl_geometry3k_grpo.py infer_backend: lmdeploy output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output resource: diff --git a/autotest/config/rl_qwen3_8B_gsm8k_grpo.py b/autotest/config/rl_qwen3_8B_gsm8k_grpo.py deleted file mode 100644 index 4dcae760f..000000000 --- a/autotest/config/rl_qwen3_8B_gsm8k_grpo.py +++ /dev/null @@ -1,167 +0,0 @@ -import os -from copy import deepcopy -from pathlib import Path - -from transformers import AutoTokenizer -from xtuner.v1.config import ( - AdamWConfig, - FSDPConfig, - LRConfig, -) -from xtuner.v1.data_proto.rl_data import SampleParams -from xtuner.v1.datasets import RLTokenizeFnConfig -from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig -from xtuner.v1.model import get_model_config_from_hf -from xtuner.v1.ray.base import AcceleratorResourcesConfig -from xtuner.v1.ray.config.worker import RolloutConfig -from xtuner.v1.ray.dataflow import DataFlowConfig, ReplayBufferConfig -from xtuner.v1.ray.evaluator import EvaluatorConfig -from xtuner.v1.ray.judger.controller import JudgerConfig -from xtuner.v1.ray.judger.gsm8k import GSM8KJudgerConfig -from xtuner.v1.rl.base import WorkerConfig -from xtuner.v1.rl.grpo import GRPOLossConfig -from xtuner.v1.train.rl_trainer import RLTrainerConfig - - -work_dir = os.environ["WORK_DIR"] -model_path = os.environ["MODEL_PATH"] -data_path = os.environ["DATA_PATH"] -eval_data_path = os.environ["EVAL_DATA_PATH"] -enable_evaluate = True if eval_data_path != "" else False -enable_partial_rollout = int(os.environ.get("ENABLE_PARTIAL_ROLLOUT", "0")) - -# basic settings -experimental_name = "grpo_gsm8k_tiny" -total_epochs = 3 -global_batch_size = 64 -prompt_repeat_k = 5 -rollout_tp_size = 1 -rollout_ep_size = 1 -max_prompt_length = 512 -max_response_length = 1024 -pack_max_length = 32768 -train_optimizer_steps = 1 -hf_interval = 100 -enable_initial_evaluate = True -evaluate_step = 15 - -# 1. resources -resources = AcceleratorResourcesConfig( - accelerator="GPU", - num_workers=8, - num_cpus_per_worker=12, - cpu_memory_per_worker=16 * 1024**3, # 16 GB -) - -# 2. rollout -rollout_config = RolloutConfig( - env=experimental_name, - device=resources.accelerator, - model_path=model_path, - dtype="bfloat16", - tensor_parallel_size=rollout_tp_size, - expert_parallel_size=rollout_ep_size, - gpu_memory_utilization=0.85, - context_length=max_prompt_length + max_response_length, - rollout_max_batch_size_per_instance=1024, -) - -# sampling params -training_sample_params = SampleParams( - max_tokens=max_response_length, -) -evaluation_sample_params = deepcopy(training_sample_params) -evaluation_sample_params.top_p = 1.0 -evaluation_sample_params.temperature = 0.0 -evaluation_sample_params.top_k = 1 - -# dataset: 不需要修改 -train_dataset = DatasetConfig(name=experimental_name, anno_path=data_path) -eval_dataset = DatasetConfig(name=experimental_name, anno_path=eval_data_path) if enable_evaluate else None -tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) -tokenizer_config = RLTokenizeFnConfig(max_length=max_prompt_length) - -train_dataset_cfg = [{"dataset": train_dataset, "tokenize_fn": tokenizer_config}] -eval_dataset_cfg = [{"dataset": eval_dataset, "tokenize_fn": tokenizer_config}] if enable_evaluate else [] - -dataloader_config = DataloaderConfig(pack_max_length=pack_max_length, collator="fake_collator", pack_level="none") - -# 3. judger -gsm8k_judger_config = GSM8KJudgerConfig(judger_name="openai/gsm8k") -judger_cfg = JudgerConfig(reward_judger_configs=[gsm8k_judger_config]) - -# 4. dataflow and evaluator -dataflow_config = DataFlowConfig( - env=experimental_name, - prompt_repeat_k=prompt_repeat_k, - global_batch_size=global_batch_size, - sample_params=training_sample_params, - enable_partial_rollout=enable_partial_rollout, -) - -evaluator_cfg = ( - EvaluatorConfig( - enable_evaluate=enable_evaluate, - enable_initial_evaluate=enable_initial_evaluate, - dataset_cfg=eval_dataset_cfg, - tokenizer=tokenizer, - evaluate_step=evaluate_step, - compute_metric_func=None, - sample_params=evaluation_sample_params, - ) - if enable_evaluate - else None -) - -# replay buffer config: : 不需要修改 -replay_buffer_cfg = ReplayBufferConfig( - dataset_cfg=train_dataset_cfg, dataloader_cfg=dataloader_config, tokenizer=tokenizer -) - -# 5. Train worker -model_cfg = get_model_config_from_hf(Path(model_path)) -model_cfg.compile_cfg = False -optim_cfg = AdamWConfig(lr=1e-6, foreach=False) -loss_cfg = GRPOLossConfig( - policy_loss_cfg=dict( - cliprange_high=0.2, - cliprange_low=0.2, - loss_type="vanilla", - ), - ignore_idx=-100, - use_kl_loss=True, - kl_loss_coef=0.001, - kl_loss_type="low_var_kl", - mode="chunk", - chunk_size=512, -) -lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6) -fsdp_cfg = FSDPConfig(cpu_offload=False, ep_size=1) -train_worker_cfg: WorkerConfig = WorkerConfig( - model_cfg=model_cfg, - load_from=model_path, - optim_cfg=optim_cfg, - loss_cfg=loss_cfg, - lr_cfg=lr_cfg, - fsdp_cfg=fsdp_cfg, - sp_size=1, - optimizer_steps=train_optimizer_steps, - pack_max_length=pack_max_length, -) - -# 6. RL Trainer -trainer = RLTrainerConfig( - load_from=model_path, - resources=resources, - rollout_config=rollout_config, - dataflow_config=dataflow_config, - judger_config=judger_cfg, - replay_buffer_config=replay_buffer_cfg, - evaluator_config=evaluator_cfg, - train_worker_config=train_worker_cfg, - tokenizer_path=model_path, - work_dir=work_dir, - total_epochs=total_epochs, - hf_interval=hf_interval, - exp_tracker="jsonl", -) From a10a5825c50472f7bf3374c7dcc9785889dcba44 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Mon, 25 May 2026 10:16:33 +0800 Subject: [PATCH 06/10] vl case --- autotest/config.yaml | 3 +- .../config/rl_qwen3_vl_geometry3k_grpo.py | 229 ++++++++++-------- 2 files changed, 129 insertions(+), 103 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 8c7fb655a..e9fa27e1b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -752,8 +752,7 @@ case: - type: rl parameters: - config: examples/v1/config/rl_grpo_geo3k_judge.py - #config: autotest/config/rl_qwen3_vl_geometry3k_grpo.py + config: autotest/config/rl_qwen3_vl_geometry3k_grpo.py infer_backend: lmdeploy output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output resource: diff --git a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py index 4158d872d..e2ee4cc16 100644 --- a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py +++ b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py @@ -1,57 +1,56 @@ +"""RL Colocate Trainer 示例配置(GRPO + GSM8K)。 + +用法:通过环境变量传入路径后,由 CLI 加载本配置并 trainer_cfg.build().fit()。 +需设置: WORK_DIR, MODEL_PATH, DATA_PATH, EVAL_DATA_PATH +可选: WORLD_SIZE, ENABLE_RETURN_ROUTED_EXPERTS, LOSS_TYPE, LOSS_MODE, SP_SIZE +""" import os -from copy import deepcopy from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig from xtuner.v1.data_proto.rl_data import SampleParams +from xtuner.v1.rl.advantage import GRPOAdvantageConfig from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig from xtuner.v1.datasets.rl_tokenize_fn import RLQwen3VLTokenizeFnConfig from xtuner.v1.model.compose.qwen3_vl import Qwen3VLDense8BConfig -from xtuner.v1.rl.advantage import GRPOAdvantageConfig -from xtuner.v1.rl.agent_loop import SingleTurnAgentLoopConfig -from xtuner.v1.rl.agent_loop_manager import ( - AgentLoopManagerConfig, - SamplerConfig, - SyncProduceStrategyConfig, - TaskSpecConfig, -) -from xtuner.v1.rl.evaluator import EvaluatorConfig +from xtuner.v1.rl.utils import AcceleratorResourcesConfig, CPUResourcesConfig +from xtuner.v1.rl.rollout.worker import RolloutConfig from xtuner.v1.rl.judger import GEO3KJudgerConfig -from xtuner.v1.rl.loss import GRPOLossConfig from xtuner.v1.rl.replay_buffer import SyncReplayBufferConfig -from xtuner.v1.rl.rollout.worker import RolloutConfig from xtuner.v1.rl.trainer import WorkerConfig -from xtuner.v1.rl.utils import AcceleratorResourcesConfig +from xtuner.v1.rl.agent_loop import SingleTurnAgentLoopConfig +from xtuner.v1.rl.agent_loop_manager import AgentLoopManagerConfig, SamplerConfig, SyncProduceStrategyConfig, TaskSpecConfig +from xtuner.v1.rl.evaluator import EvaluatorConfig +from xtuner.v1.rl.loss import GRPOLossConfig from xtuner.v1.train.rl_trainer import RLColocateTrainerConfig - +# env work_dir = os.environ["WORK_DIR"] model_path = os.environ["MODEL_PATH"] data_path = os.environ["DATA_PATH"] eval_data_path = os.environ["EVAL_DATA_PATH"] -enable_evaluate = eval_data_path != "" +enable_return_routed_experts = os.environ.get("ENABLE_RETURN_ROUTED_EXPERTS", "0") +NNODE = int(os.environ.get("WORLD_SIZE", "1")) media_root = os.environ["MEDIA_ROOT"] # basic settings experimental_name = "grpo_geo3k" -total_epochs = 15 -global_batch_size = 64 +total_train_steps = 30 # TODO: total_epoch +evaluate_step = 30 +train_optimizer_steps = 4 +train_batch_size = 1024 prompt_repeat_k = 5 -rollout_tp_size = 2 +rollout_tp_size = 1 rollout_ep_size = 1 max_prompt_length = 1024 max_response_length = 2048 -pack_max_length = 32768 -train_optimizer_steps = 4 -hf_interval = 30 -enable_initial_evaluate = True -evaluate_step = 15 +pack_max_length = 32 * 1024 # 1. resources resources = AcceleratorResourcesConfig( accelerator="GPU", - num_workers=8, + num_workers=8 * NNODE, num_cpus_per_worker=12, - cpu_memory_per_worker=16 * 1024**3, + cpu_memory_per_worker=16 * 1024**3, # 16 GB ) # 2. rollout @@ -62,109 +61,134 @@ dtype="bfloat16", tensor_parallel_size=rollout_tp_size, expert_parallel_size=rollout_ep_size, - gpu_memory_utilization=0.75, + gpu_memory_utilization=0.8, context_length=max_response_length + max_prompt_length, + enable_return_routed_experts=(enable_return_routed_experts == "1"), ) -# sampling params -training_sample_params = SampleParams(max_tokens=max_response_length) -evaluation_sample_params = deepcopy(training_sample_params) -evaluation_sample_params.top_p = 1.0 -evaluation_sample_params.temperature = 0.0 -evaluation_sample_params.top_k = 1 - -# 3. datasets -train_dataset_cfg = [ - { - "dataset": DatasetConfig( - name="geo3k", - anno_path=data_path, - class_name="VLMJsonlDataset", - media_root=media_root, - sample_ratio=1.0, - ), - "tokenize_fn": RLQwen3VLTokenizeFnConfig(processor_path=model_path, max_length=max_prompt_length), - } -] -eval_dataset_cfg = [ - { - "dataset": DatasetConfig( - name="geo3k", - anno_path=eval_data_path if enable_evaluate else data_path, - class_name="VLMJsonlDataset", - media_root=media_root, - sample_ratio=1.0, - ), - "tokenize_fn": RLQwen3VLTokenizeFnConfig( - processor_path=model_path, - max_length=max_prompt_length, - ignore_multimodal_info=True, - ), - } -] -dataloader_cfg = DataloaderConfig( - dataset_config_list=train_dataset_cfg, - pack_max_length=pack_max_length, - collator="fake_collator", - pack_level="none", - num_workers=8, -) -eval_dataloader_cfg = DataloaderConfig( - dataset_config_list=eval_dataset_cfg, - pack_max_length=pack_max_length, - collator="fake_collator", - pack_level="none", - num_workers=8, +# 3. judger +judger_config = GEO3KJudgerConfig( + cpu_resources=CPUResourcesConfig(num_workers=1, num_cpus_per_worker=1), ) -# 4. judger -judger_config = GEO3KJudgerConfig() +# 4. train worker +lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6) +fsdp_cfg = FSDPConfig(torch_compile=False, cpu_offload=False, ep_size=1) -# 5. train worker +# TODO: support get_model_config_from_hf model_cfg = Qwen3VLDense8BConfig(freeze_vision=True, freeze_projector=True) -optim_cfg = AdamWConfig(lr=1e-6, foreach=False) + +if hasattr(model_cfg.text_config, "balancing_loss_cfg"): + model_cfg.text_config.balancing_loss_cfg = None +if hasattr(model_cfg.text_config, "z_loss_cfg"): + model_cfg.text_config.z_loss_cfg = None +optim_cfg = AdamWConfig(lr=1e-6, foreach=False, weight_decay=0.1) loss_cfg = GRPOLossConfig( policy_loss_cfg=dict( - cliprange_high=0.2, + cliprange_high=0.28, cliprange_low=0.2, - loss_type="vanilla", + loss_type=os.environ.get("LOSS_TYPE", "vanilla"), + clip_ratio_c=10.0, + log_prob_diff_min=-20.0, + log_prob_diff_max=20.0, ), ignore_idx=-100, - use_kl_loss=True, - kl_loss_coef=0.001, + use_kl_loss=False, + kl_loss_coef=0.0, kl_loss_type="low_var_kl", - mode="chunk", + mode=os.environ.get("LOSS_MODE", "chunk"), chunk_size=512, ) -lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6) -fsdp_cfg = FSDPConfig(cpu_offload=False) -train_worker_cfg: WorkerConfig = WorkerConfig( +train_worker_cfg = WorkerConfig( model_cfg=model_cfg, load_from=model_path, optim_cfg=optim_cfg, loss_cfg=loss_cfg, lr_cfg=lr_cfg, fsdp_cfg=fsdp_cfg, - sp_size=1, + sp_size=int(os.environ.get("SP_SIZE", "1")), optimizer_steps=train_optimizer_steps, pack_max_length=pack_max_length, ) -# 6. agent loop managers +# 5. train agent loop manager +train_dataset_cfg = [ + { + "dataset": DatasetConfig(name="geo3k", + anno_path=data_path, + class_name='VLMJsonlDataset', + media_root=media_root, + sample_ratio=1.0), + "tokenize_fn": RLQwen3VLTokenizeFnConfig(processor_path=model_path, + max_length=max_prompt_length), + } +] + +dataloader_cfg = DataloaderConfig( + dataset_config_list=train_dataset_cfg, + pack_max_length=pack_max_length, + collator="fake_collator", + pack_level="none", + num_workers=8, +) +sampler_config = SamplerConfig( + dataloader_cfg=dataloader_cfg, + prompt_repeat_k=prompt_repeat_k, +) +training_sample_params = SampleParams( + max_tokens=max_response_length, + top_k=0, + top_p=1.0, + temperature=1.0, + min_tokens=0, +) agent_loop_config = SingleTurnAgentLoopConfig( hf_checkpoint=model_path, sample_params=training_sample_params, ) +produce_strategy_config = SyncProduceStrategyConfig() agent_loop_manager_cfg = AgentLoopManagerConfig( tasks=TaskSpecConfig( task_name="train_task", agent_loop_config=agent_loop_config, judger_config=judger_config, - produce_strategy_config=SyncProduceStrategyConfig(), - sampler_config=SamplerConfig(dataloader_cfg=dataloader_cfg, prompt_repeat_k=prompt_repeat_k), + produce_strategy_config=produce_strategy_config, + sampler_config=sampler_config, ), ) +# 6. eval agent loop manager +eval_dataset_cfg = [ + { + "dataset": DatasetConfig(name="geo3k", + anno_path=eval_data_path, + class_name='VLMJsonlDataset', + media_root=media_root, + sample_ratio=1.0), + "tokenize_fn": RLQwen3VLTokenizeFnConfig(processor_path=model_path, + max_length=max_prompt_length, + ignore_multimodal_info=True), + } +] + +eval_dataloader_cfg = DataloaderConfig( + dataset_config_list=eval_dataset_cfg, + pack_max_length=pack_max_length, + collator="fake_collator", + pack_level="none", + num_workers=8, +) +eval_sampler_config = SamplerConfig( + dataloader_cfg=eval_dataloader_cfg, + prompt_repeat_k=1, +) +evaluation_sample_params = SampleParams( + max_tokens=max_response_length, + top_k=1, + top_p=1.0, + temperature=0.0, + min_tokens=0, +) eval_agent_loop_config = SingleTurnAgentLoopConfig( hf_checkpoint=model_path, sample_params=evaluation_sample_params, @@ -174,28 +198,31 @@ task_name="eval_task", agent_loop_config=eval_agent_loop_config, judger_config=judger_config, - sampler_config=SamplerConfig(dataloader_cfg=eval_dataloader_cfg, prompt_repeat_k=1), + sampler_config=eval_sampler_config, ), ) -# 7. trainer +# 7. evaluator +evaluator_config = EvaluatorConfig(compute_metric_func=None) + +# 8. RL Colocate Trainer Config(CLI 通过 config["trainer"].build() 得到 Trainer) trainer = RLColocateTrainerConfig( resources=resources, - train_worker_cfg=train_worker_cfg, + train_worker_cfg=train_worker_cfg, # TODO: uniform naming of cfg and config rollout_config=rollout_config, tokenizer_path=model_path, replay_buffer_config=SyncReplayBufferConfig(), agent_loop_manager_cfg=agent_loop_manager_cfg, eval_agent_loop_manager_cfg=eval_agent_loop_manager_cfg, - evaluator_config=EvaluatorConfig(compute_metric_func=None), + evaluator_config=evaluator_config, load_from=model_path, - total_epochs=total_epochs, - train_batch_size=global_batch_size, + total_train_steps=total_train_steps, + train_batch_size=train_batch_size, advantage_estimator_config=GRPOAdvantageConfig(eps=1e-8), - enable_evaluate=enable_evaluate, - enable_initial_evaluate=enable_evaluate and enable_initial_evaluate, + enable_evaluate=True, + enable_initial_evaluate=False, evaluate_step=evaluate_step, work_dir=work_dir, - hf_interval=hf_interval, - exp_tracker="jsonl", + seed=123, + debug_rollout=False, ) From 4f115748b89533270383532de9840437f79241e6 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Mon, 25 May 2026 14:09:20 +0800 Subject: [PATCH 07/10] change step --- autotest/config/rl_qwen3_vl_geometry3k_grpo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py index e2ee4cc16..7d372c1fe 100644 --- a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py +++ b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py @@ -34,8 +34,8 @@ # basic settings experimental_name = "grpo_geo3k" -total_train_steps = 30 # TODO: total_epoch -evaluate_step = 30 +total_train_steps = 15 # TODO: total_epoch +evaluate_step = 15 train_optimizer_steps = 4 train_batch_size = 1024 prompt_repeat_k = 5 From d90d67763bf6cc945711471006b4b73d7124de13 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Mon, 25 May 2026 15:46:37 +0800 Subject: [PATCH 08/10] set output format --- autotest/config.yaml | 2 +- autotest/config/rl_qwen3_vl_geometry3k_grpo.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index e9fa27e1b..f2887c645 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -733,7 +733,7 @@ case: operator: < - metric: mismatch/mismatch_k3_kl - threshold: 0.0001 + threshold: 0.0002 method: absolute operator: <= - diff --git a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py index 7d372c1fe..84f6b5e71 100644 --- a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py +++ b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py @@ -225,4 +225,5 @@ work_dir=work_dir, seed=123, debug_rollout=False, + exp_tracker="jsonl", ) From 20e7aa86999c1955ce00c291110585a78982b076 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Mon, 25 May 2026 19:55:05 +0800 Subject: [PATCH 09/10] fix spell error --- autotest/module/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotest/module/train.py b/autotest/module/train.py index 6fc3e4b70..8883d1914 100644 --- a/autotest/module/train.py +++ b/autotest/module/train.py @@ -57,11 +57,11 @@ def get_cmd(config): return command, config elif train_type == "rl": infer_type = config.get("parameters", {}).get("infer_backend", "lmdeploy") - acceleator = config.get("parameters", {}).get("acceleator", "GPU") + accelerator = config.get("parameters", {}).get("accelerator", "GPU") command = ( f"cd {current_dir}; pwd; pip install -e .[all]; export GITHUB_RUN_ID={config.get('run_id')}; export WORK_DIR={work_dir}; " + cudnn_patch - + f"bash -x autotest/utils/ci_run_rl.sh {acceleator} {infer_type} {config_path} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}" + + f"bash -x autotest/utils/ci_run_rl.sh {accelerator} {infer_type} {config_path} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}" ) return command, config else: From e387118681e3baced753e8b15bf45094484a9d27 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Tue, 26 May 2026 10:33:08 +0800 Subject: [PATCH 10/10] adjust grad_norm for running resume --- autotest/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index f2887c645..86e291a20 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -484,7 +484,7 @@ case: assert_info: base_metric: qwen3-5-sft-sp4-resume/625c0018/tracker.jsonl check_metrics: - grad_norm: 0.02 + grad_norm: 0.09 loss/local_loss: 0.000001 loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001