diff --git a/autotest/config.yaml b/autotest/config.yaml
index 8f242a80d..86e291a20 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -484,7 +484,7 @@ case:
       assert_info:
         base_metric: qwen3-5-sft-sp4-resume/625c0018/tracker.jsonl
         check_metrics:
-          grad_norm: 0.02
+          grad_norm: 0.09
           loss/local_loss: 0.000001
           loss/reduced_balancing_loss: 0.000001
           loss/reduced_llm_loss: 0.000001
@@ -709,7 +709,7 @@ case:
     -
       type: rl
       parameters:
-        config: autotest/config/rl_qwen3_8B_gsm8k_grpo.py
+        config: autotest/config/rl_qwen3_gsm8k_grpo.py
         infer_backend: lmdeploy
         output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
       resource:
@@ -733,7 +733,7 @@ case:
           operator: <
         -
           metric: mismatch/mismatch_k3_kl
-          threshold: 0.0001
+          threshold: 0.0002
           method: absolute
           operator: <=
     -
diff --git a/autotest/config/rl_qwen3_gsm8k_grpo.py b/autotest/config/rl_qwen3_gsm8k_grpo.py
index 4d6fefdb2..648a58d49 100644
--- a/autotest/config/rl_qwen3_gsm8k_grpo.py
+++ b/autotest/config/rl_qwen3_gsm8k_grpo.py
@@ -206,4 +206,5 @@
     work_dir=work_dir,
     seed=123,
     debug_rollout=False,
+    exp_tracker="jsonl",
 )
diff --git a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py
index 4158d872d..84f6b5e71 100644
--- a/autotest/config/rl_qwen3_vl_geometry3k_grpo.py
+++ b/autotest/config/rl_qwen3_vl_geometry3k_grpo.py
@@ -1,57 +1,56 @@
+"""RL Colocate Trainer example config (GRPO + Geometry3K).
+
+Usage: pass paths via environment variables; the CLI loads this config and runs trainer_cfg.build().fit().
+Required: WORK_DIR, MODEL_PATH, DATA_PATH, EVAL_DATA_PATH (evaluation is always enabled), MEDIA_ROOT
+Optional: WORLD_SIZE, ENABLE_RETURN_ROUTED_EXPERTS, LOSS_TYPE, LOSS_MODE, SP_SIZE
+"""
 import os
-from copy import deepcopy
 
 from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
 from xtuner.v1.data_proto.rl_data import SampleParams
+from xtuner.v1.rl.advantage import GRPOAdvantageConfig
 from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
 from xtuner.v1.datasets.rl_tokenize_fn import RLQwen3VLTokenizeFnConfig
 from xtuner.v1.model.compose.qwen3_vl import Qwen3VLDense8BConfig
-from xtuner.v1.rl.advantage import GRPOAdvantageConfig
-from xtuner.v1.rl.agent_loop import SingleTurnAgentLoopConfig
-from xtuner.v1.rl.agent_loop_manager import (
-    AgentLoopManagerConfig,
-    SamplerConfig,
-    SyncProduceStrategyConfig,
-    TaskSpecConfig,
-)
-from xtuner.v1.rl.evaluator import EvaluatorConfig
+from xtuner.v1.rl.utils import AcceleratorResourcesConfig, CPUResourcesConfig
+from xtuner.v1.rl.rollout.worker import RolloutConfig
 from xtuner.v1.rl.judger import GEO3KJudgerConfig
-from xtuner.v1.rl.loss import GRPOLossConfig
 from xtuner.v1.rl.replay_buffer import SyncReplayBufferConfig
-from xtuner.v1.rl.rollout.worker import RolloutConfig
 from xtuner.v1.rl.trainer import WorkerConfig
-from xtuner.v1.rl.utils import AcceleratorResourcesConfig
+from xtuner.v1.rl.agent_loop import SingleTurnAgentLoopConfig
+from xtuner.v1.rl.agent_loop_manager import AgentLoopManagerConfig, SamplerConfig, SyncProduceStrategyConfig, TaskSpecConfig
+from xtuner.v1.rl.evaluator import EvaluatorConfig
+from xtuner.v1.rl.loss import GRPOLossConfig
 from xtuner.v1.train.rl_trainer import RLColocateTrainerConfig
 
-
+# env
 work_dir = os.environ["WORK_DIR"]
 model_path = os.environ["MODEL_PATH"]
 data_path = os.environ["DATA_PATH"]
 eval_data_path = os.environ["EVAL_DATA_PATH"]
-enable_evaluate = eval_data_path != ""
+enable_return_routed_experts = os.environ.get("ENABLE_RETURN_ROUTED_EXPERTS", "0")
+NNODE = int(os.environ.get("WORLD_SIZE", "1"))
 media_root = os.environ["MEDIA_ROOT"]
 
 # basic settings
 experimental_name = "grpo_geo3k"
-total_epochs = 15
-global_batch_size = 64
+total_train_steps = 15  # TODO: total_epoch
+evaluate_step = 15
+train_optimizer_steps = 4
+train_batch_size = 1024
 prompt_repeat_k = 5
-rollout_tp_size = 2
+rollout_tp_size = 1
 rollout_ep_size = 1
 max_prompt_length = 1024
 max_response_length = 2048
-pack_max_length = 32768
-train_optimizer_steps = 4
-hf_interval = 30
-enable_initial_evaluate = True
-evaluate_step = 15
+pack_max_length = 32 * 1024
 
 # 1. resources
 resources = AcceleratorResourcesConfig(
     accelerator="GPU",
-    num_workers=8,
+    num_workers=8 * NNODE,
     num_cpus_per_worker=12,
-    cpu_memory_per_worker=16 * 1024**3,
+    cpu_memory_per_worker=16 * 1024**3,  # 16 GB
 )
 
 # 2. rollout
@@ -62,109 +61,134 @@
     dtype="bfloat16",
     tensor_parallel_size=rollout_tp_size,
     expert_parallel_size=rollout_ep_size,
-    gpu_memory_utilization=0.75,
+    gpu_memory_utilization=0.8,
     context_length=max_response_length + max_prompt_length,
+    enable_return_routed_experts=(enable_return_routed_experts == "1"),
 )
 
-# sampling params
-training_sample_params = SampleParams(max_tokens=max_response_length)
-evaluation_sample_params = deepcopy(training_sample_params)
-evaluation_sample_params.top_p = 1.0
-evaluation_sample_params.temperature = 0.0
-evaluation_sample_params.top_k = 1
-
-# 3. datasets
-train_dataset_cfg = [
-    {
-        "dataset": DatasetConfig(
-            name="geo3k",
-            anno_path=data_path,
-            class_name="VLMJsonlDataset",
-            media_root=media_root,
-            sample_ratio=1.0,
-        ),
-        "tokenize_fn": RLQwen3VLTokenizeFnConfig(processor_path=model_path, max_length=max_prompt_length),
-    }
-]
-eval_dataset_cfg = [
-    {
-        "dataset": DatasetConfig(
-            name="geo3k",
-            anno_path=eval_data_path if enable_evaluate else data_path,
-            class_name="VLMJsonlDataset",
-            media_root=media_root,
-            sample_ratio=1.0,
-        ),
-        "tokenize_fn": RLQwen3VLTokenizeFnConfig(
-            processor_path=model_path,
-            max_length=max_prompt_length,
-            ignore_multimodal_info=True,
-        ),
-    }
-]
-dataloader_cfg = DataloaderConfig(
-    dataset_config_list=train_dataset_cfg,
-    pack_max_length=pack_max_length,
-    collator="fake_collator",
-    pack_level="none",
-    num_workers=8,
-)
-eval_dataloader_cfg = DataloaderConfig(
-    dataset_config_list=eval_dataset_cfg,
-    pack_max_length=pack_max_length,
-    collator="fake_collator",
-    pack_level="none",
-    num_workers=8,
+# 3. judger
+judger_config = GEO3KJudgerConfig(
+    cpu_resources=CPUResourcesConfig(num_workers=1, num_cpus_per_worker=1),
 )
 
-# 4. judger
-judger_config = GEO3KJudgerConfig()
+# 4. train worker
+lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6)
+fsdp_cfg = FSDPConfig(torch_compile=False, cpu_offload=False, ep_size=1)
 
-# 5. train worker
+# TODO: support get_model_config_from_hf
 model_cfg = Qwen3VLDense8BConfig(freeze_vision=True, freeze_projector=True)
-optim_cfg = AdamWConfig(lr=1e-6, foreach=False)
+
+if hasattr(model_cfg.text_config, "balancing_loss_cfg"):
+    model_cfg.text_config.balancing_loss_cfg = None
+if hasattr(model_cfg.text_config, "z_loss_cfg"):
+    model_cfg.text_config.z_loss_cfg = None
+optim_cfg = AdamWConfig(lr=1e-6, foreach=False, weight_decay=0.1)
 loss_cfg = GRPOLossConfig(
     policy_loss_cfg=dict(
-        cliprange_high=0.2,
+        cliprange_high=0.28,
         cliprange_low=0.2,
-        loss_type="vanilla",
+        loss_type=os.environ.get("LOSS_TYPE", "vanilla"),
+        clip_ratio_c=10.0,
+        log_prob_diff_min=-20.0,
+        log_prob_diff_max=20.0,
     ),
     ignore_idx=-100,
-    use_kl_loss=True,
-    kl_loss_coef=0.001,
+    use_kl_loss=False,
+    kl_loss_coef=0.0,
     kl_loss_type="low_var_kl",
-    mode="chunk",
+    mode=os.environ.get("LOSS_MODE", "chunk"),
     chunk_size=512,
 )
-lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6)
-fsdp_cfg = FSDPConfig(cpu_offload=False)
-train_worker_cfg: WorkerConfig = WorkerConfig(
+train_worker_cfg = WorkerConfig(
     model_cfg=model_cfg,
     load_from=model_path,
     optim_cfg=optim_cfg,
     loss_cfg=loss_cfg,
     lr_cfg=lr_cfg,
     fsdp_cfg=fsdp_cfg,
-    sp_size=1,
+    sp_size=int(os.environ.get("SP_SIZE", "1")),
     optimizer_steps=train_optimizer_steps,
     pack_max_length=pack_max_length,
 )
 
-# 6. agent loop managers
+# 5. train agent loop manager
+train_dataset_cfg = [
+    {
+        "dataset": DatasetConfig(name="geo3k",
+                                 anno_path=data_path,
+                                 class_name='VLMJsonlDataset',
+                                 media_root=media_root,
+                                 sample_ratio=1.0),
+        "tokenize_fn": RLQwen3VLTokenizeFnConfig(processor_path=model_path,
+                                                 max_length=max_prompt_length),
+    }
+]
+
+dataloader_cfg = DataloaderConfig(
+    dataset_config_list=train_dataset_cfg,
+    pack_max_length=pack_max_length,
+    collator="fake_collator",
+    pack_level="none",
+    num_workers=8,
+)
+sampler_config = SamplerConfig(
+    dataloader_cfg=dataloader_cfg,
+    prompt_repeat_k=prompt_repeat_k,
+)
+training_sample_params = SampleParams(
+    max_tokens=max_response_length,
+    top_k=0,
+    top_p=1.0,
+    temperature=1.0,
+    min_tokens=0,
+)
 agent_loop_config = SingleTurnAgentLoopConfig(
     hf_checkpoint=model_path,
     sample_params=training_sample_params,
 )
+produce_strategy_config = SyncProduceStrategyConfig()
 agent_loop_manager_cfg = AgentLoopManagerConfig(
     tasks=TaskSpecConfig(
         task_name="train_task",
         agent_loop_config=agent_loop_config,
         judger_config=judger_config,
-        produce_strategy_config=SyncProduceStrategyConfig(),
-        sampler_config=SamplerConfig(dataloader_cfg=dataloader_cfg, prompt_repeat_k=prompt_repeat_k),
+        produce_strategy_config=produce_strategy_config,
+        sampler_config=sampler_config,
     ),
 )
 
+# 6. eval agent loop manager
+eval_dataset_cfg = [
+    {
+        "dataset": DatasetConfig(name="geo3k",
+                                 anno_path=eval_data_path,
+                                 class_name='VLMJsonlDataset',
+                                 media_root=media_root,
+                                 sample_ratio=1.0),
+        "tokenize_fn": RLQwen3VLTokenizeFnConfig(processor_path=model_path,
+                                                 max_length=max_prompt_length,
+                                                 ignore_multimodal_info=True),
+    }
+]
+
+eval_dataloader_cfg = DataloaderConfig(
+    dataset_config_list=eval_dataset_cfg,
+    pack_max_length=pack_max_length,
+    collator="fake_collator",
+    pack_level="none",
+    num_workers=8,
+)
+eval_sampler_config = SamplerConfig(
+    dataloader_cfg=eval_dataloader_cfg,
+    prompt_repeat_k=1,
+)
+evaluation_sample_params = SampleParams(
+    max_tokens=max_response_length,
+    top_k=1,
+    top_p=1.0,
+    temperature=0.0,
+    min_tokens=0,
+)
 eval_agent_loop_config = SingleTurnAgentLoopConfig(
     hf_checkpoint=model_path,
     sample_params=evaluation_sample_params,
@@ -174,28 +198,32 @@
         task_name="eval_task",
         agent_loop_config=eval_agent_loop_config,
         judger_config=judger_config,
-        sampler_config=SamplerConfig(dataloader_cfg=eval_dataloader_cfg, prompt_repeat_k=1),
+        sampler_config=eval_sampler_config,
     ),
 )
 
-# 7. trainer
+# 7. evaluator
+evaluator_config = EvaluatorConfig(compute_metric_func=None)
+
+# 8. RL Colocate Trainer config (the CLI calls config["trainer"].build() to obtain the Trainer)
 trainer = RLColocateTrainerConfig(
     resources=resources,
-    train_worker_cfg=train_worker_cfg,
+    train_worker_cfg=train_worker_cfg,  # TODO: uniform naming of cfg and config
     rollout_config=rollout_config,
     tokenizer_path=model_path,
     replay_buffer_config=SyncReplayBufferConfig(),
     agent_loop_manager_cfg=agent_loop_manager_cfg,
     eval_agent_loop_manager_cfg=eval_agent_loop_manager_cfg,
-    evaluator_config=EvaluatorConfig(compute_metric_func=None),
+    evaluator_config=evaluator_config,
     load_from=model_path,
-    total_epochs=total_epochs,
-    train_batch_size=global_batch_size,
+    total_train_steps=total_train_steps,
+    train_batch_size=train_batch_size,
     advantage_estimator_config=GRPOAdvantageConfig(eps=1e-8),
-    enable_evaluate=enable_evaluate,
-    enable_initial_evaluate=enable_evaluate and enable_initial_evaluate,
+    enable_evaluate=True,
+    enable_initial_evaluate=False,
     evaluate_step=evaluate_step,
     work_dir=work_dir,
-    hf_interval=hf_interval,
+    seed=123,
+    debug_rollout=False,
     exp_tracker="jsonl",
 )
diff --git a/autotest/module/train.py b/autotest/module/train.py
index c813851d1..8883d1914 100644
--- a/autotest/module/train.py
+++ b/autotest/module/train.py
@@ -57,10 +57,11 @@ def get_cmd(config):
         return command, config
     elif train_type == "rl":
         infer_type = config.get("parameters", {}).get("infer_backend", "lmdeploy")
+        accelerator = config.get("parameters", {}).get("accelerator", "GPU")
         command = (
             f"cd {current_dir}; pwd; pip install -e .[all]; export GITHUB_RUN_ID={config.get('run_id')}; export WORK_DIR={work_dir}; "
             + cudnn_patch
-            + f"bash -x examples/v1/scripts/run_rl.sh {config_path} {infer_type} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}"
+            + f"bash -x autotest/utils/ci_run_rl.sh {accelerator} {infer_type} {config_path} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}"
         )
         return command, config
     else:
@@ -77,7 +78,7 @@ def validate(config):
         check_metrics = config.get("assert_info", {}).get("check_metrics", {})
         return check_result(config["case_name"], base_path, cur_path, check_metrics)
     elif train_type == "rl":
-        cur_path = os.path.join(get_latest_subdir(work_dir), "exp_tracking/tracker.jsonl")
+        cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/tracker.jsonl")
         check_metrics = config.get("assert_info", {})
         return check_rl_result(config["case_name"], base_path, cur_path, check_metrics)
     else:
diff --git a/autotest/utils/ci_run_rl.sh b/autotest/utils/ci_run_rl.sh
new file mode 100644
index 000000000..4ba764d12
--- /dev/null
+++ b/autotest/utils/ci_run_rl.sh
@@ -0,0 +1,208 @@
+set -ex
+ray stop --force
+# examples of usage:
+# qwen3_8B_grpo_gsm8k training:
+# bash autotest/utils/ci_run_rl.sh "GPU" "sglang" autotest/config/rl_qwen3_gsm8k_grpo.py $MODEL_PATH $DATA_PATH $EVAL_DATA_PATH
+
+ACCELERATOR=$1
+INFER_BACKEND=$2
+CONFIG_PATH=$3
+MODEL_PATH=$4
+DATA_PATH=$5
+EVAL_DATA_PATH=${6:-""}
+ACCELERATOR=$(echo "$ACCELERATOR" | tr '[:lower:]' '[:upper:]')
+if [ "$ACCELERATOR" != "GPU" ] && [ "$ACCELERATOR" != "NPU" ]; then
+    echo "Error: ACCELERATOR must be either 'gpu' or 'npu'!"
+    exit 1
+fi
+
+ulimit -n 65536
+
+if [ "$ACCELERATOR" = "NPU" ]; then
+    ACCELERATOR_PER_NODE=${7:-16}
+    yum install dnsutils -y
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh
+    source /usr/local/Ascend/nnal/atb/set_env.sh --cxx_abi=1
+    git config --global --add safe.directory "$PWD"
+    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver
+    export CPU_AFFINITY_CONF=0
+    export HF_HOME=/workspace
+    export HF_DATASETS_OFFLINE=1
+    export TRANSFORMERS_OFFLINE=1
+    export HF_EVALUATE_OFFLINE=1
+    export HF_HUB_OFFLINE=1
+    export UVICORN_LOG_LEVEL="CRITICAL"
+    export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
+    export RAY_CGRAPH_get_timeout=3600
+else
+    ACCELERATOR_PER_NODE=${7:-8}
+fi
+
+export PYTHONPATH=$(pwd):$PYTHONPATH
+
+# Ray environment variables
+export MASTER_PORT=6000
+export WORLD_SIZE=${NODE_COUNT:-"1"}
+export RANK=${NODE_RANK:-"0"}
+export RAY_MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+export RAY_RANK=${RANK:-0} # 0 = head node, >0 = worker node
+export RAY_HEAD_PORT=${RAY_HEAD_PORT:-"6379"}
+export RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-"8265"}
+
+# xtuner environment variables
+export MODEL_PATH=$MODEL_PATH
+export DATA_PATH=$DATA_PATH
+export EVAL_DATA_PATH=$EVAL_DATA_PATH
+export XTUNER_USE_FA3=${XTUNER_USE_FA3:-1}
+export XTUNER_LOG_LEVEL=${XTUNER_LOG_LEVEL:-"INFO"}
+export PYTHONUNBUFFERED=1
+
+infer_backend_lower=$(echo "$INFER_BACKEND" | tr '[:upper:]' '[:lower:]')
+if [ "$infer_backend_lower" = "sglang" ]; then
+    export XTUNER_USE_SGLANG=1
+    unset PYTORCH_CUDA_ALLOC_CONF
+    export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+    export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
+elif [ "$infer_backend_lower" = "lmdeploy" ]; then
+    export XTUNER_USE_LMDEPLOY=1
+    export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
+    export PYTHONPATH=$LMDEPLOY_PATH:$PYTHONPATH
+elif [ "$infer_backend_lower" = "vllm" ]; then
+    export XTUNER_USE_VLLM=1
+    export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
+else
+    echo "Error: INFER_BACKEND '$INFER_BACKEND' is not supported or not specified!"
+    exit 1
+fi
+
+current_time=$(date "+%m%d%H")
+# Use the last component of the model path as the model name and the parent directory name of the data path as the data name
+model_dir_name=$(basename "$MODEL_PATH")
+data_dir_name=$(basename "$(dirname "$DATA_PATH")")
+
+if [ "x$WORK_DIR" = "x" ]; then
+    DIR=$(pwd)
+    export WORK_DIR="${DIR}/work_dirs/${model_dir_name}_${data_dir_name}_${infer_backend_lower}"
+else
+    export WORK_DIR=$WORK_DIR
+fi
+echo "WORK_DIR: $WORK_DIR"
+if [ ! -d "$WORK_DIR" ]; then
+    mkdir -p "$WORK_DIR"
+fi
+
+export LMDEPLOY_LOG_FILE="${WORK_DIR}/lmdeploy_log_${current_time}.txt"
+if [ "$ACCELERATOR" = "GPU" ]; then
+    # TODO: support NPU RL Memory Monitor
+    export XTUNER_RL_MEM_DIR="${WORK_DIR}/mem_${current_time}"
+fi
+
+# 2. Launch Ray cluster
+# Node count (from NODE_COUNT); used below to compute the expected accelerator total
+node_count=${NODE_COUNT:-1}
+
+WORK_DIR=$(realpath "$WORK_DIR")
+if [ "$ACCELERATOR" = "GPU" ]; then
+    if [ "$RAY_RANK" -eq 0 ]; then
+        rm -rf /tmp/ray_log
+        export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
+        mkdir -p "${RAY_LOG_DIR}"
+        ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
+        ray start --head \
+            --node-ip-address="$RAY_MASTER_ADDR" \
+            --port="$RAY_HEAD_PORT" \
+            --dashboard-host=0.0.0.0 \
+            --dashboard-port=$RAY_DASHBOARD_PORT \
+            --include-dashboard=true \
+            --disable-usage-stats \
+            --temp-dir="/tmp/ray_log/"
+    else
+        while true; do
+            if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
+                echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
+                break
+            else
+                echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
+                sleep 2
+            fi
+        done
+        ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
+    fi
+
+    while true; do
+        result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
+        expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
+        if [ "$result" = "$expected_accelerator_count.0" ]; then
+            break
+        else
+            echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
+            sleep 2
+        fi
+    done
+
+    SCRIPT_NAME=$(basename "$0")
+    cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"
+    cp "$CONFIG_PATH" "${WORK_DIR}/config.py"
+    LOG_FILE="${WORK_DIR}/training_log_${current_time}.txt"
+
+    python xtuner/v1/train/cli/rl.py \
+        --config "$CONFIG_PATH" \
+        2>&1 | tee -a "${LOG_FILE}"
+elif [ "$ACCELERATOR" = "NPU" ]; then
+    if [ "$RAY_RANK" -eq 0 ]; then
+        RAY_memory_monitor_refresh_ms=0 ray start --head \
+            --node-ip-address="$RAY_MASTER_ADDR" \
+            --port="$RAY_HEAD_PORT" \
+            --dashboard-host=0.0.0.0 \
+            --dashboard-port=$RAY_DASHBOARD_PORT \
+            --include-dashboard=true \
+            --disable-usage-stats
+    else
+        RAY_RESOLVED_IP=$(nslookup $MASTER_ADDR | awk '/^Address: / { addr=$2 } END { print addr }')
+        if [ -z "$RAY_RESOLVED_IP" ]; then
+            echo "Warning: nslookup failed, falling back to original MASTER_ADDR"
+            RAY_RESOLVED_IP=$MASTER_ADDR
+        fi
+        echo "Resolved Master IP from $MASTER_ADDR: $RAY_RESOLVED_IP"
+        WAIT_TIME=$(awk "BEGIN {print 10 + 1.5 * $RAY_RANK}")
+        echo "Rank $RAY_RANK is sleeping for $WAIT_TIME seconds before connecting to Head..."
+        sleep $WAIT_TIME
+        export RAY_gcs_rpc_server_reconnect_timeout_s=600
+        export RAY_gcs_grpc_max_reconnect_attempts=100
+        export RAY_rpc_grpc_timeout_ms=30000
+        RAY_memory_monitor_refresh_ms=0 ray start --address="$RAY_RESOLVED_IP:$RAY_HEAD_PORT" \
+            --node-ip-address="$(hostname -i)" \
+            --block \
+            --disable-usage-stats
+    fi
+
+    sleep 10
+
+    SCRIPT_NAME=$(basename "$0")
+    cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"
+    cp "$CONFIG_PATH" "${WORK_DIR}/config.py"
+
+    echo OUPUT_DIR is ${WORK_DIR}
+    if [ "$RAY_RANK" -eq 0 ]; then
+        RUNTIME_ENV_JSON="{
+            \"env_vars\": {
+                \"XTUNER_LOG_LEVEL\": \"${XTUNER_LOG_LEVEL}\",
+                \"PYTHONPATH\": \"${PYTHONPATH}\",
+                \"MASTER_ADDR\": \"${RAY_MASTER_ADDR}\",
+                \"XTUNER_USE_SGLANG\": \"${XTUNER_USE_SGLANG:-}\",
+                \"XTUNER_USE_LMDEPLOY\": \"${XTUNER_USE_LMDEPLOY:-}\",
+                \"XTUNER_USE_VLLM\": \"${XTUNER_USE_VLLM:-}\",
+                \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF:-}\",
+                \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+                \"RAY_CGRAPH_get_timeout\": \"3600\"
+            }
+        }"
+
+        ray job submit --address="http://127.0.0.1:$RAY_DASHBOARD_PORT" \
+            --runtime-env-json="$RUNTIME_ENV_JSON" \
+            -- python xtuner/v1/train/cli/rl.py \
+            --config $CONFIG_PATH \
+            2>&1 | tee -a "${WORK_DIR}/training_log.txt"
+        echo "训练任务提交完成。日志文件: ${WORK_DIR}/training_log.txt"
+    fi
+fi