From 30ebad3d58ee3a25efa3b2754e790fb3e3a22040 Mon Sep 17 00:00:00 2001 From: liukuikun <641417025@qq.com> Date: Wed, 27 May 2026 03:23:51 +0000 Subject: [PATCH] remove recipe --- .../lagent/agent_rl_interns1_pro_mini_grpo.py | 431 ------------ recipe/lagent/agent_rl_trainer.py | 642 ------------------ recipe/lagent/environment/agent_env.py | 121 ---- recipe/lagent/environment/composed_env.py | 42 -- recipe/lagent/environment/lagent/__init__.py | 0 .../environment/lagent/agents/__init__.py | 3 - .../environment/lagent/agents/env_agent.py | 145 ---- .../lagent/agents/jugder_wrapper.py | 53 -- .../environment/lagent/agents/tito_agent.py | 68 -- .../environment/lagent/llms/__init__.py | 1 - .../lagent/llms/controller_wrapper.py | 79 --- recipe/lagent/environment/lagent/parsers.py | 122 ---- recipe/lagent/environment/lagent/schema.py | 84 --- recipe/lagent/environment/lagent/tokenize.py | 135 ---- recipe/tb2_eval/__init__.py | 6 - .../infer/agents/interndp/__init__.py | 1 - .../tb2_eval/infer/agents/interndp/config.py | 97 --- .../infer/agents/interndp/install-deps.sh | 5 - .../infer/agents/interndp/tools/__init__.py | 1 - recipe/tb2_eval/infer/setup/pre_entry.sh | 23 - recipe/tb2_eval/judgers/rule_grader/grader.py | 106 --- recipe/tb2_eval/judgers/rule_grader/run.sh | 44 -- recipe/tb2_eval/local_run/__init__.py | 0 recipe/tb2_eval/local_run/__main__.py | 170 ----- recipe/tb2_eval/local_run/config.py | 15 - recipe/tb2_eval/local_run/dataset.py | 71 -- recipe/tb2_eval/pipeline.py | 323 --------- recipe/tb2_eval/scripts/generate_jsonl.py | 104 --- recipe/tb2_eval/xtuner_dataset.py | 119 ---- recipe/tb2_rl/__init__.py | 6 - .../tb2_rl/infer/agents/interndp/__init__.py | 0 recipe/tb2_rl/infer/agents/interndp/config.py | 101 --- .../infer/agents/interndp/install-deps.sh | 5 - .../infer/agents/interndp/tools/__init__.py | 0 recipe/tb2_rl/infer/setup/pre_entry.sh | 24 - recipe/tb2_rl/judgers/rule_grader/grader.py | 101 --- recipe/tb2_rl/judgers/rule_grader/run.sh | 40 -- recipe/tb2_rl/local_run/__init__.py | 0 recipe/tb2_rl/local_run/__main__.py | 175 ----- recipe/tb2_rl/local_run/config.py | 15 - recipe/tb2_rl/local_run/dataset.py | 70 -- recipe/tb2_rl/pipeline.py | 319 --------- recipe/tb2_rl/scripts/generate_jsonl.py | 114 ---- recipe/tb2_rl/xtuner_dataset.py | 119 ---- 44 files changed, 4100 deletions(-) delete mode 100644 recipe/lagent/agent_rl_interns1_pro_mini_grpo.py delete mode 100644 recipe/lagent/agent_rl_trainer.py delete mode 100644 recipe/lagent/environment/agent_env.py delete mode 100644 recipe/lagent/environment/composed_env.py delete mode 100644 recipe/lagent/environment/lagent/__init__.py delete mode 100644 recipe/lagent/environment/lagent/agents/__init__.py delete mode 100644 recipe/lagent/environment/lagent/agents/env_agent.py delete mode 100644 recipe/lagent/environment/lagent/agents/jugder_wrapper.py delete mode 100644 recipe/lagent/environment/lagent/agents/tito_agent.py delete mode 100644 recipe/lagent/environment/lagent/llms/__init__.py delete mode 100644 recipe/lagent/environment/lagent/llms/controller_wrapper.py delete mode 100644 recipe/lagent/environment/lagent/parsers.py delete mode 100644 recipe/lagent/environment/lagent/schema.py delete mode 100644 recipe/lagent/environment/lagent/tokenize.py delete mode 100644 recipe/tb2_eval/__init__.py delete mode 100644 recipe/tb2_eval/infer/agents/interndp/__init__.py delete mode 100644 recipe/tb2_eval/infer/agents/interndp/config.py delete mode 100644 recipe/tb2_eval/infer/agents/interndp/install-deps.sh delete mode 100644 recipe/tb2_eval/infer/agents/interndp/tools/__init__.py delete mode 100644 recipe/tb2_eval/infer/setup/pre_entry.sh delete mode 100644 recipe/tb2_eval/judgers/rule_grader/grader.py delete mode 100644 recipe/tb2_eval/judgers/rule_grader/run.sh delete mode 100644 recipe/tb2_eval/local_run/__init__.py delete mode 100644 recipe/tb2_eval/local_run/__main__.py delete mode 100644 recipe/tb2_eval/local_run/config.py delete mode 100644 recipe/tb2_eval/local_run/dataset.py delete mode 100644 recipe/tb2_eval/pipeline.py delete mode 100644 recipe/tb2_eval/scripts/generate_jsonl.py delete mode 100644 recipe/tb2_eval/xtuner_dataset.py delete mode 100644 recipe/tb2_rl/__init__.py delete mode 100644 recipe/tb2_rl/infer/agents/interndp/__init__.py delete mode 100644 recipe/tb2_rl/infer/agents/interndp/config.py delete mode 100755 recipe/tb2_rl/infer/agents/interndp/install-deps.sh delete mode 100644 recipe/tb2_rl/infer/agents/interndp/tools/__init__.py delete mode 100755 recipe/tb2_rl/infer/setup/pre_entry.sh delete mode 100755 recipe/tb2_rl/judgers/rule_grader/grader.py delete mode 100755 recipe/tb2_rl/judgers/rule_grader/run.sh delete mode 100644 recipe/tb2_rl/local_run/__init__.py delete mode 100644 recipe/tb2_rl/local_run/__main__.py delete mode 100644 recipe/tb2_rl/local_run/config.py delete mode 100644 recipe/tb2_rl/local_run/dataset.py delete mode 100644 recipe/tb2_rl/pipeline.py delete mode 100644 recipe/tb2_rl/scripts/generate_jsonl.py delete mode 100644 recipe/tb2_rl/xtuner_dataset.py diff --git a/recipe/lagent/agent_rl_interns1_pro_mini_grpo.py b/recipe/lagent/agent_rl_interns1_pro_mini_grpo.py deleted file mode 100644 index a2fed1998f..0000000000 --- a/recipe/lagent/agent_rl_interns1_pro_mini_grpo.py +++ /dev/null @@ -1,431 +0,0 @@ -import json -import os -from copy import deepcopy - -import ray -from lagent.actions.mcp_client import AsyncMCPClient -from lagent.agents.fc_agent import FunctionCallAgent, get_tool_prompt -from ray.util.placement_group import placement_group - -from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig -from xtuner.v1.data_proto.rl_data import ( - RLAgentDataItem, - RLDataFlowItem, - RLJudgerResponseItem, - RLRolloutResponseItem, - RolloutState, - SampleParams, - update_dataflow_item, -) -from xtuner.v1.datasets import DatasetConfig, Qwen3VLTokenizeFnConfig -from xtuner.v1.datasets.config import DataloaderConfig -from xtuner.v1.datasets.rl_tokenize_fn.rl_tokenize_fn import RLTokenizeFnConfig -from xtuner.v1.model import Qwen3VLMoE30BA3Config -from xtuner.v1.module.rope.rope import RopeScalingConfig -from xtuner.v1.ray.base import ( - AcceleratorResourcesConfig, - AutoAcceleratorWorkers, - CPUResourcesConfig, -) -from xtuner.v1.ray.config.worker import RolloutConfig -from xtuner.v1.ray.dataflow import DataFlowConfig, ReplayBufferConfig -from xtuner.v1.ray.environment.agent_env import AgentEnvironment -from xtuner.v1.ray.environment.composed_env import ComposedEnvironment -from xtuner.v1.ray.environment.lagent.agents import ( - AsyncTokenInOutAgent, - EnvAgent, - JudgerWrapper, - finish_condition_func, -) -from xtuner.v1.ray.environment.lagent.llms.controller_wrapper import ControllerWrapper -from xtuner.v1.ray.environment.lagent.schema import AgentMessage -from xtuner.v1.ray.evaluator import EvaluatorConfig -from xtuner.v1.ray.judger.compass_verifier_v2 import CompassVerifierV2Config -from xtuner.v1.ray.judger.controller import JudgerConfig -from xtuner.v1.ray.rollout import RolloutController -from xtuner.v1.rl.base import WorkerConfig -from xtuner.v1.rl.base.rollout_is import RolloutImportanceSampling -from xtuner.v1.rl.grpo import GRPOLossConfig -from xtuner.v1.train.agent_rl_trainer import AgentRLTrainerConfig -from xtuner.v1.train.trainer import LoadCheckpointConfig -from xtuner.v1.utils.compute_metric import compute_metric - -experimental_name = 'agent_rl_interns1_mini' -work_dir = os.environ["WORK_DIR"] -model_path = os.environ["MODEL_PATH"] - -# basic settings -global_batch_size = 8 -prompt_repeat_k = 8 -max_concurrent_groups = 512 - -max_prompt_length = 4096 -pack_max_length = 68 * 1024 -max_response_length = 64 * 1024 - -train_ep_size = 1 -rollout_tp_size = 1 -rollout_ep_size = 1 -enable_float8_rollout = False -rollout_max_batch_size = 1024 -max_prefill_token_num = 1024 -enable_return_routed_experts = True -enable_partial_rollout = False -auto_resume = False -skip_load_weights = False -lr = 1e-6 -train_optimizer_steps = 8 # mini batch steps -hf_interval = 5 -total_epochs = 10 -sp_size = 1 -# evaluation settings -enable_evaluate = True -enable_initial_evaluate = True -evaluate_step = 5 - -# 1. resources -resources = AcceleratorResourcesConfig( - accelerator="GPU", - num_workers=8, - num_cpus_per_worker=12, - cpu_memory_per_worker=16 * 1024**3, # 16 GB -) -judger_cpu_resources = CPUResourcesConfig.from_total(total_cpus=16, num_workers=16, total_memory=64 * 1024**3) - -# 2. rollout -rollout_config = RolloutConfig( - env=experimental_name, - device=resources.accelerator, - model_path=model_path, - dtype="bfloat16", - tensor_parallel_size=rollout_tp_size, - expert_parallel_size=rollout_ep_size, - gpu_memory_utilization=0.7, - enable_float8=enable_float8_rollout, - skip_load_weights=skip_load_weights, - context_length=max_response_length, - rollout_max_batch_size_per_instance=rollout_max_batch_size, - allow_over_concurrency_ratio=2, - rollout_timeout=36000, - enable_return_routed_experts=enable_return_routed_experts, - router_n_groups=8, - max_prefill_token_num=max_prefill_token_num, - extra_rollout_config=dict(lmdeploy_log_level="ERROR", lmdeploy_uvicorn_log_level="ERROR"), -) - -# sampling params -training_sample_params = SampleParams( - max_tokens=max_response_length, top_k=0, top_p=0.999, temperature=1.0, min_tokens=0 -) -evaluation_sample_params = deepcopy(training_sample_params) -evaluation_sample_params.temperature = 0.8 - -# 2. dataset - - -def parse_xpuyu_json_cfg(path, tokenize_fn_cfg, max_prompt_length, data_judger_mapping, ignore_multimodal_info=False): - with open(path, "r") as f: - json_cfg = json.load(f) - converted_cfg = [] - for ds_name, ds_cfg in json_cfg.items(): - annotation = ds_cfg["annotation"] - if isinstance(annotation, str): - annotation = [annotation] - for ann in annotation: - converted_cfg.append( - { - "dataset": DatasetConfig( - name=ds_name, - anno_path=ann, - sample_ratio=ds_cfg["sample_ratio"], - media_root=ds_cfg.get("media_root", None), - class_name='VLMJsonlDataset', - ), - "tokenize_fn": RLTokenizeFnConfig( - tokenize_fn_cfg=tokenize_fn_cfg, - system_prompt=ds_cfg.get("system_message", None), - max_length=max_prompt_length, - data_judger_mapping=data_judger_mapping, - ignore_multimodal_info=ignore_multimodal_info, - ), - } - ) - return converted_cfg - - -data_judger_mapping = { - 'WebSearch': {"compass_verifier_v2": 1.0}, - 'BrowseComp-ZH': {"compass_verifier_v2": 1.0}, - 'HLE': {"compass_verifier_v2": 1.0}, -} -tokenize_fn_cfg = Qwen3VLTokenizeFnConfig( - processor_path=model_path, enable_3d_rope=False, debug=True, chat_template='qwen3-vl-rl' -) -train_dataset_cfg = parse_xpuyu_json_cfg( - os.environ['TRAIN_DATA_PATH'], tokenize_fn_cfg, max_prompt_length, data_judger_mapping -) -eval_dataset_cfg = parse_xpuyu_json_cfg( - os.environ['EVAL_DATA_PATH'], tokenize_fn_cfg, max_prompt_length, data_judger_mapping, ignore_multimodal_info=True -) -dataloader_config = DataloaderConfig(pack_max_length=pack_max_length, collator="fake_collator", pack_level="none") - -# 3. judger -judger_cfg = JudgerConfig(reward_judger_configs=[CompassVerifierV2Config(hosts=[])]) - - -def prepare_agent_inputs(env, group_data_item: RLDataFlowItem): - env_agent, session_id = env.agent.env_agent, group_data_item.uid.observation_id - user_prompt = group_data_item.data.messages[-1]['content'] - env_message = AgentMessage(role="env", content=user_prompt) - if session_id not in env_agent.memory.memory_map or not env_agent.memory.get_memory(session_id): - set_env_message = AgentMessage(role="env", content=group_data_item) - env_agent.update_memory(set_env_message, session_id=session_id) - return (env_message,) - - -def convert_rollout_tractory_to_train(env, group_data_items): - agent_data_items, rollout_response_items, judger_response_items = [], [], [] - for i in range(len(group_data_items)): - session_id = group_data_items[i].uid.observation_id - history = env.agent.select_agent.state_dict(session_id=session_id)['memory'] - messages = env.agent.get_messages(session_id, keypath='select_agent') - agent_data_items.append(RLAgentDataItem(extra_info=dict(messages=messages, state={"history": history}))) - rollout_response_items.append( - RLRolloutResponseItem( - response=history[-1]['raw_content'], - response_ids=history[-1]['raw_content_ids'], - logprobs=history[-1]['raw_content_logprobs'], - state=RolloutState.COMPLETED, - ) - ) - env_history = env.agent.env_agent.state_dict(session_id=session_id)['memory'] - judger_response_items.append(RLJudgerResponseItem(reward=dict(score=env_history[-1]['reward']))) - # reset agent memory - env.agent.reset(session_id=session_id, recursive=True) - group_data_items = update_dataflow_item(group_data_items, "env.agent", agent_data_items) - group_data_items = update_dataflow_item(group_data_items, "env.rollout", rollout_response_items) - group_data_items = update_dataflow_item(group_data_items, "env.judger", judger_response_items) - return group_data_items - - -pg = AutoAcceleratorWorkers.build_placement_group(resources) -rollout_controller = ray.remote(max_concurrency=1000)(RolloutController).remote(rollout_config, pg) -load_checkpoint_cfg = LoadCheckpointConfig(load_optimizer_states=False, load_optimizer_args=False) - -actions = [ - dict(type=AsyncMCPClient, name='SerperSearch', server_type='http', rate_limit=100.0, max_concurrency=30, url=[]), - dict(type=AsyncMCPClient, name='JinaBrowse', server_type='http', rate_limit=100.0, max_concurrency=40, url=[]), -] -tool_prompt = get_tool_prompt(actions) - -train_agent = dict( - type=FunctionCallAgent, - select_agent=dict( - type=AsyncTokenInOutAgent, - llm=dict( - type=ControllerWrapper, - rollout_controller=rollout_controller, - sample_params=SampleParams(max_tokens=max_response_length), - ), - template=tool_prompt, - ), - env_agent=dict( - type=EnvAgent, - actions=actions, - judger=dict( - type=JudgerWrapper, - judger_cfg=judger_cfg, - placement_group=ray.get( - placement_group(bundles=[{"CPU": 1, "memory": 1024**3}], strategy="PACK").ready(), timeout=30 - ), - ), - max_turn=25, - enable_no_thinking_penalty=False, - max_tool_response_length=4096, - ), - finish_condition=finish_condition_func, -) - -eval_agent = dict( - type=FunctionCallAgent, - select_agent=dict( - type=AsyncTokenInOutAgent, - llm=dict( - type=ControllerWrapper, - rollout_controller=rollout_controller, - sample_params=SampleParams(max_tokens=max_response_length), - ), - template=tool_prompt, - ), - env_agent=dict( - type=EnvAgent, - actions=actions, - judger=dict( - type=JudgerWrapper, - judger_cfg=judger_cfg, - placement_group=ray.get( - placement_group(bundles=[{"CPU": 1, "memory": 1024**3}], strategy="PACK").ready(), timeout=30 - ), - ), - max_turn=25, - enable_no_thinking_penalty=False, - max_tool_response_length=4096, - ), - finish_condition=finish_condition_func, -) - - -def rollout_env_router_fn(item: RLDataFlowItem): - if item.data.extra_info['origin_data_source'] in ['BrowseComp-ZH', 'HLE']: - return 'eval' - return 'train_agent' - - -environment_config = dict( - type=ComposedEnvironment, - environment=experimental_name, - rollout_controller=rollout_controller, - environments={ - 'train_agent': dict( - type=AgentEnvironment, - environment='websailor', - agent_cfg=train_agent, - rollout_controller=rollout_controller, - preprocess_func=prepare_agent_inputs, - postprocess_func=convert_rollout_tractory_to_train, - ), - 'eval': dict( - type=AgentEnvironment, - environment='eval', - agent_cfg=eval_agent, - rollout_controller=rollout_controller, - preprocess_func=prepare_agent_inputs, - postprocess_func=convert_rollout_tractory_to_train, - ), - }, - router=rollout_env_router_fn, -) - -# 4. dataflow and evaluator -dataflow_config = DataFlowConfig( - env=experimental_name, - max_concurrent=max_concurrent_groups, - enable_partial_rollout=enable_partial_rollout, - max_retry_times=3, - prompt_repeat_k=prompt_repeat_k, - global_batch_size=global_batch_size, - sample_params=training_sample_params, -) - -evaluator_cfg = ( - EvaluatorConfig( - enable_evaluate=enable_evaluate, - enable_initial_evaluate=enable_initial_evaluate, - dataset_cfg=eval_dataset_cfg, - tokenizer=model_path, - eval_sample_ratio=1.0, - evaluate_step=evaluate_step, - compute_metric_func=compute_metric, - sample_params=evaluation_sample_params, - max_concurrent=8192, - ) - if enable_evaluate - else None -) - -replay_buffer_cfg = ReplayBufferConfig( - dataset_cfg=train_dataset_cfg, - dataloader_cfg=dataloader_config, - tokenizer=model_path, - # postprocessor_func=group_sample_filter_func, -) - -# # 5. Train worker -model_cfg = Qwen3VLMoE30BA3Config() -model_cfg.compile_cfg = False -model_cfg.freeze_vision = True -model_cfg.freeze_projector = True -model_cfg.vision_config.depth = 24 -model_cfg.vision_config.hidden_size = 1024 -model_cfg.vision_config.intermediate_size = 4096 -model_cfg.vision_config.deepstack_visual_indexes = [] - -model_cfg.projector_config.vision_hidden_size = 1024 -model_cfg.projector_config.deepstack_visual_indexes = [] - -model_cfg.text_config.max_position_embeddings = 32768 -model_cfg.text_config.rope_theta = 1000000 -model_cfg.text_config.rope_scaling_cfg = RopeScalingConfig( - fope_init_factor=0.1, - fope_sep_head=True, - num_inv_freq=None, -) -model_cfg.text_config.vocab_size = 155008 -model_cfg.text_config.freeze_routers = True -model_cfg.text_config.balancing_loss_cfg = None - - -optim_cfg = AdamWConfig( - lr=lr, - betas=(0.9, 0.95), - max_grad_norm=1.0, - weight_decay=0.1, - foreach=False, - skip_grad_norm_threshold=0.9, - eps=1e-15, -) -loss_cfg = GRPOLossConfig( - policy_loss_cfg=dict( - cliprange_high=0.2, - cliprange_low=0.2, - loss_type="vanilla", - ), - ignore_idx=-100, - use_kl_loss=False, - kl_loss_coef=0.0, - kl_loss_type="low_var_kl", - mode="chunk", - chunk_size=512, - rollout_is=RolloutImportanceSampling( - rollout_is_level="token", - rollout_is_mode="both", - rollout_is_threshold=(5, 0), - rollout_is_mask_threshold=(5, 0.5), - rollout_is_veto_threshold=(20, 0), - ), -) -lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=lr) -fsdp_cfg = FSDPConfig(torch_compile=False, cpu_offload=False, ep_size=train_ep_size) -train_worker_cfg: WorkerConfig = WorkerConfig( - model_cfg=model_cfg, - load_from=model_path, - optim_cfg=optim_cfg, - loss_cfg=loss_cfg, - lr_cfg=lr_cfg, - fsdp_cfg=fsdp_cfg, - sp_size=sp_size, - optimizer_steps=train_optimizer_steps, - pack_max_length=pack_max_length, -) - -# 6. RL Trainer -trainer = AgentRLTrainerConfig( - load_from=model_path, - pg=pg, - environment_config=environment_config, - dataflow_config=dataflow_config, - replay_buffer_config=replay_buffer_cfg, - train_worker_cfg=train_worker_cfg, - evaluator_config=evaluator_cfg, - tokenizer_path=model_path, - work_dir=work_dir, - total_epochs=total_epochs, - hf_interval=hf_interval, - skip_load_weights=skip_load_weights, - auto_resume=auto_resume, - checkpoint_interval=2, - checkpoint_maxkeep=1, - load_checkpoint_cfg=load_checkpoint_cfg, - checkpoint_no_save_optimizer=True, - skip_checkpoint_validation=True, -) diff --git a/recipe/lagent/agent_rl_trainer.py b/recipe/lagent/agent_rl_trainer.py deleted file mode 100644 index 481f347d09..0000000000 --- a/recipe/lagent/agent_rl_trainer.py +++ /dev/null @@ -1,642 +0,0 @@ -import json -import os -import random -from collections import defaultdict -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -from typing import Any, Dict, Literal, cast - -import matplotlib.pyplot as plt -import numpy as np -import ray -import torch -from mmengine.dist import get_rank -from pydantic import BaseModel, ConfigDict, field_serializer, model_validator -from ray.actor import ActorClass -from ray.util.placement_group import PlacementGroup -from typing_extensions import Self - -from transformers import AutoTokenizer -from xtuner.v1.data_proto.sequence_context import SequenceContext -from xtuner.v1.patch import patch_default_save_plan -from xtuner.v1.ray.dataflow import DataFlow, DataFlowConfig, ReplayBufferConfig -from xtuner.v1.ray.environment.lagent.tokenize import tokenize -from xtuner.v1.ray.evaluator import Evaluator, EvaluatorConfig -from xtuner.v1.rl.base import WorkerConfig -from xtuner.v1.train.trainer import LoadCheckpointConfig -from xtuner.v1.utils import is_hf_model_path, timer -from xtuner.v1.utils.device import get_device, get_torch_device_module -from xtuner.v1.utils.env_check import get_rollout_engine_version - -from .rl_trainer import ( - TRAINER_RAY_GET_TIMEOUT, - RLTrainer, - RLTrainerConfig, - bind_train_rollout, -) - - -# TODO: Move DEVICE to `xtuner.utils.device` -DEVICE = get_device() -DEVICE_MODULE = get_torch_device_module() - - -class AgentRLTrainerConfig(BaseModel): - model_config = ConfigDict(extra="forbid") - - load_from: str | Path # Huggingface model path or saved trainer_path - pg: Any - environment_config: dict - dataflow_config: DataFlowConfig - replay_buffer_config: ReplayBufferConfig - train_worker_cfg: WorkerConfig - evaluator_config: EvaluatorConfig | None = None - tokenizer_path: str | Path - work_dir: Path | str | None = None - log_dir: Path | str | None = None - total_epochs: int - auto_resume: bool = False - load_checkpoint_cfg: LoadCheckpointConfig = LoadCheckpointConfig() - checkpoint_interval: int | None = -1 - checkpoint_maxkeep: int | None = -1 - checkpoint_no_save_optimizer: bool = False - skip_checkpoint_validation: bool = False # Suggest enabled if fsdp_size is larger than 512 - hf_interval: int | None = None - hf_max_keep: int | None = None - seed: int = 42 - debug: bool = False - debug_rollout: bool = False - rollout_steps: int | None = None - exp_tracker: Literal["tensorboard", "jsonl"] = "tensorboard" - display_all_workers_log: bool = False - skip_load_weights: bool = False - - @model_validator(mode="after") - def _convert_work_dir(self): - if isinstance(self.work_dir, str): - self.work_dir = Path(self.work_dir) - elif self.work_dir is None: - self.work_dir = Path.cwd() - return self - - @field_serializer("replay_buffer_config") - def serialize_replay_buffer_cfg(self, replay_buffer_config: ReplayBufferConfig) -> str: - return replay_buffer_config.model_dump(include={"replay_ratio", "replay_weights"}) # type: ignore[return-value] - - @field_serializer("evaluator_config") - def serialize_evaluator_cfg(self, evaluator_config: EvaluatorConfig) -> str: # type: ignore[return-value] - if evaluator_config: - return evaluator_config.model_dump(exclude={"tokenizer", "dataset_cfg", "compute_metric_func"}) # type: ignore[return-value] - else: - return "" - - @field_serializer("pg") - def serialize_pg(self, pg: PlacementGroup) -> str: - return f"PlacementGroup(id={pg.id})" - - @field_serializer("environment_config") - def serialize_environment_config(self, environment_config: dict) -> str: - return str(environment_config) - - -class AgentRLTrainer(RLTrainer): - def __init__( - self, - *, - load_from: str | Path, # Huggingface model path or saved trainer_path - pg: PlacementGroup, - environment_config: Dict, - dataflow_config: DataFlowConfig, - replay_buffer_config: ReplayBufferConfig, - train_worker_cfg: WorkerConfig, - evaluator_config: EvaluatorConfig | None = None, - tokenizer_path: str | Path, - work_dir: Path | str | None = None, - log_dir: Path | str | None = None, - total_epochs: int, - auto_resume: bool = False, - load_checkpoint_cfg: LoadCheckpointConfig = LoadCheckpointConfig(), - checkpoint_interval: int | None = -1, - checkpoint_maxkeep: int | None = -1, - checkpoint_no_save_optimizer: bool = False, - skip_checkpoint_validation: bool = False, # - hf_interval: int | None = None, - hf_max_keep: int | None = None, - seed: int = 42, - debug: bool = False, - debug_rollout: bool = False, - rollout_steps: int | None = None, - exp_tracker: Literal["tensorboard", "jsonl"] = "tensorboard", - display_all_workers_log: bool = False, - trainer_cfg: RLTrainerConfig | None = None, - skip_load_weights: bool = False, - ): - """Initialize the RL training system.""" - if os.environ.get("XTUNER_USE_FA3", "0") == "1": - try: - from xtuner.v1.ops.flash_attn import get_flash_attn_varlen - - get_flash_attn_varlen() - except RuntimeError as e: - raise RuntimeError( - f"Flash attention v3 runtime error {e}, Please install it first or set XTUNER_USE_FA3=0." - ) - train_worker_cfg.load_from = load_from - - self._total_epochs = total_epochs - self._cur_step = 0 - - if skip_checkpoint_validation: - patch_default_save_plan() - - self._rl_trainer_cfg = trainer_cfg - self._load_from = Path(load_from) if isinstance(load_from, str) else load_from - - is_hf_path, error_info = is_hf_model_path(load_from) if load_from is not None else False, "" - self._load_from_hf = is_hf_path - - if not self._load_from_hf: - raise NotImplementedError(error_info) - - self._hf_max_keep = hf_max_keep - self._hf_interval = hf_interval - self._checkpoint_interval = checkpoint_interval - self._checkpoint_maxkeep = checkpoint_maxkeep - self._checkpoint_no_save_optimizer = checkpoint_no_save_optimizer - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) - - self._debug = debug - self._debug_rollout = debug_rollout - self._seed = seed - self._set_deterministic() - self._set_random_seed(seed) - - if work_dir is None: - work_dir = Path.cwd() / "work_dir" - - if isinstance(work_dir, str): - work_dir = Path(work_dir) - - if get_rank() == 0: - work_dir.mkdir(parents=True, exist_ok=True) - - self._work_dir = work_dir - self._auto_resume = auto_resume - self._meta = self._init_xtuner_meta(work_dir, self._auto_resume) - - if log_dir is None: - log_dir = self.exp_dir - if isinstance(log_dir, str): - log_dir = Path(log_dir) - - self.logger = self._init_logger(log_dir) - - self._load_checkpoint_cfg = self._resolve_load_checkpoint_cfg(self._auto_resume, load_checkpoint_cfg) - - train_worker_cfg.log_dir = log_dir - dataflow_config.worker_log_dir = log_dir - # if rollout_config is not None: - # rollout_config.worker_log_dir = log_dir - # self._enable_return_routed_experts = rollout_config.enable_return_routed_experts - self._enable_evaluate = False - self._enable_initial_evaluate = False - if evaluator_config: - evaluator_config.worker_log_dir = log_dir - self._enable_evaluate = evaluator_config.enable_evaluate - self._enable_initial_evaluate = evaluator_config.enable_initial_evaluate - self._pg = pg - # We need to build train controller first, and then build rollout dataflow to make - # inference engines know how much memory they can utilize. - self._train_controller = self._build_train_controller(train_worker_cfg) - - if self._load_checkpoint_cfg.checkpoint_path is not None: - skip_load_weights = True - # rollout_config.skip_load_weights = True - self.logger.info( - f"Skip load rollout weights due to resume from checkpoint {self._load_checkpoint_cfg.checkpoint_path}" - ) - - # resume train worker - ray.get(self._train_controller.resume.remote(self._load_checkpoint_cfg)) - - train_state_path = Path(self._load_checkpoint_cfg.checkpoint_path) / self._SAVE_TRAIN_STATE_PATH - with train_state_path.open("r") as f: - train_state = json.load(f) - self._cur_step = train_state["cur_step"] - - self._rollout_env_controller, self._rollout_dataflow = self._build_rollout_dataflow( - environment_cfg=environment_config, - dataflow_cfg=dataflow_config, - replay_buffer_config=replay_buffer_config, - ) - self._dataflow_partial_rollout_step = dataflow_config.tail_batch_candidate_steps - - if self._load_checkpoint_cfg.checkpoint_path is not None: - # resume rollout dataflow - self.logger.info(f"Resume rollout dataflow from checkpoint {self._load_checkpoint_cfg.checkpoint_path}") - ray.get(self._rollout_dataflow.resume.remote(self._load_checkpoint_cfg.checkpoint_path)) # type: ignore[union-attr] - - if self._enable_evaluate and evaluator_config: - self._evaluator = Evaluator.remote(evaluator_config, self._rollout_env_controller) # type: ignore[attr-defined, union-attr] - self._eval_step = evaluator_config.evaluate_step - else: - pass - - self._global_batch_size = dataflow_config.global_batch_size - self._rollout_steps = ( - ray.get(self._rollout_dataflow.get_train_dataset_length.remote()) # type: ignore[attr-defined] - // dataflow_config.global_batch_size - * total_epochs - ) - if rollout_steps is not None: - self._rollout_steps = rollout_steps - self.logger.info(f"Set rollout steps to {self._rollout_steps} according to rollout_steps arg") - - bind_train_rollout(train_controller=self._train_controller, env_controller=self._rollout_env_controller) - # update weights if rollout_config.skip_load_weights == True - # if rollout_config.skip_load_weights: - if skip_load_weights: - self.logger.info("Rollout workers skip load weights, update weights from train workers.") - ray.get(self._train_controller.offload.remote(target="optimizer")) - ray.get(self._rollout_env_controller.offload.remote()) - ray.get(self._rollout_env_controller.onload_weights.remote()) - ray.get(self._train_controller.update_weights.remote()) - ray.get(self._train_controller.offload.remote(target="model")) - ray.get(self._rollout_env_controller.onload_kvcache.remote()) - self.logger.info("Rollout workers has updated weights from train workers.") - else: - ray.get(self._train_controller.offload.remote(target="all")) - - self._train_worker_cfg = train_worker_cfg - - if self._rl_trainer_cfg is not None and get_rank() == 0: - config_path = log_dir / "rl_trainer_config.json" - with config_path.open("w") as f: - f.write(self._rl_trainer_cfg.model_dump_json(indent=2)) - - env_path = log_dir / "env.json" - environment_variables = dict(os.environ) - infer_engine_version = get_rollout_engine_version() - environment_variables.update(infer_engine_version) - with env_path.open("w") as f: - json.dump(environment_variables, f, indent=2) - - self._ray_get_timeout = TRAINER_RAY_GET_TIMEOUT - self._exp_tracker = self._init_tracker(exp_tracker, log_dir / self._EXP_TRACKING_PATH) - self._display_all_workers_log = display_all_workers_log - - self._train_results: dict = defaultdict(list) - self._eval_results: dict = defaultdict(list) - - @classmethod - def from_config(cls, config: AgentRLTrainerConfig) -> Self: # type: ignore[override] - """Create a Trainer instance from a TrainerConfig. - - Args: - config (TrainerConfig): TrainerConfig instance containing all configuration parameters. - - Returns: - Self: Trainer instance initialized with the provided config. - """ - self = cls( - load_from=config.load_from, - pg=config.pg, - environment_config=config.environment_config, - dataflow_config=config.dataflow_config, - replay_buffer_config=config.replay_buffer_config, - train_worker_cfg=config.train_worker_cfg, - evaluator_config=config.evaluator_config, - tokenizer_path=config.tokenizer_path, - work_dir=config.work_dir, - log_dir=config.log_dir, - total_epochs=config.total_epochs, - auto_resume=config.auto_resume, - load_checkpoint_cfg=config.load_checkpoint_cfg, - checkpoint_interval=config.checkpoint_interval, - checkpoint_maxkeep=config.checkpoint_maxkeep, - checkpoint_no_save_optimizer=config.checkpoint_no_save_optimizer, - hf_interval=config.hf_interval, - hf_max_keep=config.hf_max_keep, - skip_checkpoint_validation=config.skip_checkpoint_validation, - seed=config.seed, - debug=config.debug, - debug_rollout=config.debug_rollout, - rollout_steps=config.rollout_steps, - exp_tracker=config.exp_tracker, - display_all_workers_log=config.display_all_workers_log, - trainer_cfg=config, # type: ignore[arg-type] - skip_load_weights=config.skip_load_weights, - ) - return self - - def _build_rollout_dataflow( # type: ignore[override] - self, environment_cfg: Dict, dataflow_cfg: DataFlowConfig, replay_buffer_config: ReplayBufferConfig - ): - from lagent.utils import create_object - - env = create_object(environment_cfg) - flow = cast(ActorClass, DataFlow).remote("grpo", dataflow_cfg, replay_buffer_config, env) - return env, flow - - def _initial_evaluate(self): - """Performs an initial evaluation before the training loop starts.""" - if self._debug_rollout: - return - if self._enable_initial_evaluate and self._enable_evaluate and self._evaluator: - ray.get(self._rollout_env_controller.update_active_workers.remote()) - scores, eval_data_groups = ray.get(self._evaluator.run.remote(return_samples=True)) - trajectory_save_path = self.exp_dir / "eval_0_trajectory.jsonl" - self._save_trajectories(eval_data_groups, trajectory_save_path, 0, is_eval=True) - self.logger.info(f"Initial rollout evaluate scores {scores} and start training") - tb_scores = {f"eval/{k}": v for k, v in scores.items()} - self._exp_tracker.add_scalars(tag_scalar_dict=tb_scores, global_step=0) - for name, score in scores.items(): - self._eval_results[name].append((self._cur_step, score)) - self.visulize_results("eval") - - def _rollout_step(self, rollout_idx: int, step_timer_dict: dict): - rollout_info = super()._rollout_step(rollout_idx, step_timer_dict) - metrics_results = self._compute_metrics(rollout_info["data_groups"]) - self.logger.info( - f"train idx {rollout_idx} scores {metrics_results['avg_reward']}," - f" all-zero group ratio {metrics_results.pop('all_zero_ratio', None)}," - f" all-one group ratio {metrics_results.pop('all_one_ratio', None)}" - ) - for metric, result in metrics_results.items(): - self._train_results[metric].append((self._cur_step, result)) - self.visulize_results("train") - return rollout_info - - def _evaluate_step(self, rollout_idx: int, step_timer_dict: dict): - """Performs an evaluation step.""" - eval_log_info = {} - if self._enable_evaluate and self._evaluator and rollout_idx % self._eval_step == 0: - with timer("evaluation", step_timer_dict): - scores, eval_data_groups = ray.get(self._evaluator.run.remote(return_samples=True)) - trajectory_save_path = self.exp_dir / f"eval_{rollout_idx}_trajectory.jsonl" - self._save_trajectories(eval_data_groups, trajectory_save_path, rollout_idx, is_eval=True) - self.logger.info(f"Evaluate idx {rollout_idx} scores {scores}") - eval_log_info.update(scores) - tb_scores = {f"eval/{k}": v for k, v in scores.items()} - self._exp_tracker.add_scalars(tag_scalar_dict=tb_scores, global_step=rollout_idx) - for name, score in scores.items(): - self._eval_results[name].append((self._cur_step, score)) - self.visulize_results("eval") - return eval_log_info - - def _save_trajectories(self, data_groups, save_path, rollout_idx=None, is_eval: bool = False): - rewards = [] - rollout_response_len_list = [] - for group in data_groups: - for data in group: - rewards.append(data.env.judger.reward["score"]) - if data.env.rollout.response_ids is not None: - if isinstance(data.env.rollout.response_ids, torch.Tensor): - response_ids = data.env.rollout.response_ids.flatten().tolist() - else: - response_ids = data.env.rollout.response_ids - rollout_response_len_list.append(len(response_ids)) - - rewards_tensor = torch.tensor(rewards).float() - rollout_response_lens = None - if len(rollout_response_len_list) > 0: - rollout_response_lens = torch.tensor(rollout_response_len_list).float() - - with open(save_path, "w", encoding="utf-8") as f: - item = { - "reward_mean": rewards_tensor.mean().item(), - "reward_std": rewards_tensor.std().item(), - "reward_max": rewards_tensor.max().item(), - "reward_min": rewards_tensor.min().item(), - "total_len": len(rewards_tensor), - } - if len(rollout_response_len_list) > 0 and rollout_response_lens is not None: - item.update( - { - "rollout_response_len_mean": rollout_response_lens.mean().item(), - "rollout_response_len_std": rollout_response_lens.std().item(), - "rollout_response_len_max": rollout_response_lens.max().item(), - "rollout_response_len_min": rollout_response_lens.min().item(), - } - ) - json.dump(item, f, ensure_ascii=False, indent=2) - f.write("\n") - for group in data_groups: - for data in group: - entry = { - "raw_prompt": data.data.extra_info["raw_prompt"], - "prompt": [ - { - "role": msg["role"], - "content": msg["raw_content"] if "raw_content" in msg else msg["content"], - } - for msg in data.env.agent.extra_info.get("messages", [])[:-1] - ], - "response": data.env.rollout.response, - "response_len": len(data.env.rollout.response_ids or []), - "label": data.data.reward_model["ground_truth"], - "reward": data.env.judger.reward["score"], - # "round": sum(msg['role'] == 'assistant' for msg in data.env.agent.extra_info['messages'][:-1]), - # "judger_response": data.env.judger.extra_info, - } - if "completions" in data.env.agent.extra_info: - entry["completions"] = data.env.agent.extra_info["completions"] - - json.dump(entry, f, ensure_ascii=False, indent=2) - f.write("\n") - - def _compute_metrics(self, data_groups): - def compute_reward(data_groups): - total_groups = len(data_groups) - zero_count = one_count = 0 - for group in data_groups: - rewards = [data.env.judger.reward["score"] for data in group] - if all(r == 0 for r in rewards): - zero_count += 1 - elif all(r == 1 for r in rewards): - one_count += 1 - - zero_ratio = zero_count / total_groups if total_groups > 0 else 0 - one_ratio = one_count / total_groups if total_groups > 0 else 0 - - all_rewards = [d.env.judger.reward["score"] for group in data_groups for d in group] - avg_reward = sum(all_rewards) / len(all_rewards) - return avg_reward, zero_ratio, one_ratio - - def compute_tool_turns(data_groups): - tool_turns = [] - for group in data_groups: - for data in group: - messages = data.env.agent.extra_info.get("messages", []) - tool_turn_count = sum(1 for msg in messages if msg["role"] == "tool") - tool_turns.append(tool_turn_count) - avg_tool_turns = sum(tool_turns) / len(tool_turns) if tool_turns else 0 - return avg_tool_turns - - avg_reward, zero_ratio, one_ratio = compute_reward(data_groups) - tool_turns = compute_tool_turns(data_groups) - metrics_results = dict( - avg_reward=avg_reward, all_zero_ratio=zero_ratio, all_one_ratio=one_ratio, avg_tool_turns=tool_turns - ) - return metrics_results - - def _prepare_train_data(self, data_groups, pack_max_length, multimodal_train_infos=None): - chat_data_groups, chat_multimodal_train_infos, agent_data_groups = [], [], [] - for j, group in enumerate(data_groups): - # always place agent messages in the extra_info - if "messages" in group[0].env.agent.extra_info or "inputs" in group[0].env.agent.extra_info: - agent_data_groups.append(group) - else: - chat_data_groups.append(group) - if multimodal_train_infos: - chat_multimodal_train_infos.append(multimodal_train_infos[j]) - - data_batches, info_dict = [], {} - if chat_data_groups: - data_batches, info_dict = super()._prepare_train_data( - chat_data_groups, pack_max_length, chat_multimodal_train_infos - ) - if not agent_data_groups: - return data_batches, info_dict - - def _tokenize_agent_messages(data_item): - if "inputs" in data_item.env.agent.extra_info: - return data_item.env.agent.extra_info["inputs"] - return tokenize(self.tokenizer, data_item.env.agent.extra_info["messages"]) - - with ThreadPoolExecutor(max_workers=64) as executor: - inputs_list = list( - executor.map( - _tokenize_agent_messages, - [group[i] for group in agent_data_groups for i in range(len(group))], - ) - ) - - rewards_list = [] - advantages_list = [] - prompt_len_list = [] - response_len_list = [] - offset = 0 - for group in agent_data_groups: - rewards = [data.env.judger.reward["score"] for data in group] - rewards_list.extend(rewards) - rewards = torch.tensor(rewards, dtype=torch.float32) - - prompt_repeat_k = len(group) - group_inputs = inputs_list[offset : offset + prompt_repeat_k] - offset += prompt_repeat_k - - # GRPO - # advantages = (rewards - rewards.mean(0)) / (rewards.std(0) + 1e-8) - - # RLOO - if prompt_repeat_k > 1: - baseline = (rewards.sum(0) - rewards) / (prompt_repeat_k - 1) - advantages = rewards - baseline - else: - advantages = rewards - - sum_entropy = None - total_tokens = 0 - for i in range(prompt_repeat_k): - # messages = group[i].env.agent.extra_info['messages'] - # assert messages[-1]['role'] == 'assistant' - inputs = group_inputs[i] - logprobs_tensor = torch.tensor(inputs["logprobs"], dtype=torch.float32) - entropy = -(logprobs_tensor).sum() - sum_entropy = entropy if sum_entropy is None else sum_entropy + entropy - total_tokens += (logprobs_tensor != 0).sum().item() - avg_entropy = sum_entropy / max(total_tokens, 1) - entropy_upper_bound = 0.65 - entropy_lower_bound = 0.25 - _tau_upper = 0.0 # noqa: F841 - _tau_lower = 0.0 # noqa: F841 # 越大scale下降的越慢 - coeff_min_upper = 0.2 # 熵高分支的最小缩放 - coeff_min_lower = 0.5 # 熵低分支的最小缩放 - if avg_entropy > entropy_upper_bound: - advantages = torch.where(advantages < 0, advantages * coeff_min_upper, advantages) - elif avg_entropy < entropy_lower_bound: - # 熵低:减弱正优势,保留负优势 - advantages = torch.where(advantages > 0, advantages * coeff_min_lower, advantages) - - for i in range(prompt_repeat_k): - rollout = group[i].env.rollout - inputs = group_inputs[i] - input_ids, labels, logprobs = inputs["input_ids"], inputs["labels"], inputs["logprobs"] - input_ids, shifted_labels, logprobs = input_ids[:-1], labels[1:], logprobs[1:] - - response_len_list.append(len(rollout.response_ids)) - prompt_len_list.append(len(input_ids) - len(rollout.response_ids)) - advantages_list.extend([advantages[i]] * len(rollout.response_ids)) - assert len(input_ids) <= pack_max_length, ( - f"Input ids length {len(input_ids)} exceed pack max length {pack_max_length}." - ) - input_ids = torch.tensor(input_ids, dtype=torch.int64).unsqueeze(0) - shifted_labels = torch.tensor(shifted_labels, dtype=torch.int64).unsqueeze(0) - rollout_logprobs = torch.tensor(logprobs, dtype=torch.float32).unsqueeze(0) - assert rollout_logprobs.size() == shifted_labels.size(), ( - f"{rollout_logprobs.size()} vs {shifted_labels.size()}" - ) - - seq_ctx = SequenceContext.from_input_ids((input_ids,), device="cpu") - seq_ctx.rollout_routed_experts = inputs["routed_experts"] - data_batches.append( - dict( - seq_ctx=seq_ctx, - shifted_labels=shifted_labels, - advantage=advantages[i].item(), - rollout_logprobs=rollout_logprobs, - ) - ) - random.shuffle(data_batches) - info_dict.update( - { - "agent/batch_size": len(rewards_list), - "agent/rewards/mean": np.mean(rewards_list), - "agent/rewards/min": np.min(rewards_list), - "agent/rewards/max": np.max(rewards_list), - "agent/advantages/mean": np.mean(advantages_list), - "agent/advantages/min": np.min(advantages_list), - "agent/advantages/max": np.max(advantages_list), - "agent/response_len/mean": np.mean(response_len_list), - "agent/response_len/min": np.min(response_len_list), - "agent/response_len/max": np.max(response_len_list), - "agent/response_len/std": np.std(response_len_list), - "agent/prompt_len/mean": np.mean(prompt_len_list), - "agent/prompt_len/min": np.min(prompt_len_list), - "agent/prompt_len/max": np.max(prompt_len_list), - } - ) - return data_batches, info_dict - - def visulize_results(self, stage: Literal["all", "train", "eval"] = "all"): - def plot_accuracy_curve(data_list, x_label="training_steps", y_label="accuracy", save_path=None): - """绘制折线图,输入的 data_list 是一个列表,元素为 (x, y) 元组, 其中 x 是横坐标,y 是纵坐标。 - - Args: - data_list (list of tuple): [(x1, y1), (x2, y2), ...],要求按 x 升序排列 - x_label (str): 横坐标标签 - y_label (str): 纵坐标标签 - save_path (str): 图片保存路径,如果为 None 则不保存 - """ - # 拆分横纵坐标 - x_values = [x for x, _ in data_list] - y_values = [y for _, y in data_list] - - plt.figure(figsize=(8, 5)) - plt.plot(x_values, y_values, marker="o", linestyle="-", linewidth=2) - plt.xlabel(x_label) - plt.ylabel(y_label) - plt.title(f"{y_label} vs {x_label}") - plt.grid(True) - - plt.savefig(save_path, dpi=300, bbox_inches="tight") - print(f"图片已保存到: {save_path}") - - if stage in ["all", "eval"]: - for metric, results in self._eval_results.items(): - plot_accuracy_curve(results, y_label=f"eval_{metric}", save_path=self.exp_dir / f"eval_{metric}.png") - if stage in ["all", "train"]: - for metric, results in self._train_results.items(): - plot_accuracy_curve(results, y_label=f"train_{metric}", save_path=self.exp_dir / f"train_{metric}.png") diff --git a/recipe/lagent/environment/agent_env.py b/recipe/lagent/environment/agent_env.py deleted file mode 100644 index 3c2f6f8b17..0000000000 --- a/recipe/lagent/environment/agent_env.py +++ /dev/null @@ -1,121 +0,0 @@ -import asyncio -import inspect -import os -import traceback -from copy import deepcopy -from typing import Callable, List, Self, Tuple - -import ray -from lagent.utils import create_object - -from xtuner.v1.data_proto.rl_data import ( - RLDataFlowItem, - RLJudgerResponseItem, - RolloutState, - update_dataflow_item, -) -from xtuner.v1.ray.environment.lagent.schema import AgentMessage -from xtuner.v1.utils import get_logger - -from .base_env import BaseEnvironment - - -def check_dead_actors(): - # 获取所有 Actor 的列表 - from ray.util.state import list_actors - - all_actors = list_actors() - - dead_actors = [] - for actor_info in all_actors: - # 状态通常是 "ALIVE", "DEAD", "RECONSTRUCTING" 等 - if actor_info["state"] == "DEAD": - dead_actors.append(actor_info) - - return dead_actors - - -@ray.remote(max_concurrency=int(os.environ.get("RAY_MAX_CONCURRENCY", 2000))) # type: ignore[call-overload] -class AgentEnvironment(BaseEnvironment): - def __init__( - self, - environment: str, - agent_cfg: dict, - rollout_controller, - judger_pg=None, - judger_cfg=None, - preprocess_func: Callable[[Self, RLDataFlowItem], Tuple[AgentMessage]] = lambda _, item: ( - AgentMessage(role="user", content=item.data.messages[0]["content"]), # type: ignore[index] - ), - postprocess_func: Callable[[Self, List[RLDataFlowItem]], List[RLDataFlowItem]] = lambda _, items: items, - ): - super().__init__(environment, None, None, judger_pg, judger_cfg) - self.rollout_controller = rollout_controller - self.agent = create_object(agent_cfg) - self.preprocess_func = preprocess_func - self.postprocess_func = postprocess_func - - async def generate( # type: ignore[override] - self, group_data_items: List[RLDataFlowItem], sample_params=None, extra_params=None - ) -> List[RLDataFlowItem]: - sample_params = sample_params.model_dump() if sample_params else {} - - async def _inner_agent_call(item): - if item.env.rollout.state == RolloutState.COMPLETED: - get_logger().debug(f"Rollout already completed for item {item.uid.observation_id}, skip agent call.") - return "Passed" - self.agent.reset(session_id=item.uid.observation_id, recursive=True) - if "agent_state_dict" in item.env.rollout.extra_info: # type: ignore[operator] - self.agent.load_state_dict( - item.env.rollout.extra_info.pop("agent_state_dict"), - session_id=item.uid.observation_id, # type: ignore[arg-type] - ) - agent_inputs = self.preprocess_func(self, deepcopy(item)) - try: - return await self.agent(*agent_inputs, session_id=item.uid.observation_id, **sample_params) - except BaseException as exc: - get_logger().error( - f"[Agent Inference Error] {exc}. Dead actors: {check_dead_actors()}\n{traceback.format_exc()}" - ) - return "Failed" - - results = await asyncio.gather(*[_inner_agent_call(sample) for sample in group_data_items]) - passed_data_items, completed_data_items = [], [] - for sample, message in zip(group_data_items, results): - if message == "Failed": - continue - if message == "Passed": - passed_data_items.append(sample) - elif message.finish_reason == "abort": - sample.env.rollout.state = RolloutState.ABORTED - agent_state_dict = self.agent.state_dict(sample.uid.observation_id) - # remove routed_experts from message extra_info to avoid serialization issue - for state in agent_state_dict.values(): - for msg in state: - msg["extra_info"].pop("routed_experts", None) - sample.env.rollout.extra_info["agent_state_dict"] = agent_state_dict # type: ignore[typeddict-unknown-key] - passed_data_items.append(sample) - else: - completed_data_items.append(sample) - completed_data_items_result = self.postprocess_func(self, completed_data_items) # type: ignore[arg-type] - if inspect.iscoroutinefunction(self.postprocess_func): - completed_data_items_result = await completed_data_items_result # type: ignore[misc] - return passed_data_items + completed_data_items_result - - async def run( # type: ignore[override] - self, group_data_items: List[RLDataFlowItem], sample_params=None, extra_params=None - ) -> List[RLDataFlowItem]: - group_data_items = await self.generate(group_data_items, sample_params, extra_params) - skip_judger = any( - item.env.rollout.finish_reason == "abort" or item.env.rollout.finish_reason == "failed" - for item in group_data_items - ) - if self.judger_controller and not skip_judger: - try: - judger_responses: List[RLJudgerResponseItem] = await asyncio.wait_for( - self.judger_controller.run.remote(group_data_items), timeout=1200.0 - ) - except asyncio.TimeoutError: - judger_responses = [RLJudgerResponseItem(extra_info={"state": "failed"}) for _ in group_data_items] - group_data_items = update_dataflow_item(group_data_items, "env.judger", judger_responses) - return group_data_items diff --git a/recipe/lagent/environment/composed_env.py b/recipe/lagent/environment/composed_env.py deleted file mode 100644 index 1b8539f4da..0000000000 --- a/recipe/lagent/environment/composed_env.py +++ /dev/null @@ -1,42 +0,0 @@ -import asyncio -import os -from itertools import chain, groupby -from typing import Callable, Dict, List - -import ray -from lagent.utils import create_object - -from xtuner.v1.data_proto.rl_data import RLDataFlowItem - -from .base_env import BaseEnvironment - - -@ray.remote(max_concurrency=int(os.environ.get("RAY_MAX_CONCURRENCY", 2000))) # type: ignore[call-overload] -class ComposedEnvironment(BaseEnvironment): - def __init__( - self, - environment: str, - rollout_controller, - environments: Dict[str, BaseEnvironment | dict], - router: Callable[[RLDataFlowItem], str] = lambda item: item.data.extra_info["environment"], - ): - super().__init__(environment, None, None, None, None) - self.rollout_controller = rollout_controller - self.environments = {name: create_object(env) for name, env in environments.items()} - self.router = router - - async def generate(self, data, sample_params, extra_params): - return await super().generate(data, sample_params, extra_params) - - async def run( # type: ignore[override] - self, group_data_items: List[RLDataFlowItem], sample_params=None, extra_params=None - ) -> List[RLDataFlowItem]: - results = await asyncio.gather( - *[ - self.environments[env_name].run.remote( - list(items), sample_params=sample_params, extra_params=extra_params - ) - for env_name, items in groupby(sorted(group_data_items, key=self.router), key=self.router) - ] - ) - return list(chain.from_iterable(results)) diff --git a/recipe/lagent/environment/lagent/__init__.py b/recipe/lagent/environment/lagent/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/recipe/lagent/environment/lagent/agents/__init__.py b/recipe/lagent/environment/lagent/agents/__init__.py deleted file mode 100644 index 098c103e80..0000000000 --- a/recipe/lagent/environment/lagent/agents/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .env_agent import EnvAgent, finish_condition_func -from .jugder_wrapper import JudgerWrapper -from .tito_agent import AsyncTokenInOutAgent diff --git a/recipe/lagent/environment/lagent/agents/env_agent.py b/recipe/lagent/environment/lagent/agents/env_agent.py deleted file mode 100644 index 83b2d7ab8d..0000000000 --- a/recipe/lagent/environment/lagent/agents/env_agent.py +++ /dev/null @@ -1,145 +0,0 @@ -import asyncio -import json -from dataclasses import asdict -from typing import Dict, List, Literal, Optional, Union - -from lagent.agents import AsyncAgent -from lagent.agents.fc_agent import EnvAgent as BaseEnvAgent -from lagent.hooks import Hook -from lagent.schema import ActionStatusCode, ActionValidCode, AgentStatusCode -from lagent.utils import create_object, truncate_text - -from xtuner.v1.ray.environment.lagent.schema import AgentMessage - - -def finish_condition_func(selection_message, env_message): - return (env_message.extra_info or {}).get("finish", False) - - -class EnvAgent(BaseEnvAgent): - def __init__( - self, - actions: list, - judger: Union[Dict, AsyncAgent], - stateful_tools: Optional[List[str]] = None, - max_turn: Optional[int] = None, - max_tool_response_length: Optional[int] = None, - max_tool_calls_per_turn: int = 5, - tool_response_truncate_side: Literal["left", "right", "middle"] = "middle", - enable_no_thinking_penalty: bool = True, - lower_tool_turn_bound: Optional[int] = None, - enable_repeated_tool_call_penalty: bool = False, - action_hooks: Optional[List[Union[dict, Hook]]] = None, - name: Optional[str] = None, - ): - super().__init__( - actions, stateful_tools, max_tool_response_length, tool_response_truncate_side, action_hooks, name - ) - self.judger: AsyncAgent = create_object(judger) - # scoring rule settings - self.max_turn = max_turn - self.enable_no_thinking_penalty = enable_no_thinking_penalty - self.max_tool_calls_per_turn = max_tool_calls_per_turn - self.lower_tool_turn_bound = lower_tool_turn_bound - self.enable_repeated_tool_call_penalty = enable_repeated_tool_call_penalty - - async def forward(self, assistant_message: AgentMessage, session_id: str, **kwargs): - extra_info = {} - current_turn = len(self.memory.get_memory(session_id)) // 2 - if assistant_message.stream_state == AgentStatusCode.SESSION_OUT_OF_LIMIT: - extra_info["finish"] = True - return AgentMessage( - sender=self.name, content="Session Length Out Of Limit", extra_info=extra_info, reward=0.0 - ) - if self.max_turn is not None and current_turn > self.max_turn: - extra_info["finish"] = True - return AgentMessage(sender=self.name, content="Reach Max turn", extra_info=extra_info, reward=0.0) - if self.enable_no_thinking_penalty and not assistant_message.thinking: - extra_info["finish"] = True - return AgentMessage(sender=self.name, content="Format Error", extra_info=extra_info, reward=0.0) - if not assistant_message.tool_calls: - extra_info["finish"] = True - if "" in (assistant_message.raw_content or ""): - return AgentMessage(sender=self.name, content="Format Error", extra_info=extra_info, reward=-1.0) - # assume the first message contains meta info to help judge - set_env_message = self.memory.get_memory(session_id)[0] - message = ( - await self.judger(assistant_message, set_env_message, session_id=session_id, **kwargs) - ).model_copy(update={"sender": self.name}, deep=True) - self.judger.reset(session_id, recursive=True) - if isinstance(message.extra_info, dict): - message.extra_info.update(extra_info) - else: - message.extra_info = extra_info - # 惩罚工具调用轮数低于下限的样本 - if self.lower_tool_turn_bound is not None and current_turn < self.lower_tool_turn_bound: - message.reward = min(message.reward, 0.0) - return message - if ( - self.max_tool_calls_per_turn is not None - and len(assistant_message.tool_calls) > self.max_tool_calls_per_turn - ): - extra_info["finish"] = True - return AgentMessage( - sender=self.name, content="Exceed Max Tool Calls Per Turn", extra_info=extra_info, reward=0.0 - ) - # 惩罚冗余工具调用 - if self.enable_repeated_tool_call_penalty: - previous_tool_calls = set() - for msg in self.memory.get_memory(session_id)[:-1]: - for call in msg.tool_calls or []: - try: - if isinstance(call["arguments"], str): - args = json.loads(call["arguments"]) - previous_tool_calls.add((call["name"], tuple(sorted(args.items())))) - except Exception: - continue - for call in assistant_message.tool_calls: - try: - if isinstance(call["arguments"], str): - args = json.loads(call["arguments"]) - if (call["name"], tuple(sorted(args.items()))) in previous_tool_calls: - extra_info["finish"] = True - return AgentMessage( - sender=self.name, - content=f"Repeated Tool Call: {call['name']}", - extra_info=extra_info, - reward=-1, - ) - except Exception: - continue - - tool_responses = await asyncio.gather( - *[ - self._retry_mechanism(self.execute_tool)(tool_call, session_id) - for tool_call in assistant_message.tool_calls - ] - ) - for i, tool_response in enumerate(tool_responses): - if tool_response.valid != ActionValidCode.OPEN: - extra_info["finish"] = True - return AgentMessage( - sender=self.name, - content=f"Tool Call Error: {tool_response.errmsg} in tool call " - f"{json.dumps(assistant_message.tool_calls[i], ensure_ascii=False)}", - extra_info=extra_info, - reward=-1, - ) - if tool_response.state != ActionStatusCode.SUCCESS: - extra_info["finish"] = True - return AgentMessage( - sender=self.name, - content=f"Tool Call Error: {tool_response.errmsg} in tool call " - f"{json.dumps(assistant_message.tool_calls[i], ensure_ascii=False)}", - extra_info=extra_info, - reward=-1 if tool_response.state == ActionStatusCode.ARGS_ERROR else 0, - ) - res = tool_response.format_result() - if self.max_tool_response_length is not None and len(res) > self.max_tool_response_length: - res = truncate_text(res, max_num=self.max_tool_response_length, side=self.tool_response_truncate_side) - tool_response.result = [{"type": "text", "content": res}] - extra_info["finish"] = False - return_message = AgentMessage( - sender=self.name, content=[asdict(resp) for resp in tool_responses], extra_info=extra_info, reward=0.0 - ) - return return_message diff --git a/recipe/lagent/environment/lagent/agents/jugder_wrapper.py b/recipe/lagent/environment/lagent/agents/jugder_wrapper.py deleted file mode 100644 index 546c4ce5bd..0000000000 --- a/recipe/lagent/environment/lagent/agents/jugder_wrapper.py +++ /dev/null @@ -1,53 +0,0 @@ -from copy import deepcopy -from typing import Callable, Optional - -from lagent.agents import AsyncAgent - -from xtuner.v1.data_proto.rl_data import ( - RLDataFlowItem, - RLJudgerResponseItem, - RLRolloutResponseItem, - RolloutState, - update_dataflow_item, -) -from xtuner.v1.ray.environment.lagent.schema import AgentMessage -from xtuner.v1.ray.judger.controller import JudgerController - - -class JudgerWrapper(AsyncAgent): - def __init__( - self, - judger_cfg=None, - placement_group=None, - judger_controller=None, - itemgetter: Callable[[AgentMessage], RLDataFlowItem] = lambda m: m.content, # type: ignore[assignment,return-value] - reward_key: str = "score", - name: Optional[str] = None, - ): - assert judger_controller is not None or (judger_cfg and placement_group), ( - "Either judger_controller or judger_cfg and placement_group must be provided." - ) - self.judger_controller = judger_controller or JudgerController.remote(judger_cfg, placement_group) # type: ignore[attr-defined] - self.itemgetter = itemgetter - self.reward_key = reward_key - super().__init__(memory=None, aggregator=None, name=name) - - async def forward(self, message: AgentMessage, meta_message: AgentMessage, *args, **kwargs) -> AgentMessage: - item = deepcopy(self.itemgetter(meta_message)) - if isinstance(item, dict): - item = RLDataFlowItem.model_validate(item) - item = update_dataflow_item( - [item], - "env.rollout", - [ - RLRolloutResponseItem( - response=message.content, - response_ids=message.content_ids, - logprobs=message.content_logprobs, - finish_reason="finished", - state=RolloutState.COMPLETED, - ) - ], - )[0] - judger_response: RLJudgerResponseItem = await self.judger_controller.run.remote(item) - return AgentMessage(sender=self.name, content=None, reward=judger_response.reward[self.reward_key]) diff --git a/recipe/lagent/environment/lagent/agents/tito_agent.py b/recipe/lagent/environment/lagent/agents/tito_agent.py deleted file mode 100644 index b14713f6fb..0000000000 --- a/recipe/lagent/environment/lagent/agents/tito_agent.py +++ /dev/null @@ -1,68 +0,0 @@ -import copy - -from lagent.agents import Agent - -from xtuner.v1.data_proto.rl_data import RLRolloutResponseItem -from xtuner.v1.ray.environment.lagent.schema import AgentMessage - - -class AsyncTokenInOutAgentMixin: - async def __call__(self, *message: AgentMessage, session_id=0, **kwargs) -> AgentMessage: # type: ignore[override] - message = [AgentMessage(sender="user", content=m) if isinstance(m, str) else copy.deepcopy(m) for m in message] # type: ignore[assignment] - for hook in self._hooks.values(): # type: ignore[attr-defined] - result = hook.before_agent(self, message, session_id) - if result: - message = result - - # resume aborted rollout - _message = self._scroll_buffer(message[-1], session_id) # type: ignore[attr-defined] - if _message is not None: - if _message.finish_reason != "abort": - _message = copy.deepcopy(_message) - for hook in self._hooks.values(): # type: ignore[attr-defined] - result = hook.after_agent(self, _message, session_id) - if result: - _message = result - return _message - message[-1].extra_info["partial_response"] = _message - else: - self.update_memory(message, session_id=session_id) # type: ignore[attr-defined] - response_message = await self.forward(*message, session_id=session_id, **kwargs) - if _message and _message.finish_reason == "abort": - message[-1].extra_info.pop("partial_response", None) - if not isinstance(response_message, AgentMessage): - assert isinstance(response_message, RLRolloutResponseItem), ( - f"Expected response to be of type AgentMessage or RLRolloutResponseItem, but got {type(response_message)}" - ) - response_message = AgentMessage.from_model_response(response_message, self.name) # type: ignore[attr-defined] - self.update_memory(response_message, session_id=session_id) # type: ignore[attr-defined] - response_message = copy.deepcopy(response_message) - for hook in self._hooks.values(): # type: ignore[attr-defined] - result = hook.after_agent(self, response_message, session_id) - if result: - response_message = result - return response_message - - async def forward(self, *message: AgentMessage, session_id=0, **kwargs) -> AgentMessage: - partial_response: AgentMessage = message[-1].extra_info.get("partial_response") - if partial_response and partial_response.raw_content: - self.update_memory(partial_response, session_id=session_id) # type: ignore[attr-defined] - formatted_messages = self.aggregator.aggregate( # type: ignore[attr-defined] - self.memory.get(session_id), # type: ignore[attr-defined] - self.name, # type: ignore[attr-defined] - self.output_format, # type: ignore[attr-defined] - self.template, # type: ignore[attr-defined] - ) - response_message = await self.llm.chat(formatted_messages, session_id, **kwargs) # type: ignore[attr-defined] - if isinstance(response_message, AgentMessage): - response_message.sender = self.name # type: ignore[attr-defined] - if partial_response and partial_response.raw_content: - response_message = partial_response.merge_with(response_message) - response_message = self.llm.parse_response(response_message) # type: ignore[attr-defined] - # remove the partial response from memory, since it's merged into the final response - self.memory.get(session_id).delete(-1) # type: ignore[attr-defined] - return response_message - - -class AsyncTokenInOutAgent(AsyncTokenInOutAgentMixin, Agent): - pass diff --git a/recipe/lagent/environment/lagent/llms/__init__.py b/recipe/lagent/environment/lagent/llms/__init__.py deleted file mode 100644 index 02e1720362..0000000000 --- a/recipe/lagent/environment/lagent/llms/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .controller_wrapper import ControllerWrapper diff --git a/recipe/lagent/environment/lagent/llms/controller_wrapper.py b/recipe/lagent/environment/lagent/llms/controller_wrapper.py deleted file mode 100644 index 140c28c403..0000000000 --- a/recipe/lagent/environment/lagent/llms/controller_wrapper.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import Any, Dict, List, Optional - -import ray -from lagent.utils import create_object -from ray.actor import ActorClass - -from xtuner.v1.data_proto.rl_data import RLRolloutResponseItem, SampleParams -from xtuner.v1.ray.config.worker import RolloutConfig -from xtuner.v1.ray.environment.lagent.parsers import ( - Qwen3FunctionCallParser, - Qwen3TokenReasonParser, - ResponseParser, -) -from xtuner.v1.ray.environment.lagent.schema import AgentMessage -from xtuner.v1.ray.environment.lagent.tokenize import tokenize -from xtuner.v1.ray.rollout.controller import RolloutController - - -class ControllerWrapper: - def __init__( - self, - placement_group: Optional[Any] = None, - rollout_cfg: Optional[RolloutConfig] = None, - rollout_controller: Optional[ActorClass] = None, - sample_params: Optional[SampleParams] = None, - reasoning_parser: Optional[ResponseParser] = None, - tool_call_parser: Optional[ResponseParser] = None, - ): - assert rollout_controller is not None or (placement_group and rollout_cfg), ( - "Either rollout_controller or placement_group and rollout_cfg must be provided." - ) - if rollout_controller: - self.rollout_controller = rollout_controller - self.rollout_cfg = ray.get(rollout_controller.get_rollout_info.remote())["rollout_config"] # type: ignore[call-overload, attr-defined] - else: - self.rollout_controller = RolloutController.remote(rollout_cfg, placement_group) # type: ignore[attr-defined] - self.rollout_cfg = rollout_cfg - - from transformers import AutoTokenizer - - self.tokenizer = AutoTokenizer.from_pretrained(self.rollout_cfg.tokenizer_path, trust_remote_code=True) - self.sample_params = sample_params or SampleParams() - # default parsers - self.reasoning_parser = ( - reasoning_parser - and create_object(reasoning_parser) - or Qwen3TokenReasonParser(self.rollout_cfg.tokenizer_path) - ) - self.tool_call_parser = tool_call_parser and create_object(tool_call_parser) or Qwen3FunctionCallParser() - - async def chat(self, messages, session_id=None, tools: Optional[List[Dict]] = None, **kwargs): - sample_params = self.sample_params.model_copy(update=kwargs) - inputs = tokenize(self.tokenizer, messages, tools) - if len(inputs["input_ids"]) >= self.rollout_cfg.context_length: - response = RLRolloutResponseItem(finish_reason="length") - else: - extra_info = {"action_id": session_id} - if inputs["routed_experts"] is not None: - extra_info["routed_experts"] = inputs["routed_experts"] - response = await self.rollout_controller.rollout.remote( # type: ignore[no-redef, attr-defined] - input_ids=inputs["input_ids"], - sample_params=sample_params, - session_id=session_id, - extra_info=extra_info, - ) - if ( - response.finish_reason != "abort" - and self.rollout_cfg.enable_return_routed_experts - and "routed_experts" not in response.extra_info - ): - raise ValueError("Routed experts expected in response extra_info but not found.") - - response = AgentMessage.from_model_response(response, "") - return self.parse_response(response) - - def parse_response(self, response: AgentMessage): - response = self.reasoning_parser.parse_response(response) - response = self.tool_call_parser.parse_response(response) - return response diff --git a/recipe/lagent/environment/lagent/parsers.py b/recipe/lagent/environment/lagent/parsers.py deleted file mode 100644 index 27ac55dd00..0000000000 --- a/recipe/lagent/environment/lagent/parsers.py +++ /dev/null @@ -1,122 +0,0 @@ -import ast -import json -import re -from typing import Protocol - -from xtuner.v1.ray.environment.lagent.schema import AgentMessage - - -class ResponseParser(Protocol): - """Protocol for agent response parsers.""" - - def parse_response(self, data: AgentMessage) -> AgentMessage: ... - - -class Qwen3TokenReasonParser: - def __init__(self, tokenizer_path: str, resoning_token=dict(start="", end="")): - self.start = resoning_token.get("start", "") - self.end = resoning_token.get("end", "") - - from transformers import AutoTokenizer - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) - - def parse_response(self, data: AgentMessage) -> AgentMessage: - think, content = "", data.content or "" - thinking_start_idx = thinking_end_idx = -1 - if self.end in data.content: - think, content = data.content.rsplit(self.end, 1) - if self.start in think: - think = think.split(self.start, 1)[-1] - else: - thinking_start_idx = 0 - data.thinking = think.strip() - data.content = content.strip() - thinking_ids = [] - thinking_logprobs = [] - content_ids = data.content_ids or [] - content_logprobs = data.content_logprobs or [] - start_token_ids = self.tokenizer.encode(self.start, add_special_tokens=False) - end_token_ids = self.tokenizer.encode(self.end, add_special_tokens=False) - # find first start_token_ids and last end_token_ids in content_ids - # thinking ids should contain start_token_ids and end_token_ids - for i in range(len(content_ids) - len(start_token_ids) + 1): - if content_ids[i : i + len(start_token_ids)] == start_token_ids: - thinking_start_idx = i - break - for i in range(len(content_ids) - len(end_token_ids), -1, -1): - if content_ids[i : i + len(end_token_ids)] == end_token_ids: - thinking_end_idx = i + len(end_token_ids) - break - if thinking_start_idx != -1 and thinking_end_idx != -1 and thinking_end_idx > thinking_start_idx: - thinking_ids = content_ids[thinking_start_idx:thinking_end_idx] - thinking_logprobs = content_logprobs[thinking_start_idx:thinking_end_idx] - data.thinking_ids = thinking_ids - data.thinking_logprobs = thinking_logprobs - # remove thinking ids from content ids and logprobs - data.content_ids = content_ids[:thinking_start_idx] + content_ids[thinking_end_idx:] - data.content_logprobs = content_logprobs[:thinking_start_idx] + content_logprobs[thinking_end_idx:] - return data - - -class Qwen3FunctionCallParser: - def parse_response(self, data: AgentMessage) -> AgentMessage: - matches = re.findall(r"\s*(\{.*?\})\s*", data.content, flags=re.DOTALL) - tool_calls, error_message = [], None - for m in matches: - tool_call = None - try: - tool_call = json.loads(m) - except json.JSONDecodeError as json_err: - try: - tool_call = ast.literal_eval(m) - except (SyntaxError, ValueError) as eval_err: - error_message = ( - f"JSON parsing failed with both json.loads and ast.literal_eval:\n" - f"- JSON Decode Error: {json_err}\n" - f"- Fallback Syntax/Value Error: {eval_err}\n" - f"- Problematic JSON text: {m}" - ) - continue - if tool_call is not None: - tool_calls.append(tool_call) - - if tool_calls: - data.tool_calls = tool_calls - if error_message: - data.extra_info["parse_tool_call_error"] = error_message - return data - - -class Qwen3_5FunctionCallParser: - def parse_response(self, data: AgentMessage) -> AgentMessage: - tool_call_blocks = re.findall(r"(.*?)", data.content, flags=re.DOTALL) - tool_calls, error_message = [], None - - for block in tool_call_blocks: - func_match = re.search(r"]+)>(.*?)", block, flags=re.DOTALL) - if not func_match: - error_message = "Could not find a valid ... block inside ." - continue - - func_name = func_match.group(1).strip() - func_body = func_match.group(2) - - param_matches = re.finditer(r"]+)>(.*?)", func_body, flags=re.DOTALL) - parameters = {} - for p_match in param_matches: - p_name = p_match.group(1).strip() - p_value = p_match.group(2).strip() - try: - parsed_value = ast.literal_eval(p_value) - except (ValueError, SyntaxError): - parsed_value = p_value - parameters[p_name] = parsed_value - - tool_calls.append({"name": func_name, "arguments": parameters}) - - if tool_calls: - data.tool_calls = tool_calls - if error_message: - data.extra_info["parse_tool_call_error"] = error_message - return data diff --git a/recipe/lagent/environment/lagent/schema.py b/recipe/lagent/environment/lagent/schema.py deleted file mode 100644 index d6891cfff2..0000000000 --- a/recipe/lagent/environment/lagent/schema.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Optional - -from lagent.schema import AgentMessage as BaseAgentMessage -from lagent.schema import AgentStatusCode -from pydantic import Field - -from xtuner.v1.data_proto.rl_data import RLRolloutResponseItem, RolloutState - - -class AgentMessage(BaseAgentMessage): - """Extends the base AgentMessage to include RL model response - conversion.""" - - content_ids: Optional[List[int]] = Field(default=None, repr=False) - content_logprobs: Optional[List[float]] = Field(default=None, repr=False) - thinking_ids: Optional[List[int]] = Field(default=None, repr=False) - thinking_logprobs: Optional[List[float]] = Field(default=None, repr=False) - reward: Optional[float] = None - raw_content: Optional[str] = None - raw_content_ids: Optional[List[int]] = Field(default=None, repr=False) - raw_content_logprobs: Optional[List[float]] = Field(default=None, repr=False) - - def merge_with(self, other: "AgentMessage") -> "AgentMessage": - assert self.finish_reason == "abort", f"Cannot merge with non-aborted message: {self.finish_reason}" # type: ignore[has-type] - self.raw_content = (self.raw_content or "") + (other.raw_content or "") - self.content = self.raw_content - if self.raw_content_ids is not None: - self.raw_content_ids.extend(other.raw_content_ids or []) - self.content_ids = self.raw_content_ids - if self.raw_content_logprobs is not None: - self.raw_content_logprobs.extend(other.raw_content_logprobs or []) - self.content_logprobs = self.raw_content_logprobs - self.reward = other.reward - self.finish_reason = other.finish_reason # type: ignore[has-type, assignment] - self.stream_state = other.stream_state # type: ignore[has-type, assignment] - self.extra_info = other.extra_info # type: ignore[has-type, assignment] - return self - - @classmethod - def from_model_response(cls, model_response: RLRolloutResponseItem, sender: str) -> "AgentMessage": - """Convert model response dict to AgentMessage.""" - return cls( - sender=sender, - content=model_response.response or "", - content_ids=model_response.response_ids, - content_logprobs=model_response.logprobs, - thinking=None, - thinking_ids=None, - tool_calls=None, - tool_calls_ids=None, - raw_content=model_response.response or "", - raw_content_ids=model_response.response_ids, - raw_content_logprobs=model_response.logprobs, - extra_info=model_response.extra_info, - stream_state=( - AgentStatusCode.END - if model_response.state in [RolloutState.COMPLETED, RolloutState.ABORTED] - else ( - AgentStatusCode.SESSION_OUT_OF_LIMIT - if model_response.finish_reason == "length" - else AgentStatusCode.SERVER_ERR - ) - ), - finish_reason=model_response.finish_reason, - ) - - def to_model_request(self, role: str = "assistant") -> dict: - """Convert AgentMessage to model request dict.""" - return { - "role": role, - "content": self.content, - "content_ids": self.content_ids, - "content_logprobs": self.content_logprobs, - "thinking": self.thinking, - "thinking_ids": self.thinking_ids, - "tool_calls": self.tool_calls, - "tool_calls_ids": self.tool_calls_ids, - "raw_content": self.raw_content, - "raw_content_ids": self.raw_content_ids, - "raw_content_logprobs": self.raw_content_logprobs, - "extra_info": self.extra_info, - "stream_state": self.stream_state, - "finish_reason": self.finish_reason, - } diff --git a/recipe/lagent/environment/lagent/tokenize.py b/recipe/lagent/environment/lagent/tokenize.py deleted file mode 100644 index 41ae95ecbd..0000000000 --- a/recipe/lagent/environment/lagent/tokenize.py +++ /dev/null @@ -1,135 +0,0 @@ -import base64 -import os -import re -from typing import Any, Dict, List - -import ray -from ray import cloudpickle - -from xtuner.v1.utils import get_logger - - -ENABLE_INTERLEAVED_THINKING = os.getenv("ENABLE_INTERLEAVED_THINKING", "1") == "1" -ENABLE_THINKING = os.getenv("ENABLE_THINKING", "1") == "1" - -logger = get_logger() -logger.info(f"[agent_tokenize_fn] ENABLE_INTERLEAVED_THINKING={ENABLE_INTERLEAVED_THINKING}") -logger.info(f"[agent_tokenize_fn] ENABLE_THINKING={ENABLE_THINKING}") - - -def tokenize( - tokenizer, - messages, - tools=None, - enable_interleaved_thinking: bool = ENABLE_INTERLEAVED_THINKING, - enable_thinking: bool = ENABLE_THINKING, - **kwargs, -) -> dict: - input_ids = tokenizer.encode("", add_special_tokens=False) - thinking_start_ids = tokenizer.encode("", add_special_tokens=False) - thinking_end_ids = tokenizer.encode("", add_special_tokens=False) - routed_experts = None - previous_routed_experts_tasks = set() - - def get_content_index(content_ids) -> int: - content_ids_str = " ".join([str(content_id) for content_id in content_ids]) - thinking_end_ids_str = " ".join([str(thinking_end_id) for thinking_end_id in thinking_end_ids]) - return len(content_ids) - len(content_ids_str.split(thinking_end_ids_str)[-1].split()) - - def split_conversation(messages: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]: - final_chunks: List[List[Dict[str, Any]]] = [] - context_chunk: List[Dict[str, Any]] = [] - for message in messages: - if message["role"] == "assistant": - if context_chunk: - final_chunks.append(context_chunk) - final_chunks.append([message]) - context_chunk = [] - else: - context_chunk.append(message) - if context_chunk: - final_chunks.append(context_chunk) - return final_chunks - - labels: List[int] = [] - logprobs: List[float] = [] - msg_num = 0 - for idx, msg in enumerate(split_conversation(messages)): - msg_num += len(msg) - if msg[0]["role"] == "assistant": - if msg[0].get("raw_content_ids"): - token_ids = msg[0]["raw_content_ids"] - else: - assistant_with_gen = tokenizer.apply_chat_template( - msg, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking - ) - assistant_without_gen = tokenizer.apply_chat_template( - msg, tokenize=False, add_generation_prompt=False, enable_thinking=enable_thinking - ) - prompt = assistant_without_gen[len(assistant_with_gen) - len(assistant_without_gen) :] - token_ids = tokenizer.encode(prompt, add_special_tokens=False) - - content_idx = 0 - if not enable_interleaved_thinking and msg_num < len(messages) or msg[0].get("remove_thinking"): - content_idx = get_content_index(token_ids) - token_ids = token_ids[content_idx:] - - if msg[0].get("loss", True): - labels.extend(token_ids) - logprobs.extend(msg[0]["raw_content_logprobs"][content_idx:]) - else: - labels.extend([-100] * len(token_ids)) - logprobs.extend([0] * len(token_ids)) - - if ( - isinstance(msg[0].get("extra_info"), dict) - and "routed_experts" in msg[0]["extra_info"] - and msg[0]["extra_info"]["routed_experts"] is not None - ): - routed_experts_ref = msg[0]["extra_info"]["routed_experts"] - if isinstance(routed_experts_ref, ray.ObjectRef): - if routed_experts_ref.hex() in previous_routed_experts_tasks: - logger.warning( - "[tokenize_fn] Detected repeated routed_experts_ref, setting routed_experts to None to avoid errors." - ) - routed_experts = None - else: - routed_experts = routed_experts_ref - previous_routed_experts_tasks.add(routed_experts_ref.hex()) - else: - assert isinstance(routed_experts_ref, str), ( - f"Expected routed_experts_ref to be a base64 string, but got {type(routed_experts_ref)}" - ) - ref_bytes = base64.b64decode(routed_experts_ref.encode("utf-8")) - routed_experts = cloudpickle.loads(ref_bytes) - else: - routed_experts = None - - else: - prompt = tokenizer.apply_chat_template( - msg, - tokenize=False, - add_generation_prompt=True, - add_special_tokens=False, - tools=tools if idx == 0 else None, - enable_thinking=enable_thinking, - ) - if ( - not enable_interleaved_thinking - and msg_num < len(messages) - and re.search(r"assistant\n\s*$", prompt) - ): - prompt = prompt.rsplit("", 1)[0] - token_ids = tokenizer.encode(prompt, add_special_tokens=False) - if idx > 0: - token_ids = tokenizer.encode("\n", add_special_tokens=False) + token_ids - if isinstance(msg[-1].get("extra_info"), dict) and msg[-1]["extra_info"].get("previous_completions", []): - token_ids += tokenizer.encode( - "".join(msg[-1]["extra_info"]["previous_completions"]), add_special_tokens=False - ) - token_ids += thinking_start_ids - labels.extend([-100] * len(token_ids)) - logprobs.extend([0] * len(token_ids)) - input_ids.extend(token_ids) - assert len(input_ids) == len(labels) == len(logprobs) - return {"input_ids": input_ids, "labels": labels, "logprobs": logprobs, "routed_experts": routed_experts} diff --git a/recipe/tb2_eval/__init__.py b/recipe/tb2_eval/__init__.py deleted file mode 100644 index c23aefaa56..0000000000 --- a/recipe/tb2_eval/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""tb2-eval bench adapter for the sandbox_agent_loop framework.""" - -from .local_run.dataset import TB2EvalBench -from .pipeline import runner - -__all__ = ["TB2EvalBench", "runner"] diff --git a/recipe/tb2_eval/infer/agents/interndp/__init__.py b/recipe/tb2_eval/infer/agents/interndp/__init__.py deleted file mode 100644 index f151137ce4..0000000000 --- a/recipe/tb2_eval/infer/agents/interndp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This file marks the interndp directory as a Python package. \ No newline at end of file diff --git a/recipe/tb2_eval/infer/agents/interndp/config.py b/recipe/tb2_eval/infer/agents/interndp/config.py deleted file mode 100644 index 5e8075fbc0..0000000000 --- a/recipe/tb2_eval/infer/agents/interndp/config.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Agent config for interndp (tb2-rl). Paths read from env at daemon-start time.""" - -import os - -from lagent.agents.fc_agent import get_tool_prompt - -tool_template = """# Tools - -You have access to the following functions: - - -{tools} - - -If you choose to call a function ONLY reply in the following format with NO suffix: - - - - -value_1 - - -This is the value for the second parameter -that can span -multiple lines - - - - - -Reminder: -- Function calls MUST follow the specified format: an inner block must be nested within XML tags -- Required parameters MUST be specified -- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after -- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls -""" - -workspace = os.environ.get("TASK_WORKSPACE", "/app") -skills_root = f"{workspace}/skills" - -model = dict( - type="lagent.llms.model.AsyncAPIClient", - model=dict( - model=os.environ["RL_LLM_MODEL"], - base_url=os.environ.get( - "RL_LLM_BASE_URL", - "http://s-20260104203038-22bhb.ailab-evalservice.pjh-service.org.cn/v1", - ), - api_key=os.environ.get("RL_LLM_API_KEY", "sk-admin"), - ), - sample_params=dict(temperature=0.7, top_p=1.0, top_k=50), - timeout=900, - max_retry=5, - sleep_interval=5, - extra_body=dict(spaces_between_special_tokens=False), -) - -base_actions = [dict(type="lagent.actions.tmux_action.TerminalExecute")] - -policy_agent = dict( - type="lagent.agents.AsyncAgent", - llm=model, - template=get_tool_prompt(base_actions, template=tool_template), - hooks=[dict(type="lagent.hooks.logger.MessageLogger")], -) - -env_agent = dict( - type="lagent.agents.env_agent.RLEnvAgent", - actions=base_actions, - max_turn=100, - enable_no_thinking_penalty=False, - max_tool_response_length=4096, - tool_response_truncate_side="left", - hooks=[dict(type="lagent.hooks.logger.MessageLogger")], - -) - -agent_config = dict( - type="lagent.agents.fc_agent.FunctionCallAgent", - policy_agent=policy_agent, - env_agent=env_agent, - finish_condition='lagent.agents.env_agent.finish_condition_func', - initialize_input=False, -) - - -if __name__ == "__main__": - import asyncio - - from lagent.utils import create_object - - async def main(): - agent = create_object(agent_config) - res = await agent("list the files in the current directory and tell me which is the largest") - print(res) - - asyncio.run(main()) diff --git a/recipe/tb2_eval/infer/agents/interndp/install-deps.sh b/recipe/tb2_eval/infer/agents/interndp/install-deps.sh deleted file mode 100644 index bdea01a1dd..0000000000 --- a/recipe/tb2_eval/infer/agents/interndp/install-deps.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# interndp agent install-deps stub — runner already uploads lagent source; -# nothing agent-specific to install here. -set -euo pipefail -echo "[interndp] install-deps: ok" diff --git a/recipe/tb2_eval/infer/agents/interndp/tools/__init__.py b/recipe/tb2_eval/infer/agents/interndp/tools/__init__.py deleted file mode 100644 index 82789f2075..0000000000 --- a/recipe/tb2_eval/infer/agents/interndp/tools/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This file is intentionally left blank. \ No newline at end of file diff --git a/recipe/tb2_eval/infer/setup/pre_entry.sh b/recipe/tb2_eval/infer/setup/pre_entry.sh deleted file mode 100644 index a9e787a5d1..0000000000 --- a/recipe/tb2_eval/infer/setup/pre_entry.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# --------------------------------------------------------------------------- -# pre_entry.sh — tb2-eval pre-agent environment setup. -# -# Runs inside the infer sandbox, after the task tree has been seeded under -# $TASK_WORKSPACE and before the agent starts. -# -# Each tb2-eval task uses its own pre-built docker image (specified in -# task.toml [environment] docker_image). At runtime we seed the workspace -# ourselves: ``environment/files/*`` are already uploaded into -# $TASK_WORKSPACE/ by the pipeline's UploadHook — this script is a -# placeholder kept for parity and to run any task-level setup.sh if present. -# --------------------------------------------------------------------------- -set -euo pipefail - -: "${TASK_WORKSPACE:?TASK_WORKSPACE not set}" - -# Optional: run upstream setup.sh if a task ships one. -if [ -f "$TASK_WORKSPACE/environment/setup.sh" ]; then - (cd "$(dirname "$TASK_WORKSPACE")" && bash "$TASK_WORKSPACE/environment/setup.sh" "$TASK_WORKSPACE") -fi - -exit 0 \ No newline at end of file diff --git a/recipe/tb2_eval/judgers/rule_grader/grader.py b/recipe/tb2_eval/judgers/rule_grader/grader.py deleted file mode 100644 index 674236272e..0000000000 --- a/recipe/tb2_eval/judgers/rule_grader/grader.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -"""Emit a ``JudgerResult`` line to stdout that matches official TB2 scoring. - -The bench's ``tests/test.sh`` writes the authoritative binary outcome to -``/logs/verifier/reward.txt`` (``1`` iff every pytest invocation in that -script exited 0, else ``0``) — including the multi-pytest case in tasks -like ``fix-code-vulnerability``. We read that file as the source of truth -for ``total``. CTRF is parsed only for per-test observability in the -``criteria`` field and never used for scoring. -""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - - -def _log_tail(path: Path, bytes_: int = 800) -> str: - try: - return path.read_text(errors="replace")[-bytes_:] - except Exception: - return "" - - -def _read_reward(path: Path) -> float | None: - try: - raw = path.read_text().strip() - except Exception: - return None - if not raw: - return None - try: - return float(raw) - except ValueError: - return None - - -def _parse_criteria(ctrf_path: Path) -> tuple[dict[str, dict[str, float]], int, str | None]: - """Parse CTRF into per-test criteria for observability only. - - Returns: - tuple[dict[str, dict[str, float]], int, str | None]: ``(criteria, test_count, - error)``. ``criteria`` maps test name to ``{"score": 0.0|1.0}``. ``error`` is - ``None`` on success or a message describing why CTRF was unreadable. - """ - try: - data = json.loads(ctrf_path.read_text()) - except Exception as exc: - return {}, 0, f"ctrf missing/parse failed: {exc}" - tests = (data.get("results", {}) or {}).get("tests", []) or [] - criteria: dict[str, dict[str, float]] = {} - for t in tests: - name = t.get("name", "unknown") - passed = t.get("status") == "passed" - criteria[name] = {"score": 1.0 if passed else 0.0} - return criteria, len(tests), None - - -def main() -> int: - ap = argparse.ArgumentParser() - ap.add_argument("--ctrf", required=True) - ap.add_argument("--log", required=True) - ap.add_argument("--reward-file", required=True) - ap.add_argument("--pytest-rc", type=int, required=True) - ap.add_argument("--judger-name", default="rule_grader") - args = ap.parse_args() - - ctrf_path = Path(args.ctrf) - log_path = Path(args.log) - reward_path = Path(args.reward_file) - - reward = _read_reward(reward_path) - criteria, test_count, ctrf_error = _parse_criteria(ctrf_path) - - result: dict = { - "judger_name": args.judger_name, - "criteria": criteria, - "metadata": { - "pytest_rc": args.pytest_rc, - "test_count": test_count, - "reward_source": "reward.txt" if reward is not None else "pytest_rc", - }, - } - - if reward is not None: - result["total"] = round(reward, 4) - else: - # reward.txt missing/unreadable: fall back to test.sh exit code. - result["total"] = 1.0 if args.pytest_rc == 0 else 0.0 - result["error"] = ( - f"reward file unreadable at {reward_path}; fell back to pytest_rc. " - f"log tail: {_log_tail(log_path)}" - ) - - if ctrf_error is not None: - # CTRF is observability-only; surface the parse error but don't change total. - result.setdefault("error", ctrf_error) - - print(json.dumps(result, ensure_ascii=False)) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/recipe/tb2_eval/judgers/rule_grader/run.sh b/recipe/tb2_eval/judgers/rule_grader/run.sh deleted file mode 100644 index 7d79329c98..0000000000 --- a/recipe/tb2_eval/judgers/rule_grader/run.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# --------------------------------------------------------------------------- -# run.sh — tb2-eval verifier entrypoint. -# -# The bench ships its own test harness at /tests/test.sh which: -# - installs pytest + pytest-json-ctrf + /tests/test_requirements.txt -# - runs /tests/test_outputs.py with --ctrf /logs/verifier/ctrf.json -# - writes the authoritative 0/1 outcome to /logs/verifier/reward.txt -# (for multi-pytest tasks like fix-code-vulnerability, this is the -# AND of all pytest exit codes — matching official TB2 scoring) -# -# We invoke it and hand both reward.txt and the resulting CTRF to our -# shared emitter. reward.txt drives the JudgerResult `total`; CTRF is -# parsed for per-test observability only. -# -# Env: -# $TESTS_DIR tests directory inside the sandbox (default: /tests) -# $JUDGER_DIR where grader.py lives -# $JUDGER_NAME name to emit in the JudgerResult (default: rule_grader) -# --------------------------------------------------------------------------- -set -uo pipefail - -TESTS_DIR="${TESTS_DIR:-/tests}" -JUDGER_DIR="${JUDGER_DIR:-$(dirname "$0")}" -JUDGER_NAME="${JUDGER_NAME:-rule_grader}" - -TEST_LOG=/tmp/tb2_eval_test.log -: > "$TEST_LOG" - -chmod +x "$TESTS_DIR/test.sh" 2>/dev/null || true -bash "$TESTS_DIR/test.sh" > "$TEST_LOG" 2>&1 -TEST_RC=$? - -PY=/mnt/llm-ai-infra/miniconda3/envs/train/bin/python3 -if [ ! -x "$PY" ]; then - PY=$(command -v python3 || echo python) -fi - -"$PY" "$JUDGER_DIR/grader.py" \ - --ctrf /logs/verifier/ctrf.json \ - --log "$TEST_LOG" \ - --reward-file /logs/verifier/reward.txt \ - --pytest-rc "$TEST_RC" \ - --judger-name "$JUDGER_NAME" \ No newline at end of file diff --git a/recipe/tb2_eval/local_run/__init__.py b/recipe/tb2_eval/local_run/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/recipe/tb2_eval/local_run/__main__.py b/recipe/tb2_eval/local_run/__main__.py deleted file mode 100644 index f40ef0defa..0000000000 --- a/recipe/tb2_eval/local_run/__main__.py +++ /dev/null @@ -1,170 +0,0 @@ -"""CLI for running rollout samples through this recipe's pipeline. - -Usage: - python -m recipe.tb2_eval.local_run --limit 5 --output results.jsonl -""" - -from __future__ import annotations - -import argparse -import asyncio -import importlib.util -import json -import os -import sys -import traceback -from copy import deepcopy -from pathlib import Path -from typing import Any - -from lagent.utils import create_object - -from xtuner.v1.data_proto.rl_data import RolloutState -from xtuner.v1.rl.agent_loop.sandbox_agent_loop import AgentInSandboxLoop, AgentRolloutItem -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.trace import init_writer - - -def _load_config(path: Path) -> Any: - spec = importlib.util.spec_from_file_location("local_run_config", path) - if spec is None or spec.loader is None: - raise RuntimeError(f"cannot load {path}") - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - return mod - - -def _inject_session_id(runner_cfg: dict[str, Any], session_id: str) -> None: - for entry in runner_cfg.get("infer", {}).get("entries", []): - if isinstance(entry, dict) and entry.get("name") == "start_agent_daemon": - entry.setdefault("env", {})["XTUNER_SESSION_ID"] = session_id - - -async def _run_one(dataset: Any, item: AgentRolloutItem) -> dict[str, Any]: - runner_cfg = item.pipeline or dataset.pipeline - if isinstance(runner_cfg, dict): - runner_cfg = deepcopy(runner_cfg) - _inject_session_id(runner_cfg, str(item.uid)) - runner = create_object(runner_cfg) - else: - runner = runner_cfg - result = await runner.run(item) - dumped = result.model_dump(mode="json", exclude={"artifacts", "pipeline"}) - dumped["artifacts"] = _serialize_artifacts(result.artifacts) - return dumped - - -async def _run_agentloop(dataset: Any, item: AgentRolloutItem, agent_loop: AgentInSandboxLoop) -> dict[str, Any]: - item = item.model_copy(update={"pipeline": item.pipeline or dataset.pipeline}, deep=True) - if item.task_root is None: - raise ValueError("AgentRolloutItem.task_root is required.") - instruction_path = Path(item.task_root) / item.instruction - content = instruction_path.read_text(encoding="utf-8") - prompt_ids = agent_loop.tokenizer.encode(content, add_special_tokens=False) - - rollout_state = RolloutState( - message=[{"role": "user", "content": content}], - prompt_ids=prompt_ids, - num_tokens=len(prompt_ids), - data_source={item.data_source: 1.0}, - reward_model={"style": item.data_source}, - uid=item.uid, - message_uid=item.group_id, - extra_fields={"rollout_item": item}, - ) - result = await agent_loop.generate_sample(rollout_state) - return { - "id": item.id, - "status": result.status.value, - "reward": result.reward["score"] if result.reward and "score" in result.reward else None, - "error": result.error_msg, - "finish_reason": result.finish_reason, - "response": result.response, - "response_ids_len": len(result.response_ids or []), - "prompt_ids_len": len(result.prompt_ids or []), - "agent_artifacts": _serialize_artifacts(result.extra_fields.get("agent_artifacts", {})), - } - - -def _serialize_artifacts(artifacts: dict[str, Any]) -> dict[str, Any]: - """Keep text artifacts as-is; collapse bytes blobs to a size placeholder.""" - out: dict[str, Any] = {} - for key, value in artifacts.items(): - if isinstance(value, (bytes, bytearray)): - out[key] = f"<{len(value)} bytes>" - else: - out[key] = value - return out - - -async def main_async(args: argparse.Namespace) -> int: - init_writer() - cfg = _load_config(Path(args.config)) - dataset = cfg.dataset - - pairs: list[tuple[Path, AgentRolloutItem]] - if args.tasks: - wanted = {str(Path(p).resolve()) for p in args.tasks} - pairs = [(td, item) for td, item in dataset.iter_tasks() if str(td.resolve()) in wanted] - else: - pairs = list(dataset.iter_tasks()) - if args.limit: - pairs = pairs[: args.limit] - if not pairs: - print("no tasks to run", file=sys.stderr) - return 1 - - print(f"running {len(pairs)} task(s) (concurrency={args.concurrency})", file=sys.stderr) - sem = asyncio.Semaphore(max(1, args.concurrency)) - agent_loop = None - if args.mode == "agentloop": - if not args.hf_checkpoint: - raise ValueError("--hf-checkpoint is required in agentloop mode.") - agent_loop = AgentInSandboxLoop(hf_checkpoint=args.hf_checkpoint) - - async def guarded(idx: int, td: Path, item: AgentRolloutItem) -> dict[str, Any]: - async with sem: - item = item.model_copy(update={"group_id": 0, "uid": idx}) - try: - if args.mode == "agentloop": - assert agent_loop is not None - return await _run_agentloop(dataset, item, agent_loop) - return await _run_one(dataset, item) - except Exception as exc: - tb = traceback.format_exc() - print(f"[{item.id}] uncaught: {type(exc).__name__}: {exc}\n{tb}", file=sys.stderr) - return {"id": item.id, "task_dir": str(td), "error": f"{type(exc).__name__}: {exc}", "traceback": tb} - - out_fp = open(args.output, "w") if args.output else None - try: - coros = [guarded(i, td, item) for i, (td, item) in enumerate(pairs)] - for coro in asyncio.as_completed(coros): - result = await coro - line = json.dumps(result, ensure_ascii=False) - if out_fp is not None: - out_fp.write(line + "\n") - out_fp.flush() - print(json.dumps({k: result.get(k) for k in ("id", "status", "reward", "error")}, ensure_ascii=False)) - finally: - if out_fp is not None: - out_fp.close() - return 0 - - -def main() -> int: - parser = argparse.ArgumentParser(description="Run rollout samples through this recipe's pipeline.") - parser.add_argument("--config", default=str(Path(__file__).parent / "config.py")) - parser.add_argument("--tasks", nargs="*", help="Specific task dirs to run; default: all from dataset") - parser.add_argument("--limit", type=int, default=0, help="Limit total tasks (0=all)") - parser.add_argument("--concurrency", type=int, default=4) - parser.add_argument("--output", help="Optional JSONL path to dump full per-sample results") - parser.add_argument("--mode", choices=("runner", "agentloop"), default="runner") - parser.add_argument( - "--hf-checkpoint", - default=os.environ.get("HF_CHECKPOINT") or os.environ.get("QWEN3P5_VL_MODEL_PATH"), - help="Tokenizer/processor checkpoint used by agentloop mode.", - ) - return asyncio.run(main_async(parser.parse_args())) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/recipe/tb2_eval/local_run/config.py b/recipe/tb2_eval/local_run/config.py deleted file mode 100644 index 9c0eaf0de6..0000000000 --- a/recipe/tb2_eval/local_run/config.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Run tb2-eval tasks with the default agent pipeline. - -Invoke via:: - - python -m recipe.tb2_eval.local_run --limit 5 -""" - -from recipe.tb2_eval.local_run.dataset import TB2EvalBench -from recipe.tb2_eval.pipeline import runner - - -dataset = TB2EvalBench( - jsonl_path="/mnt/shared-storage-user/llmit/user/wangziyi/projs/terminalbench2-harbor-p-cluster/tb2_eval_tasks.jsonl", - pipeline=runner, -) diff --git a/recipe/tb2_eval/local_run/dataset.py b/recipe/tb2_eval/local_run/dataset.py deleted file mode 100644 index f4eedfed82..0000000000 --- a/recipe/tb2_eval/local_run/dataset.py +++ /dev/null @@ -1,71 +0,0 @@ -"""tb2-eval dataset: read prebuilt JSONL records, yield AgentRolloutItem. - -The JSONL is produced by ``recipe.tb2_eval.scripts.generate_jsonl``. Each -record fully describes a task (id, instruction, tags, ability, -pipeline_overrides), so this module does nothing but field-mapping — no -``task.toml`` parsing. -""" - -from __future__ import annotations - -import json -import logging -from pathlib import Path -from typing import Iterator - -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.runner import Runner -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.schemas import AgentRolloutItem - - -logger = logging.getLogger(__name__) - - -class TB2EvalBench: - """tb2-eval dataset iterator. One pipeline (with per-task overrides) for - every task it yields.""" - - name = "tb2-eval" - - def __init__( - self, - jsonl_path: str | Path, - *, - pipeline: Runner | dict, - skip_ids: set[str] | list[str] | None = None, - ): - """ - Args: - jsonl_path (str | Path): JSONL produced by - ``scripts/generate_jsonl.py`` (one record per task). - pipeline: Shared Runner config for every task under this dataset. - skip_ids (set[str] | list[str] | None): Sample ids to exclude. - """ - self.pipeline = pipeline - self.skip_ids = set(skip_ids or ()) - self._records = [ - json.loads(line) - for line in Path(jsonl_path).read_text(encoding="utf-8").splitlines() - if line.strip() - ] - - def iter_tasks(self) -> Iterator[tuple[Path, AgentRolloutItem]]: - for rec in self._records: - if rec["id"] in self.skip_ids: - logger.info("skipping %s (in skip_ids)", rec["id"]) - continue - try: - yield Path(rec["task_dir"]), self.load_task(rec) - except Exception as exc: - logger.warning("skipping %s: %s", rec.get("id"), exc) - - def load_task(self, rec: dict) -> AgentRolloutItem: - return AgentRolloutItem( - id=rec["id"], - data_source=self.name, - ability=rec.get("ability"), - tags=rec.get("tags", []), - instruction=rec["instruction"], - task_root=Path(rec["task_dir"]), - pipeline=self.pipeline, - pipeline_overrides=rec.get("pipeline_overrides", {}), - ) diff --git a/recipe/tb2_eval/pipeline.py b/recipe/tb2_eval/pipeline.py deleted file mode 100644 index d9532c8418..0000000000 --- a/recipe/tb2_eval/pipeline.py +++ /dev/null @@ -1,323 +0,0 @@ -"""tb2-eval rollout runner config. - -Layout of a tb2-eval task (flat, no category subdirs):: - - / - task.toml — metadata + per-task docker_image - instruction.md — natural-language task - environment/Dockerfile — baked into the per-task image - environment/files/ — data files the Dockerfile COPYs to /app/ - tests/test.sh — bench-provided verifier entrypoint - tests/test_outputs.py — pytest module - tests/test_requirements.txt - -Unlike tb2-rl (which uses a fixed ``t-data-processing-v1`` image for all -tasks), each eval task uses its own pre-built docker image specified in -``task.toml [environment] docker_image``. The image is passed at runtime in -the pipeline config's ``sandboxes.main`` entry (see ``tb2_eval_tokenize_fn.py``). - -The ``DEFAULT_SANDBOX`` below uses a placeholder image; in practice the -per-task image always overrides it via the sample's pipeline config. -""" - -from __future__ import annotations - -import os -from pathlib import Path -from types import SimpleNamespace -from typing import Any - -from lagent.serving.sandbox.providers.gateway import GatewayProvider - -from xtuner.v1.rl.agent_loop.sandbox_agent_loop import ( - AgentSpec, - DetachedShellEntry, - DownloadHook, - EntryCapture, - EntryDiagnostics, - EntryFailurePolicy, - EntryMonitor, - EntryProcessHealthCheck, - ExecHook, - InstallLagent, - Judger, - JudgerValidator, - ParseJudgerStdout, - PickAgent, - ReadFileHook, - ReturnCodeFileCompletion, - Runner, - RunAgentInstallDeps, - SandboxHealthCheck, - SandboxPool, - SandboxSpec, - SandboxStage, - ShellEntry, - UploadAgentConfigSource, - UploadChosenAgent, - UploadHook, -) - -HERE = Path(__file__).resolve().parent -SETUP_DIR = HERE / "infer" / "setup" -AGENT_TEMPLATES = HERE / "infer" / "agents" -JUDGERS = HERE / "judgers" - - -# ───────────────────────────────────────────────────────────────── -# Sandbox runtime paths -# ───────────────────────────────────────────────────────────────── - -PATHS = SimpleNamespace( - setup_dir="/tmp/infer/setup", - judger_dir="/tmp/judgers/rule_grader", - agent_config="/tmp/agent_config.py", - agent_sock="/tmp/lagent_agent.sock", - agent_daemon_log="/tmp/agent_daemon.log", - agent_daemon_pid="/tmp/agent_daemon.pid", - agent_response="/tmp/agent_response.txt", - trajectory="/tmp/trajectory.json", - message="/tmp/message.json", - tests="/tests", -) - -SHARED_LAGENT_PYTHON = os.getenv( - "LAGENT_PYTHON", - "/mnt/llm-ai-infra/miniconda3/envs/train/bin/python", -) -LAGENT_PYTHONPATH = "/app:/tmp:${PYTHONPATH:-}" - -START_AGENT_DAEMON = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli start-agent-daemon " - f"--mode agent --config {PATHS.agent_config} --sock {PATHS.agent_sock} " - f"--pid-file {PATHS.agent_daemon_pid} --log {PATHS.agent_daemon_log} --truncate-log" -) -WAIT_AGENT_DAEMON = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli wait-ready " - f"--sock {PATHS.agent_sock} --pid-file {PATHS.agent_daemon_pid} " - f"--log {PATHS.agent_daemon_log} --timeout 60" -) -AGENT_CHAT = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli chat " - f"--sock {PATHS.agent_sock} --instruction-file \"/app/instruction.md\" " - f"--response-out {PATHS.agent_response} --log {PATHS.agent_daemon_log}" -) -AGENT_STATE_DICT = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli state-dict " - f"--sock {PATHS.agent_sock} --trajectory-out {PATHS.trajectory} " - f"--log {PATHS.agent_daemon_log}" -) -AGENT_GET_MESSAGES = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli get-messages " - f"--sock {PATHS.agent_sock} --message-out {PATHS.message} " - f"--log {PATHS.agent_daemon_log}" -) -STOP_AGENT_DAEMON = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli shutdown " - f"--sock {PATHS.agent_sock} --pid-file {PATHS.agent_daemon_pid} " - f"--log {PATHS.agent_daemon_log}" -) - - -def entry_failure(*, include_entry_output: bool = False) -> dict[str, Any]: - files = [dict(path=PATHS.agent_daemon_log, key="daemon_log", optional=True)] - if include_entry_output: - files.extend( - [ - dict(entry_file="stdout", key="entry_stdout", optional=True), - dict(entry_file="stderr", key="entry_stderr", optional=True), - ] - ) - return dict( - type=EntryFailurePolicy, - diagnostics=dict(type=EntryDiagnostics, files=files), - diagnostic_error_policy="preserve_entry_error", - ) - - -# ───────────────────────────────────────────────────────────────── -# Defaults -# ───────────────────────────────────────────────────────────────── - -DEFAULT_WORKSPACE = "/app" -DEFAULT_AGENTS: list[dict[str, Any]] = [ - dict( - type=AgentSpec, - name="interndp", - config="config.py", - install="install-deps.sh", - tools="tools", - weight=1.0, - ) -] - -# Placeholder image — each task overrides this via its pipeline config. -DEFAULT_SANDBOX = dict( - type=SandboxSpec, - image="tb2-eval-placeholder", - ttl_seconds=11700, - key=os.getenv("SANDBOX_PROVIDER_KEY", "lkk-as8dHd2Q"), - workspace_path=DEFAULT_WORKSPACE, -) -DEFAULT_PROVIDER = { - "type": GatewayProvider, - "gateway_url": os.getenv("SANDBOX_GATEWAY_URL", "http://env-gateway.ailab.ailab.ai"), -} - -runner = dict( - type=Runner, - pool=dict( - type=SandboxPool, - provider=DEFAULT_PROVIDER, - specs={"main": DEFAULT_SANDBOX}, - ), - infer=dict( - type=SandboxStage, - sandbox="main", - pre=[ - # ── Stage 1: workspace setup ────────────────────────────────── - # Create the workspace dir, upload bench-level setup scripts, - # then run pre_entry.sh for any per-bench bootstrap. - dict(type=ExecHook, cmd=f"mkdir -p {DEFAULT_WORKSPACE}"), - dict( - type=UploadHook, - mappings=[ - dict(base=str(SETUP_DIR), source="*", target=PATHS.setup_dir + "/", flatten=True), - ], - ), - dict( - type=ExecHook, - cmd=f"bash {PATHS.setup_dir}/pre_entry.sh", - env={"TASK_WORKSPACE": DEFAULT_WORKSPACE}, - timeout=300, - ), - - # ── Stage 2: task data ──────────────────────────────────────── - # Place per-task files (instruction + environment/files/*) under - # the workspace so the agent sees them. - dict(type=UploadHook, mappings=[dict(source="instruction.md", target=f"{DEFAULT_WORKSPACE}/instruction.md")]), - dict( - type=UploadHook, - mappings=[dict(base="environment/files", source="**/*", target=f"{DEFAULT_WORKSPACE}/")], - ), - - # ── Stage 3: agent harness ──────────────────────────────────── - # Install lagent runtime, pick an agent variant, upload its - # harness + config, run its install-deps. - dict(type=InstallLagent, lagent_src_dir=os.getenv("LAGENT_SRC_DIR", "/mnt/shared-storage-user/llmit/user/liukuikun/workspace/lagent")), - dict(type=PickAgent, agents=DEFAULT_AGENTS, template_root=str(AGENT_TEMPLATES)), - dict(type=UploadChosenAgent, target_dir=f"{DEFAULT_WORKSPACE}/agent/"), - dict(type=UploadAgentConfigSource, dst=PATHS.agent_config), - dict(type=RunAgentInstallDeps, workspace=DEFAULT_WORKSPACE), - ], - entries=[ - dict( - type=ShellEntry, - name="start_agent_daemon", - cmd=START_AGENT_DAEMON, - timeout=60, - failure=entry_failure(), - env={ - "RL_LLM_MODEL": os.environ.get('RL_LLM_MODEL', ''), - } - ), - dict( - type=ShellEntry, - name="wait_agent_daemon", - cmd=WAIT_AGENT_DAEMON, - timeout=90, - failure=entry_failure(), - ), - dict( - type=DetachedShellEntry, - name="agent_chat", - cmd=AGENT_CHAT, - capture=dict(type=EntryCapture, root="/tmp", prefix="xt_entry"), - monitor=dict( - type=EntryMonitor, - timeout=10800, - probes=[ - dict(type=ReturnCodeFileCompletion, interval_sec=2.0), - dict(type=SandboxHealthCheck, interval_sec=10.0, probe_timeout_sec=10.0, fail_after=3), - dict(type=EntryProcessHealthCheck, interval_sec=10.0, probe_timeout_sec=10.0, fail_after=2), - ], - ), - failure=entry_failure(include_entry_output=True), - ), - # dict( - # type=ShellEntry, - # name="agent_state_dict", - # cmd=AGENT_STATE_DICT, - # timeout=300, - # failure=entry_failure(), - # ), - dict( - type=ShellEntry, - name="agent_get_messages", - cmd=AGENT_GET_MESSAGES, - timeout=300, - failure=entry_failure(), - ), - # `|| true` 让 stop 失败不污染 stage status —— sandbox 一会儿 - # 也会被 pool.release_all 释放,daemon 自然死。debug 想保留 - # daemon 的话注释掉这条 entry 即可。 - dict( - type=ShellEntry, - name="stop_agent_daemon", - cmd=STOP_AGENT_DAEMON + " || true", - timeout=30, - ), - ], - post=[ - dict(type=ReadFileHook, path=PATHS.message, key="message"), - dict(type=ReadFileHook, path=PATHS.agent_response, key="agent_response"), - # workspace tar(debug 时本地解压看产物,失败 silent + warning) - # dict(type=DownloadHook, paths=[DEFAULT_WORKSPACE]), - ], - ), - validate=dict( - type=JudgerValidator, - judgers=[ - dict( - type=Judger, - name="rule_grader", - weight=1.0, - stage=dict( - type=SandboxStage, - sandbox="main", - pre=[ - dict( - type=UploadHook, - mappings=[ - dict(base=str(JUDGERS / "rule_grader"), source="*", target=PATHS.judger_dir + "/", flatten=True), - dict(base="tests", source="**/*", target=f"{PATHS.tests}/"), - ], - ), - dict(type=ExecHook, cmd=f"chmod +x {PATHS.tests}/test.sh || true", optional=True), - ], - entries=[ - dict( - type=ShellEntry, - name="run_tests", - cmd=f"bash {PATHS.judger_dir}/run.sh", - env={ - "JUDGER_NAME": "rule_grader", - "TASK_WORKSPACE": DEFAULT_WORKSPACE, - "TESTS_DIR": PATHS.tests, - "JUDGER_DIR": PATHS.judger_dir, - }, - timeout=900, - ) - ], - post=[dict(type=ParseJudgerStdout, judger_name="rule_grader")], - ), - ) - ], - ), -) diff --git a/recipe/tb2_eval/scripts/generate_jsonl.py b/recipe/tb2_eval/scripts/generate_jsonl.py deleted file mode 100644 index eb29b8d323..0000000000 --- a/recipe/tb2_eval/scripts/generate_jsonl.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python3 -"""Scan tb2-eval ``tasks/`` tree and emit one JSONL record per task directory. - -Each record fully describes a task — downstream datasets do nothing but -field-mapping; ``task.toml`` parsing happens here only. - -Record fields: - task_dir absolute path to the task directory - id sample id (task.toml `id` or dir name) - instruction relative path to the instruction file - tags list[str] - ability domain / category - pipeline_overrides dict to deep-merge into Runner._pool config -""" - -from __future__ import annotations - -import argparse -import json -import sys -import tomllib -from pathlib import Path -from typing import Any - - -def iter_task_dirs(tasks_root: Path) -> list[Path]: - """Return sorted direct-child task directories (flat layout).""" - roots: list[Path] = [] - for toml in sorted(tasks_root.rglob("task.toml")): - parent = toml.parent - try: - rel = parent.relative_to(tasks_root) - except ValueError: - continue - if len(rel.parts) == 1: - roots.append(parent) - return roots - - -def _load_task_toml(path: Path) -> dict: - raw = tomllib.loads(path.read_text(encoding="utf-8")) - if "task" in raw and isinstance(raw["task"], dict): - for k, v in raw.pop("task").items(): - raw.setdefault(k, v) - return raw - - -def build_record(task_dir: Path) -> dict[str, Any]: - toml = _load_task_toml(task_dir / "task.toml") - tags = list((toml.get("metadata") or {}).get("tags") or toml.get("tags") or []) - image = (toml.get("environment") or {}).get("docker_image") - if not image: - raise KeyError(f"[environment] docker_image missing in {task_dir / 'task.toml'}") - return { - "task_dir": str(task_dir.resolve()), - "id": toml.get("id") or task_dir.name, - "instruction": "instruction.md", - "tags": tags, - "ability": toml.get("domain") or (tags[1] if len(tags) > 1 else None), - "pipeline_overrides": {"pool": {"specs": {"main": {"image": image}}}}, - } - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) - parser.add_argument( - "--tasks-root", - type=Path, - default=Path( - "/mnt/shared-storage-user/llmit/user/wangziyi/projs/terminalbench2-harbor-p-cluster/terminal-bench-2" - ), - help="Absolute path to the eval tasks directory.", - ) - parser.add_argument( - "-o", - "--output", - type=Path, - default=None, - help="Write JSONL here. Defaults to /../tb2_eval_tasks.jsonl", - ) - args = parser.parse_args() - - tasks_root = args.tasks_root.resolve() - if not tasks_root.is_dir(): - print(f"tasks root is not a directory: {tasks_root}", file=sys.stderr) - return 1 - - output = args.output or (tasks_root.parent / "tb2_eval_tasks.jsonl") - skipped = 0 - with open(output, "w", encoding="utf-8") as fp: - for task_dir in iter_task_dirs(tasks_root): - try: - rec = build_record(task_dir) - except Exception as exc: - print(f"warning: skipping {task_dir}: {exc}", file=sys.stderr) - skipped += 1 - continue - fp.write(json.dumps(rec, ensure_ascii=False) + "\n") - print(f"wrote: {output}" + (f" ({skipped} skipped)" if skipped else "")) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/recipe/tb2_eval/xtuner_dataset.py b/recipe/tb2_eval/xtuner_dataset.py deleted file mode 100644 index a594bfb832..0000000000 --- a/recipe/tb2_eval/xtuner_dataset.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -"""tb2-eval XTuner training tokenize function. - -Reads jsonl records produced by ``recipe.tb2_eval.scripts.generate_jsonl`` -and emits ``RolloutState`` for the trainer. The per-task -``AgentRolloutItem`` (carrying pipeline + per-task overrides) lives in -``RolloutState.extra_fields["rollout_item"]`` for the runner to consume at -rollout time. -""" - -from pathlib import Path - -from pydantic import BaseModel, ConfigDict -from transformers import PreTrainedTokenizer - -from xtuner.v1.data_proto.rl_data import RolloutState -from xtuner.v1.datasets.data_item import CacheItem -from xtuner.v1.datasets.utils import CachableTokenizeFunction -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.schemas import AgentRolloutItem -from xtuner.v1.utils import get_logger - -logger = get_logger() - - -class RLTB2EvalTokenizeFn(CachableTokenizeFunction[RolloutState]): - """tb2-eval tokenize function aligned with ``RLTextTokenizeFn``.""" - - DATA_SOURCE_NAME = "tb2-eval" - PIPELINE_DOTTED = "recipe.tb2_eval.pipeline.runner" - - def __init__( - self, - tokenizer: PreTrainedTokenizer, - max_length: int | None = None, - tools_schema: list | None = None, - data_judger_mapping: dict | None = None, - system_prompt: str | None = None, - ): - super().__init__(tokenizer) - self.max_length = max_length - self.tools_schema = tools_schema if tools_schema is not None else [] - self.data_judger_mapping = data_judger_mapping - self.system_prompt = system_prompt - - def __call__(self, item: dict, **kwargs) -> RolloutState | CacheItem: - task_dir = Path(item["task_dir"]) - instruction_path = task_dir / item["instruction"] - context = instruction_path.read_text(encoding="utf-8") - - message = [{"role": "user", "content": context}] - if self.system_prompt: - message = [{"role": "system", "content": self.system_prompt}] + message - - raw_prompt = self.tokenizer.apply_chat_template( - message, tools=self.tools_schema, add_generation_prompt=True, tokenize=False - ) - data = self.tokenizer(raw_prompt, add_special_tokens=False) - prompt_token_ids = data["input_ids"] - num_tokens = len(prompt_token_ids) - - if self.state == "cache": - if self.max_length is not None and num_tokens > self.max_length: - num_tokens = 0 # filtered out by the dataset filter - return CacheItem( - num_tokens=num_tokens, - proxy_attn_flops=float(num_tokens), - ) - - if self.max_length is not None: - assert num_tokens <= self.max_length, f"num_tokens {num_tokens} > max_length {self.max_length}" - - if self.data_judger_mapping is not None: - data_source = self.data_judger_mapping.get(self.DATA_SOURCE_NAME) - else: - data_source = {self.DATA_SOURCE_NAME: 1.0} - - rollout_item = AgentRolloutItem( - id=item["id"], - data_source=self.DATA_SOURCE_NAME, - ability=item.get("ability"), - tags=item.get("tags", []), - instruction=item["instruction"], - task_root=task_dir, - pipeline=self.PIPELINE_DOTTED, - pipeline_overrides=item.get("pipeline_overrides", {}), - ) - - return RolloutState( - prompt_ids=prompt_token_ids, - message=message, - reward_model={"style": self.DATA_SOURCE_NAME}, - num_tokens=num_tokens, - proxy_attn_flops=float(num_tokens), - data_source=data_source, - extra_fields={ - "rollout_item": rollout_item, - "ability": item.get("ability"), - }, - ) - - def hash(self) -> str: - return type(self).__name__ - - -class RLTB2EvalTokenizeFnConfig(BaseModel): - model_config = ConfigDict(title="tb2-eval RL dataset config for xtuner", extra="forbid") - max_length: int | None = None - tools_schema: list | None = None - data_judger_mapping: dict | None = None - system_prompt: str | None = None - - def build(self, tokenizer: PreTrainedTokenizer, **kwargs) -> RLTB2EvalTokenizeFn: - return RLTB2EvalTokenizeFn( - tokenizer=tokenizer, - max_length=self.max_length, - tools_schema=self.tools_schema, - data_judger_mapping=self.data_judger_mapping, - system_prompt=self.system_prompt, - ) diff --git a/recipe/tb2_rl/__init__.py b/recipe/tb2_rl/__init__.py deleted file mode 100644 index a544ffa927..0000000000 --- a/recipe/tb2_rl/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""tb2-rl bench adapter for the sandbox_agent_loop framework.""" - -from .local_run.dataset import TB2RLBench -from .pipeline import runner - -__all__ = ["TB2RLBench", "runner"] diff --git a/recipe/tb2_rl/infer/agents/interndp/__init__.py b/recipe/tb2_rl/infer/agents/interndp/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/recipe/tb2_rl/infer/agents/interndp/config.py b/recipe/tb2_rl/infer/agents/interndp/config.py deleted file mode 100644 index f89769116e..0000000000 --- a/recipe/tb2_rl/infer/agents/interndp/config.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Agent config for interndp (tb2-rl). Paths read from env at daemon-start time.""" - -import os - -from lagent.agents.fc_agent import get_tool_prompt - -tool_template = """# Tools - -You have access to the following functions: - - -{tools} - - -If you choose to call a function ONLY reply in the following format with NO suffix: - - - - -value_1 - - -This is the value for the second parameter -that can span -multiple lines - - - - - -Reminder: -- Function calls MUST follow the specified format: an inner block must be nested within XML tags -- Required parameters MUST be specified -- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after -- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls -""" - -workspace = os.environ.get("TASK_WORKSPACE", "/app") -skills_root = f"{workspace}/skills" - -model = dict( - type="lagent.llms.model.AsyncAPIClient", - model=dict( - model=os.environ.get( - "RL_LLM_MODEL", - "", - ), - base_url=os.environ.get( - "RL_LLM_BASE_URL", - "http://s-20260104203038-22bhb.ailab-evalservice.pjh-service.org.cn/v1", - ), - api_key=os.environ.get("RL_LLM_API_KEY", "sk-admin"), - ), - sample_params=dict(temperature=0.7, top_p=1.0, top_k=50), - timeout=350, - max_retry=2, - sleep_interval=5, - extra_body=dict(spaces_between_special_tokens=False), -) - -base_actions = [dict(type="lagent.actions.tmux_action.TerminalExecute")] - -policy_agent = dict( - type="lagent.agents.AsyncAgent", - llm=model, - template=get_tool_prompt(base_actions, template=tool_template), - hooks=[dict(type="lagent.hooks.logger.MessageLogger")], -) - -env_agent = dict( - type="lagent.agents.env_agent.RLEnvAgent", - actions=base_actions, - max_turn=30, - max_tool_response_length=4096, - tool_response_truncate_side="left", - enable_no_thinking_penalty=False, - hooks=[dict(type="lagent.hooks.logger.MessageLogger")], -) - -agent_config = dict( - type="lagent.agents.fc_agent.FunctionCallAgent", - policy_agent=policy_agent, - env_agent=env_agent, - finish_condition='lagent.agents.env_agent.finish_condition_func', - initialize_input=False, - hooks=[dict(type="lagent.hooks.logger.MessageLogger")], - -) - - -if __name__ == "__main__": - import asyncio - - from lagent.utils import create_object - - async def main(): - agent = create_object(agent_config) - res = await agent("list the files in the current directory and tell me which is the largest") - print(res) - - asyncio.run(main()) diff --git a/recipe/tb2_rl/infer/agents/interndp/install-deps.sh b/recipe/tb2_rl/infer/agents/interndp/install-deps.sh deleted file mode 100755 index bdea01a1dd..0000000000 --- a/recipe/tb2_rl/infer/agents/interndp/install-deps.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# interndp agent install-deps stub — runner already uploads lagent source; -# nothing agent-specific to install here. -set -euo pipefail -echo "[interndp] install-deps: ok" diff --git a/recipe/tb2_rl/infer/agents/interndp/tools/__init__.py b/recipe/tb2_rl/infer/agents/interndp/tools/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/recipe/tb2_rl/infer/setup/pre_entry.sh b/recipe/tb2_rl/infer/setup/pre_entry.sh deleted file mode 100755 index 49ae75db02..0000000000 --- a/recipe/tb2_rl/infer/setup/pre_entry.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# --------------------------------------------------------------------------- -# pre_entry.sh — tb2-rl pre-agent environment setup. -# -# Runs inside the infer sandbox, after the task tree has been seeded under -# $TASK_WORKSPACE and before the agent starts. -# -# The upstream Dockerfile convention is ``COPY files/ /app/`` at image-build -# time. We use the pre-built ``t-data-processing-v1`` image instead, so at -# runtime we seed /app/ ourselves: ``environment/files/*`` are already -# uploaded into $TASK_WORKSPACE/ by the pipeline's UploadHook — this script -# is a no-op placeholder kept for parity with claw_bench and to host any -# future task-level setup.sh invocation. -# --------------------------------------------------------------------------- -set -euo pipefail - -: "${TASK_WORKSPACE:?TASK_WORKSPACE not set}" - -# Optional: run upstream setup.sh if a task ever ships one. -if [ -f "$TASK_WORKSPACE/environment/setup.sh" ]; then - (cd "$(dirname "$TASK_WORKSPACE")" && bash "$TASK_WORKSPACE/environment/setup.sh" "$TASK_WORKSPACE") -fi - -exit 0 diff --git a/recipe/tb2_rl/judgers/rule_grader/grader.py b/recipe/tb2_rl/judgers/rule_grader/grader.py deleted file mode 100755 index e31aab8afc..0000000000 --- a/recipe/tb2_rl/judgers/rule_grader/grader.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -"""Parse a CTRF JSON report + test log → emit a ``JudgerResult`` line to stdout. - -Honors ``@pytest.mark.weight(N)`` via pytest-json-ctrf's ``extra``/``metadata`` -section. Tests with no explicit weight get 1.0. -""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - - -def _extract_weight(test: dict) -> float: - for section in ("extra", "metadata"): - extras = test.get(section) or [] - if isinstance(extras, list): - for e in extras: - if isinstance(e, dict) and e.get("key") == "weight": - try: - return float(e.get("value", 1.0)) - except (TypeError, ValueError): - return 1.0 - elif isinstance(extras, dict): - w = extras.get("weight") - if w is not None: - try: - return float(w) - except (TypeError, ValueError): - return 1.0 - return 1.0 - - -def _log_tail(path: Path, bytes_: int = 800) -> str: - try: - return path.read_text(errors="replace")[-bytes_:] - except Exception: - return "" - - -def main() -> int: - ap = argparse.ArgumentParser() - ap.add_argument("--ctrf", required=True) - ap.add_argument("--log", required=True) - ap.add_argument("--pytest-rc", type=int, required=True) - ap.add_argument("--judger-name", default="rule_grader") - args = ap.parse_args() - - ctrf_path = Path(args.ctrf) - log_path = Path(args.log) - - try: - data = json.loads(ctrf_path.read_text()) - except Exception as exc: - print( - json.dumps( - { - "judger_name": args.judger_name, - "total": 0.0, - "error": f"ctrf missing/parse failed: {exc}. log tail: {_log_tail(log_path)}", - }, - ensure_ascii=False, - ) - ) - return 0 - - tests = (data.get("results", {}) or {}).get("tests", []) or [] - criteria: dict[str, dict[str, float]] = {} - for t in tests: - name = t.get("name", "unknown") - passed = t.get("status") == "passed" - weight = _extract_weight(t) - criteria[name] = {"score": 1.0 if passed else 0.0, "weight": weight} - - total_w = sum(c["weight"] for c in criteria.values()) - if total_w <= 0: - total = 0.0 - else: - total = sum(c["score"] * c["weight"] for c in criteria.values()) / total_w - - print( - json.dumps( - { - "judger_name": args.judger_name, - "total": round(total, 4), - "criteria": criteria, - "metadata": { - "pytest_rc": args.pytest_rc, - "test_count": len(tests), - }, - }, - ensure_ascii=False, - ) - ) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/recipe/tb2_rl/judgers/rule_grader/run.sh b/recipe/tb2_rl/judgers/rule_grader/run.sh deleted file mode 100755 index e43032dc93..0000000000 --- a/recipe/tb2_rl/judgers/rule_grader/run.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# --------------------------------------------------------------------------- -# run.sh — tb2-rl rule_grader entrypoint. -# -# The bench ships its own test harness at /tests/test.sh which: -# - installs pytest + pytest-json-ctrf + /tests/test_requirements.txt -# - runs /tests/test_outputs.py with --ctrf /logs/verifier/ctrf.json -# - writes /logs/verifier/reward.txt (1 or 0) -# -# We invoke it and hand the resulting CTRF to grader.py so the stage's -# stdout is a single judger score JSON line (what ParseJudgerStdout expects). -# -# Env: -# $TESTS_DIR tests directory inside the sandbox (default: /tests) -# $JUDGER_DIR where grader.py lives (default: dirname of this script) -# $JUDGER_NAME name to emit in the score JSON (default: rule_grader) -# --------------------------------------------------------------------------- -set -uo pipefail - -TESTS_DIR="${TESTS_DIR:-/tests}" -JUDGER_DIR="${JUDGER_DIR:-$(dirname "$0")}" -JUDGER_NAME="${JUDGER_NAME:-rule_grader}" - -TEST_LOG=/tmp/tb2_rl_test.log -: > "$TEST_LOG" - -chmod +x "$TESTS_DIR/test.sh" 2>/dev/null || true -bash "$TESTS_DIR/test.sh" > "$TEST_LOG" 2>&1 -TEST_RC=$? - -PY=/mnt/llm-ai-infra/miniconda3/envs/train/bin/python3 -if [ ! -x "$PY" ]; then - PY=$(command -v python3 || echo python) -fi - -"$PY" "$JUDGER_DIR/grader.py" \ - --ctrf /logs/verifier/ctrf.json \ - --log "$TEST_LOG" \ - --pytest-rc "$TEST_RC" \ - --judger-name "$JUDGER_NAME" diff --git a/recipe/tb2_rl/local_run/__init__.py b/recipe/tb2_rl/local_run/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/recipe/tb2_rl/local_run/__main__.py b/recipe/tb2_rl/local_run/__main__.py deleted file mode 100644 index ce18058834..0000000000 --- a/recipe/tb2_rl/local_run/__main__.py +++ /dev/null @@ -1,175 +0,0 @@ -"""CLI for running rollout samples through this recipe's pipeline. - -Usage: - python -m recipe.tb2_rl.local_run --limit 5 --output results.jsonl -""" - -from __future__ import annotations - -import argparse -import asyncio -import importlib.util -import json -import os -import sys -import traceback -from copy import deepcopy -from pathlib import Path -from typing import Any - -from lagent.utils import create_object - -from xtuner.v1.data_proto.rl_data import RolloutState -from xtuner.v1.rl.agent_loop.sandbox_agent_loop import AgentInSandboxLoop, AgentRolloutItem -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.trace import init_writer - - -def _load_config(path: Path) -> Any: - spec = importlib.util.spec_from_file_location("local_run_config", path) - if spec is None or spec.loader is None: - raise RuntimeError(f"cannot load {path}") - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - return mod - - -def _inject_session_id(runner_cfg: dict[str, Any], session_id: str) -> None: - for entry in runner_cfg.get("infer", {}).get("entries", []): - if isinstance(entry, dict) and entry.get("name") == "start_agent_daemon": - entry.setdefault("env", {})["XTUNER_SESSION_ID"] = session_id - - -async def _run_one(dataset: Any, item: AgentRolloutItem) -> dict[str, Any]: - runner_cfg = item.pipeline or dataset.pipeline - if isinstance(runner_cfg, dict): - runner_cfg = deepcopy(runner_cfg) - _inject_session_id(runner_cfg, str(item.uid)) - runner = create_object(runner_cfg) - else: - runner = runner_cfg - result = await runner.run(item) - dumped = result.model_dump(mode="json", exclude={"artifacts", "pipeline"}) - dumped["artifacts"] = _serialize_artifacts(result.artifacts) - return dumped - - -async def _run_agentloop(dataset: Any, item: AgentRolloutItem, agent_loop: AgentInSandboxLoop) -> dict[str, Any]: - item = item.model_copy(update={"pipeline": item.pipeline or dataset.pipeline}, deep=True) - if item.task_root is None: - raise ValueError("AgentRolloutItem.task_root is required.") - instruction_path = Path(item.task_root) / item.instruction - content = instruction_path.read_text(encoding="utf-8") - prompt_ids = agent_loop.tokenizer.encode(content, add_special_tokens=False) - - rollout_state = RolloutState( - message=[{"role": "user", "content": content}], - prompt_ids=prompt_ids, - num_tokens=len(prompt_ids), - data_source={item.data_source: 1.0}, - reward_model={"style": item.data_source}, - uid=item.uid, - message_uid=item.group_id, - extra_fields={"rollout_item": item}, - ) - result = await agent_loop.generate_sample(rollout_state) - return { - "id": item.id, - "status": result.status.value, - "reward": result.reward["score"] if result.reward and "score" in result.reward else None, - "error": result.error_msg, - "finish_reason": result.finish_reason, - "response": result.response, - "response_ids_len": len(result.response_ids or []), - "prompt_ids_len": len(result.prompt_ids or []), - "agent_artifacts": _serialize_artifacts(result.extra_fields.get("agent_artifacts", {})), - } - - -def _serialize_artifacts(artifacts: dict[str, Any]) -> dict[str, Any]: - """Keep text artifacts as-is; collapse bytes blobs to a size placeholder.""" - out: dict[str, Any] = {} - for key, value in artifacts.items(): - if isinstance(value, (bytes, bytearray)): - out[key] = f"<{len(value)} bytes>" - else: - out[key] = value - return out - - -async def main_async(args: argparse.Namespace) -> int: - init_writer() - cfg = _load_config(Path(args.config)) - dataset = cfg.dataset - - if args.mode == "agentloop": - import ray - ray.init(address="auto") - - pairs: list[tuple[Path, AgentRolloutItem]] - if args.tasks: - wanted = {str(Path(p).resolve()) for p in args.tasks} - pairs = [(td, item) for td, item in dataset.iter_tasks() if str(td.resolve()) in wanted] - else: - pairs = list(dataset.iter_tasks()) - if args.limit: - pairs = pairs[: args.limit] - if not pairs: - print("no tasks to run", file=sys.stderr) - return 1 - - print(f"running {len(pairs)} task(s) (concurrency={args.concurrency})", file=sys.stderr) - sem = asyncio.Semaphore(max(1, args.concurrency)) - agent_loop = None - if args.mode == "agentloop": - if not args.hf_checkpoint: - raise ValueError("--hf-checkpoint is required in agentloop mode.") - agent_loop = AgentInSandboxLoop(hf_checkpoint=args.hf_checkpoint) - - async def guarded(idx: int, td: Path, item: AgentRolloutItem) -> dict[str, Any]: - async with sem: - item = item.model_copy(update={"group_id": 0, "uid": idx}) - try: - if args.mode == "agentloop": - assert agent_loop is not None - return await _run_agentloop(dataset, item, agent_loop) - return await _run_one(dataset, item) - except Exception as exc: - tb = traceback.format_exc() - print(f"[{item.id}] uncaught: {type(exc).__name__}: {exc}\n{tb}", file=sys.stderr) - return {"id": item.id, "task_dir": str(td), "error": f"{type(exc).__name__}: {exc}", "traceback": tb} - - out_fp = open(args.output, "w") if args.output else None - try: - coros = [guarded(i, td, item) for i, (td, item) in enumerate(pairs)] - for coro in asyncio.as_completed(coros): - result = await coro - print(result) - line = json.dumps(result, ensure_ascii=False) - if out_fp is not None: - out_fp.write(line + "\n") - out_fp.flush() - print(json.dumps({k: result.get(k) for k in ("id", "status", "reward", "error")}, ensure_ascii=False)) - finally: - if out_fp is not None: - out_fp.close() - return 0 - - -def main() -> int: - parser = argparse.ArgumentParser(description="Run rollout samples through this recipe's pipeline.") - parser.add_argument("--config", default=str(Path(__file__).parent / "config.py")) - parser.add_argument("--tasks", nargs="*", help="Specific task dirs to run; default: all from dataset") - parser.add_argument("--limit", type=int, default=0, help="Limit total tasks (0=all)") - parser.add_argument("--concurrency", type=int, default=4) - parser.add_argument("--output", help="Optional JSONL path to dump full per-sample results") - parser.add_argument("--mode", choices=("runner", "agentloop"), default="runner") - parser.add_argument( - "--hf-checkpoint", - default=os.environ.get("HF_CHECKPOINT") or os.environ.get("QWEN3P5_VL_MODEL_PATH"), - help="Tokenizer/processor checkpoint used by agentloop mode.", - ) - return asyncio.run(main_async(parser.parse_args())) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/recipe/tb2_rl/local_run/config.py b/recipe/tb2_rl/local_run/config.py deleted file mode 100644 index b69824c630..0000000000 --- a/recipe/tb2_rl/local_run/config.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Run tb2-rl tasks with the default agent pipeline. - -Invoke via:: - - python -m recipe.tb2_rl.local_run --limit 5 -""" - -from recipe.tb2_rl.local_run.dataset import TB2RLBench -from recipe.tb2_rl.pipeline import runner - - -dataset = TB2RLBench( - jsonl_path="/mnt/shared-storage-user/llmit1/user/liukuikun/delivery/data/tb2_rl_tasks.jsonl", - pipeline=runner, -) diff --git a/recipe/tb2_rl/local_run/dataset.py b/recipe/tb2_rl/local_run/dataset.py deleted file mode 100644 index f4b8f5fea6..0000000000 --- a/recipe/tb2_rl/local_run/dataset.py +++ /dev/null @@ -1,70 +0,0 @@ -"""tb2-rl dataset: read prebuilt JSONL records, yield AgentRolloutItem. - -The JSONL is produced by ``recipe.tb2_rl.scripts.generate_jsonl``. Each record -fully describes a task (id, instruction, tags, ability, pipeline_overrides), -so this module does nothing but field-mapping — no ``task.toml`` parsing. -""" - -from __future__ import annotations - -import json -import logging -from pathlib import Path -from typing import Iterator - -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.runner import Runner -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.schemas import AgentRolloutItem - - -logger = logging.getLogger(__name__) - - -class TB2RLBench: - """tb2-rl dataset iterator. One pipeline (with per-task overrides) for - every task it yields.""" - - name = "tb2-rl" - - def __init__( - self, - jsonl_path: str | Path, - *, - pipeline: Runner | dict, - skip_ids: set[str] | list[str] | None = None, - ): - """ - Args: - jsonl_path (str | Path): JSONL produced by - ``scripts/generate_jsonl.py`` (one record per task). - pipeline: Shared Runner config for every task under this dataset. - skip_ids (set[str] | list[str] | None): Sample ids to exclude. - """ - self.pipeline = pipeline - self.skip_ids = set(skip_ids or ()) - self._records = [ - json.loads(line) - for line in Path(jsonl_path).read_text(encoding="utf-8").splitlines() - if line.strip() - ] - - def iter_tasks(self) -> Iterator[tuple[Path, AgentRolloutItem]]: - for rec in self._records: - if rec["id"] in self.skip_ids: - logger.info("skipping %s (in skip_ids)", rec["id"]) - continue - try: - yield Path(rec["task_dir"]), self.load_task(rec) - except Exception as exc: - logger.warning("skipping %s: %s", rec.get("id"), exc) - - def load_task(self, rec: dict) -> AgentRolloutItem: - return AgentRolloutItem( - id=rec["id"], - data_source=self.name, - ability=rec.get("ability"), - tags=rec.get("tags", []), - instruction=rec["instruction"], - task_root=Path(rec["task_dir"]), - pipeline=self.pipeline, - pipeline_overrides=rec.get("pipeline_overrides", {}), - ) diff --git a/recipe/tb2_rl/pipeline.py b/recipe/tb2_rl/pipeline.py deleted file mode 100644 index f68eff6a22..0000000000 --- a/recipe/tb2_rl/pipeline.py +++ /dev/null @@ -1,319 +0,0 @@ -"""tb2-rl rollout runner config. - -Layout of an upstream tb2-rl task:: - - task.toml — metadata - instruction.md — natural-language task (paths are absolute, e.g. /app/foo.csv) - environment/Dockerfile — baked into the pre-built ``t-data-processing-v1`` image - environment/files/ — data files that the Dockerfile COPYs to /app/ - tests/test.sh — bench-provided verifier entrypoint - tests/test_outputs.py — pytest module - tests/test_requirements.txt - -At runtime we do NOT rebuild the image; we use the pre-built -``t-data-processing-v1`` and seed ``/app`` + ``/tests`` ourselves: - - mirror ``instruction.md`` → ``/app/instruction.md`` - - mirror ``environment/files/*`` → ``/app/`` - - mirror ``tests/*`` → ``/tests/`` (bench's test.sh reads from there) -""" - -from __future__ import annotations - -import os -from pathlib import Path -from types import SimpleNamespace -from typing import Any - -from lagent.serving.sandbox.providers.gateway import GatewayProvider - -from xtuner.v1.rl.agent_loop.sandbox_agent_loop import ( - AgentSpec, - DetachedShellEntry, - DownloadHook, - EntryCapture, - EntryDiagnostics, - EntryFailurePolicy, - EntryMonitor, - EntryProcessHealthCheck, - ExecHook, - InstallLagent, - Judger, - JudgerValidator, - ParseJudgerStdout, - PickAgent, - ReadFileHook, - ReturnCodeFileCompletion, - Runner, - RunAgentInstallDeps, - SandboxHealthCheck, - SandboxPool, - SandboxSpec, - SandboxStage, - ShellEntry, - UploadAgentConfigSource, - UploadChosenAgent, - UploadHook, -) - -HERE = Path(__file__).resolve().parent -SETUP_DIR = HERE / "infer" / "setup" -AGENT_TEMPLATES = HERE / "infer" / "agents" -JUDGERS = HERE / "judgers" - - -# ───────────────────────────────────────────────────────────────── -# Sandbox runtime paths -# ───────────────────────────────────────────────────────────────── - -PATHS = SimpleNamespace( - setup_dir="/tmp/infer/setup", - judger_dir="/tmp/judgers/rule_grader", - agent_config="/tmp/agent_config.py", - agent_sock="/tmp/lagent_agent.sock", - agent_daemon_log="/tmp/agent_daemon.log", - agent_daemon_pid="/tmp/agent_daemon.pid", - agent_response="/tmp/agent_response.txt", - trajectory="/tmp/trajectory.json", - message="/tmp/message.json", - tests="/tests", -) - -SHARED_LAGENT_PYTHON = os.getenv( - "LAGENT_PYTHON", - "/mnt/llm-ai-infra/miniconda3/envs/train/bin/python", -) -LAGENT_PYTHONPATH = "/app:/tmp:${PYTHONPATH:-}" - -START_AGENT_DAEMON = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli start-agent-daemon " - f"--mode agent --config {PATHS.agent_config} --sock {PATHS.agent_sock} " - f"--pid-file {PATHS.agent_daemon_pid} --log {PATHS.agent_daemon_log} --truncate-log" -) -WAIT_AGENT_DAEMON = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli wait-ready " - f"--sock {PATHS.agent_sock} --pid-file {PATHS.agent_daemon_pid} " - f"--log {PATHS.agent_daemon_log} --timeout 60" -) -AGENT_CHAT = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli chat " - f"--sock {PATHS.agent_sock} --instruction-file \"/app/instruction.md\" " - f"--response-out {PATHS.agent_response} --log {PATHS.agent_daemon_log}" -) -AGENT_STATE_DICT = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli state-dict " - f"--sock {PATHS.agent_sock} --trajectory-out {PATHS.trajectory} " - f"--log {PATHS.agent_daemon_log}" -) -AGENT_GET_MESSAGES = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli get-messages " - f"--sock {PATHS.agent_sock} --message-out {PATHS.message} " - f"--log {PATHS.agent_daemon_log}" -) -STOP_AGENT_DAEMON = ( - f'PYTHONPATH="{LAGENT_PYTHONPATH}" "{SHARED_LAGENT_PYTHON}" ' - f"-m lagent.serving.sandbox.client_cli shutdown " - f"--sock {PATHS.agent_sock} --pid-file {PATHS.agent_daemon_pid} " - f"--log {PATHS.agent_daemon_log}" -) - - -def entry_failure(*, include_entry_output: bool = False) -> dict[str, Any]: - files = [dict(path=PATHS.agent_daemon_log, key="daemon_log", optional=True)] - if include_entry_output: - files.extend( - [ - dict(entry_file="stdout", key="entry_stdout", optional=True), - dict(entry_file="stderr", key="entry_stderr", optional=True), - ] - ) - return dict( - type=EntryFailurePolicy, - diagnostics=dict(type=EntryDiagnostics, files=files), - diagnostic_error_policy="preserve_entry_error", - ) - - -# ───────────────────────────────────────────────────────────────── -# Defaults -# ───────────────────────────────────────────────────────────────── - -DEFAULT_WORKSPACE = "/app" -DEFAULT_AGENTS: list[dict[str, Any]] = [ - dict( - type=AgentSpec, - name="interndp", - config="config.py", - install="install-deps.sh", - tools="tools", - weight=1.0, - ) -] - -DEFAULT_SANDBOX = dict( - type=SandboxSpec, - image="t-data-processing-v1", - ttl_seconds=11700, - key=os.getenv("SANDBOX_PROVIDER_KEY", "lkk-as8dHd2Q"), - workspace_path=DEFAULT_WORKSPACE, -) -DEFAULT_PROVIDER = { - "type": GatewayProvider, - "gateway_url": os.getenv("SANDBOX_GATEWAY_URL", "http://env-gateway.ailab.ailab.ai"), -} - -runner = dict( - type=Runner, - pool=dict( - type=SandboxPool, - provider=DEFAULT_PROVIDER, - specs={"main": DEFAULT_SANDBOX}, - ), - infer=dict( - type=SandboxStage, - sandbox="main", - pre=[ - # ── Stage 1: workspace setup ────────────────────────────────── - # Create the workspace dir, upload bench-level setup scripts, - # then run pre_entry.sh for any per-bench bootstrap. - dict(type=ExecHook, cmd=f"mkdir -p {DEFAULT_WORKSPACE}"), - dict( - type=UploadHook, - mappings=[ - dict(base=str(SETUP_DIR), source="*", target=PATHS.setup_dir + "/", flatten=True), - ], - ), - dict( - type=ExecHook, - cmd=f"bash {PATHS.setup_dir}/pre_entry.sh", - env={"TASK_WORKSPACE": DEFAULT_WORKSPACE}, - timeout=300, - ), - - # ── Stage 2: task data ──────────────────────────────────────── - # Place per-task files (instruction + environment/files/*) under - # the workspace so the agent sees them. - dict(type=UploadHook, mappings=[dict(source="instruction.md", target=f"{DEFAULT_WORKSPACE}/instruction.md")]), - dict( - type=UploadHook, - mappings=[dict(base="environment/files", source="**/*", target=f"{DEFAULT_WORKSPACE}/")], - ), - - # ── Stage 3: agent harness ──────────────────────────────────── - # Install lagent runtime, pick an agent variant, upload its - # harness + config, run its install-deps. - dict(type=InstallLagent, lagent_src_dir=os.getenv("LAGENT_SRC_DIR", "/mnt/shared-storage-user/llmit/user/liukuikun/workspace/lagent")), - dict(type=PickAgent, agents=DEFAULT_AGENTS, template_root=str(AGENT_TEMPLATES)), - dict(type=UploadChosenAgent, target_dir=f"{DEFAULT_WORKSPACE}/agent/"), - dict(type=UploadAgentConfigSource, dst=PATHS.agent_config), - dict(type=RunAgentInstallDeps, workspace=DEFAULT_WORKSPACE), - ], - entries=[ - dict( - type=ShellEntry, - name="start_agent_daemon", - cmd=START_AGENT_DAEMON, - timeout=60, - failure=entry_failure(), - env={ - "RL_LLM_MODEL": os.environ.get('RL_LLM_MODEL', ""), - } - ), - dict( - type=ShellEntry, - name="wait_agent_daemon", - cmd=WAIT_AGENT_DAEMON, - timeout=90, - failure=entry_failure(), - ), - dict( - type=DetachedShellEntry, - name="agent_chat", - cmd=AGENT_CHAT, - capture=dict(type=EntryCapture, root="/tmp", prefix="xt_entry"), - monitor=dict( - type=EntryMonitor, - timeout=7200, - probes=[ - dict(type=ReturnCodeFileCompletion, interval_sec=2.0), - dict(type=SandboxHealthCheck, interval_sec=10.0, probe_timeout_sec=10.0, fail_after=3), - dict(type=EntryProcessHealthCheck, interval_sec=10.0, probe_timeout_sec=10.0, fail_after=2), - ], - ), - failure=entry_failure(include_entry_output=True), - ), - # dict( - # type=ShellEntry, - # name="agent_state_dict", - # cmd=AGENT_STATE_DICT, - # timeout=300, - # failure=entry_failure(), - # ), - dict( - type=ShellEntry, - name="agent_get_messages", - cmd=AGENT_GET_MESSAGES, - timeout=300, - failure=entry_failure(), - ), - # `|| true` 让 stop 失败不污染 stage status —— sandbox 一会儿 - # 也会被 pool.release_all 释放,daemon 自然死。debug 想保留 - # daemon 的话注释掉这条 entry 即可。 - dict( - type=ShellEntry, - name="stop_agent_daemon", - cmd=STOP_AGENT_DAEMON + " || true", - timeout=30, - ), - ], - post=[ - dict(type=ReadFileHook, path=PATHS.message, key="message"), - dict(type=ReadFileHook, path=PATHS.agent_response, key="agent_response"), - # workspace tar(debug 时本地解压看产物,失败 silent + warning) - # dict(type=DownloadHook, paths=[DEFAULT_WORKSPACE]), - ], - ), - validate=dict( - type=JudgerValidator, - judgers=[ - dict( - type=Judger, - name="rule_grader", - weight=1.0, - stage=dict( - type=SandboxStage, - sandbox="main", - pre=[ - dict( - type=UploadHook, - mappings=[ - dict(base=str(JUDGERS / "rule_grader"), source="*", target=PATHS.judger_dir + "/", flatten=True), - dict(base="tests", source="**/*", target=f"{PATHS.tests}/"), - ], - ), - dict(type=ExecHook, cmd=f"chmod +x {PATHS.tests}/test.sh || true", optional=True), - ], - entries=[ - dict( - type=ShellEntry, - name="run_tests", - cmd=f"bash {PATHS.judger_dir}/run.sh", - env={ - "JUDGER_NAME": "rule_grader", - "TASK_WORKSPACE": DEFAULT_WORKSPACE, - "TESTS_DIR": PATHS.tests, - "JUDGER_DIR": PATHS.judger_dir, - }, - timeout=900, - ) - ], - post=[dict(type=ParseJudgerStdout, judger_name="rule_grader")], - ), - ) - ], - ), -) diff --git a/recipe/tb2_rl/scripts/generate_jsonl.py b/recipe/tb2_rl/scripts/generate_jsonl.py deleted file mode 100644 index 875da9cb77..0000000000 --- a/recipe/tb2_rl/scripts/generate_jsonl.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -"""Scan tb2-rl ``tasks/`` tree and emit one JSONL record per task directory. - -Each record fully describes a task — downstream datasets do nothing but -field-mapping; ``task.toml`` parsing happens here only. - -Record fields: - task_dir absolute path to the task directory - id sample id (task.toml `id` or dir name) - instruction relative path to the instruction file - tags list[str] - ability domain / category - pipeline_overrides dict to deep-merge into Runner._pool config -""" - -from __future__ import annotations - -import argparse -import json -import sys -import tomllib -from pathlib import Path -from typing import Any - - -image_mapping = { - "data_processing": "t-data-processing-v1", - "data_querying": "t-data-querying-v1", - "data_science": "t-data-science-v1", - "debugging": "t-debugging-v1", - "dependency_management": "t-dependency-management-v1", - "file_operations": "t-file-operations-v1", - "scientific_computing": "t-scientific-computing-v1", - "security": "t-security-v1", - "software_engineering": "t-data-science-v1", - "system_administration": "t-data-science-v1", -} - - -def iter_task_dirs(tasks_root: Path) -> list[Path]: - """Return sorted task directories (parents of ``task.toml``).""" - roots: list[Path] = [] - for toml in sorted(tasks_root.rglob("task.toml")): - parent = toml.parent - try: - parent.relative_to(tasks_root) - except ValueError: - continue - roots.append(parent) - return roots - - -def _load_task_toml(path: Path) -> dict: - raw = tomllib.loads(path.read_text(encoding="utf-8")) - if "task" in raw and isinstance(raw["task"], dict): - for k, v in raw.pop("task").items(): - raw.setdefault(k, v) - return raw - - -def build_record(task_dir: Path, tasks_root: Path) -> dict[str, Any]: - toml = _load_task_toml(task_dir / "task.toml") - tags = list((toml.get("metadata") or {}).get("tags") or toml.get("tags") or []) - rel = task_dir.relative_to(tasks_root).as_posix() - image = image_mapping[rel.split("/")[0]] - return { - "task_dir": str(task_dir.resolve()), - "id": toml.get("id") or task_dir.name, - "instruction": "instruction.md", - "tags": tags, - "ability": toml.get("domain") or (tags[1] if len(tags) > 1 else None), - "pipeline_overrides": {"pool": {"specs": {"main": {"image": image}}}}, - } - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) - parser.add_argument( - "--tasks-root", - type=Path, - default=Path("/mnt/shared-storage-user/llmit1/user/liukuikun/delivery/data/terminalbench2_rl_data"), - help="Absolute path to the bench tasks directory.", - ) - parser.add_argument( - "-o", - "--output", - type=Path, - default=None, - help="Write JSONL here. Defaults to /../tb2_rl_tasks.jsonl", - ) - args = parser.parse_args() - - tasks_root = args.tasks_root.resolve() - if not tasks_root.is_dir(): - print(f"tasks root is not a directory: {tasks_root}", file=sys.stderr) - return 1 - - output = args.output or (tasks_root.parent / "tb2_rl_tasks.jsonl") - skipped = 0 - with open(output, "w", encoding="utf-8") as fp: - for task_dir in iter_task_dirs(tasks_root): - try: - rec = build_record(task_dir, tasks_root) - except Exception as exc: - print(f"warning: skipping {task_dir}: {exc}", file=sys.stderr) - skipped += 1 - continue - fp.write(json.dumps(rec, ensure_ascii=False) + "\n") - print(f"wrote: {output}" + (f" ({skipped} skipped)" if skipped else "")) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/recipe/tb2_rl/xtuner_dataset.py b/recipe/tb2_rl/xtuner_dataset.py deleted file mode 100644 index 0c20230c60..0000000000 --- a/recipe/tb2_rl/xtuner_dataset.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -"""tb2-rl XTuner training tokenize function. - -Reads jsonl records produced by ``recipe.tb2_rl.scripts.generate_jsonl`` -and emits ``RolloutState`` for the trainer. The per-task -``AgentRolloutItem`` (carrying pipeline + per-task overrides) lives in -``RolloutState.extra_fields["rollout_item"]`` for the runner to consume at -rollout time. -""" - -from pathlib import Path - -from pydantic import BaseModel, ConfigDict -from transformers import PreTrainedTokenizer - -from xtuner.v1.data_proto.rl_data import RolloutState -from xtuner.v1.datasets.data_item import CacheItem -from xtuner.v1.datasets.utils import CachableTokenizeFunction -from xtuner.v1.rl.agent_loop.sandbox_agent_loop.schemas import AgentRolloutItem -from xtuner.v1.utils import get_logger - -logger = get_logger() - - -class RLTB2RLTokenizeFn(CachableTokenizeFunction[RolloutState]): - """tb2-rl tokenize function aligned with ``RLTextTokenizeFn``.""" - - DATA_SOURCE_NAME = "tb2-rl" - PIPELINE_DOTTED = "recipe.tb2_rl.pipeline.runner" - - def __init__( - self, - tokenizer: PreTrainedTokenizer, - max_length: int | None = None, - tools_schema: list | None = None, - data_judger_mapping: dict | None = None, - system_prompt: str | None = None, - ): - super().__init__(tokenizer) - self.max_length = max_length - self.tools_schema = tools_schema if tools_schema is not None else [] - self.data_judger_mapping = data_judger_mapping - self.system_prompt = system_prompt - - def __call__(self, item: dict, **kwargs) -> RolloutState | CacheItem: - task_dir = Path(item["task_dir"]) - instruction_path = task_dir / item["instruction"] - context = instruction_path.read_text(encoding="utf-8") - - message = [{"role": "user", "content": context}] - if self.system_prompt: - message = [{"role": "system", "content": self.system_prompt}] + message - - raw_prompt = self.tokenizer.apply_chat_template( - message, tools=self.tools_schema, add_generation_prompt=True, tokenize=False - ) - data = self.tokenizer(raw_prompt, add_special_tokens=False) - prompt_token_ids = data["input_ids"] - num_tokens = len(prompt_token_ids) - - if self.state == "cache": - if self.max_length is not None and num_tokens > self.max_length: - num_tokens = 0 # filtered out by the dataset filter - return CacheItem( - num_tokens=num_tokens, - proxy_attn_flops=float(num_tokens), - ) - - if self.max_length is not None: - assert num_tokens <= self.max_length, f"num_tokens {num_tokens} > max_length {self.max_length}" - - if self.data_judger_mapping is not None: - data_source = self.data_judger_mapping.get(self.DATA_SOURCE_NAME) - else: - data_source = {self.DATA_SOURCE_NAME: 1.0} - - rollout_item = AgentRolloutItem( - id=item["id"], - data_source=self.DATA_SOURCE_NAME, - ability=item.get("ability"), - tags=item.get("tags", []), - instruction=item["instruction"], - task_root=task_dir, - pipeline=self.PIPELINE_DOTTED, - pipeline_overrides=item.get("pipeline_overrides", {}), - ) - - return RolloutState( - prompt_ids=prompt_token_ids, - message=message, - reward_model={"style": self.DATA_SOURCE_NAME}, - num_tokens=num_tokens, - proxy_attn_flops=float(num_tokens), - data_source=data_source, - extra_fields={ - "rollout_item": rollout_item, - "ability": item.get("ability"), - }, - ) - - def hash(self) -> str: - return type(self).__name__ - - -class RLTB2RLTokenizeFnConfig(BaseModel): - model_config = ConfigDict(title="tb2-rl RL dataset config for xtuner", extra="forbid") - max_length: int | None = None - tools_schema: list | None = None - data_judger_mapping: dict | None = None - system_prompt: str | None = None - - def build(self, tokenizer: PreTrainedTokenizer, **kwargs) -> RLTB2RLTokenizeFn: - return RLTB2RLTokenizeFn( - tokenizer=tokenizer, - max_length=self.max_length, - tools_schema=self.tools_schema, - data_judger_mapping=self.data_judger_mapping, - system_prompt=self.system_prompt, - )