diff --git a/.gitignore b/.gitignore index f42cda4d9..7c2046ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -58,4 +58,5 @@ k8s/helm/nexent/Chart.lock MAC_DEVELOPMENT_GUIDE.md # Mac本地开发数据持久化(无需提交) data/ +sdk/benchmark/.env /docker/.env.bak diff --git a/backend/agents/create_agent_info.py b/backend/agents/create_agent_info.py index b38b17e56..fe3f1a524 100644 --- a/backend/agents/create_agent_info.py +++ b/backend/agents/create_agent_info.py @@ -31,6 +31,7 @@ from utils.model_name_utils import add_repo_to_name from utils.prompt_template_utils import get_agent_prompt_template from utils.config_utils import tenant_config_manager, get_model_name_from_config +from utils.context_utils import build_context_components from consts.const import LOCAL_MCP_SERVER, MODEL_CONFIG_MAPPING, LANGUAGE, DATA_PROCESS_SERVICE, MINIO_DEFAULT_BUCKET from consts.exceptions import ValidationError @@ -412,6 +413,9 @@ async def create_agent_config( # Get skills list for prompt template skills = _get_skills_for_template(agent_id, tenant_id, version_no) + time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + is_manager = len(managed_agents) > 0 or len(external_a2a_agents) > 0 + render_kwargs = { "duty": duty_prompt, "constraint": constraint_prompt, @@ -424,11 +428,30 @@ async def create_agent_config( "APP_DESCRIPTION": app_description, "memory_list": memory_list, "knowledge_base_summary": knowledge_base_summary, - "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "time": time_str, "user_id": user_id, } system_prompt = Template(prompt_template["system_prompt"], undefined=StrictUndefined).render(render_kwargs) + context_components = build_context_components( + duty=duty_prompt, + constraint=constraint_prompt, + few_shots=few_shots_prompt, + app_name=app_name, + app_description=app_description, + time_str=time_str, + user_id=user_id, + language=language, + is_manager=is_manager, + tools=render_kwargs["tools"], + skills=skills, + managed_agents=render_kwargs["managed_agents"], + 
external_a2a_agents=render_kwargs["external_a2a_agents"], + memory_list=memory_list, + memory_search_query=last_user_query, + knowledge_base_summary=knowledge_base_summary, + ) + model_id_to_use = override_model_id if override_model_id else agent_info.get("model_id") model_max_tokens = 10000 if model_id_to_use is not None: @@ -459,7 +482,8 @@ async def create_agent_config( provide_run_summary=agent_info.get("provide_run_summary", False), managed_agents=managed_agents, external_a2a_agents=external_a2a_agents, - context_manager_config=cm_config + context_manager_config=cm_config, + context_components=context_components, ) return agent_config diff --git a/backend/utils/context_utils.py b/backend/utils/context_utils.py new file mode 100644 index 000000000..740bf66df --- /dev/null +++ b/backend/utils/context_utils.py @@ -0,0 +1,1332 @@ +"""Context component building utilities for system prompt assembly. + +Provides build_context_components() to convert agent configuration data +into ContextComponent instances for use with ContextManager. + +This module implements the piecewise component architecture where each +semantic section of the system prompt is emitted by a dedicated function, +allowing ContextManager to assemble them in the correct order. 
+""" + +from datetime import datetime +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +if TYPE_CHECKING: + from nexent.core.agents.agent_model import ( + ContextComponent, + ToolsComponent, + SkillsComponent, + MemoryComponent, + KnowledgeBaseComponent, + ManagedAgentsComponent, + ExternalAgentsComponent, + SystemPromptComponent, + ToolConfig, + AgentConfig, + ExternalA2AAgentConfig, + ) + + +# ============================================================================= +# SECTION 1: Long-text format functions (expanded from Jinja2 templates) +# Each function accepts language and is_manager params for variant-specific text +# ============================================================================= + + +def _format_memory_context( + memory_list: List[Any], + language: str = "zh", +) -> str: + """Format memory search results with full usage guidelines. + + Jinja2 templates have ~30 lines of "记忆使用准则" text that must be + included here for semantic equivalence. + """ + if not memory_list: + return "" + + # Group memories by level in correct order: tenant, user_agent, user, agent + level_order = ["tenant", "user_agent", "user", "agent"] + memory_by_level: Dict[str, List[Any]] = {} + for mem in memory_list: + if isinstance(mem, dict): + level = mem.get("memory_level", "user") + if level not in memory_by_level: + memory_by_level[level] = [] + memory_by_level[level].append(mem) + + lines = [] + + if language == "zh": + lines.append("### 上下文记忆") + lines.append("基于之前的交互记录,以下是按作用域和重要程度排序的最相关记忆:") + lines.append("") + + for level in level_order: + if level in memory_by_level: + level_title = { + "tenant": "Tenant", + "user_agent": "User_agent", + "user": "User", + "agent": "Agent", + }.get(level, level.title()) + lines.append(f"**{level_title} 层级记忆:**") + for item in memory_by_level[level]: + content = item.get("memory", "") or item.get("content", "") + score = item.get("score", 0.0) + lines.append(f"- {content} `({score:.2f})`") + lines.append("") + + 
lines.append("**记忆使用准则:**") + lines.append("1. **冲突处理优先级**:当记忆信息存在矛盾时,严格按以下顺序处理:") + lines.append("- **最优先**:在上述列表中位置靠前的记忆具有优先权") + lines.append("- **次优先**:当前对话内容与记忆直接冲突时,以当前对话为准") + lines.append("- **次优先**:相关度分数越高,表示记忆越可信") + lines.append("") + lines.append("2. **记忆整合最佳实践**:") + lines.append(" - 自然地将相关记忆融入回答中,避免显式使用\"根据记忆\"、\"根据上下文\"或\"根据交互记忆\"等语言") + lines.append(" - 利用记忆信息调整回答的语调、方式和技术深度以适应用户") + lines.append(" - 让记忆指导您对用户偏好和上下文的理解") + lines.append("") + lines.append("3. **级别特定说明**:") + lines.append(" - **tenant(租户级)**:组织层面的约束和政策(不可违背)") + lines.append(" - **user_agent(用户-代理级)**:特定用户在代理中的交互模式和既定工作流程") + lines.append(" - **user(用户级)**:用户的个人偏好、技能水平和历史上下文") + lines.append(" - **agent(代理级)**:您的既定行为模式和能力特征,通常对所有用户共享(重要性最低)") + else: + lines.append("### Contextual Memory") + lines.append("Based on previous interactions, here are the most relevant memories organized by scope and importance:") + lines.append("") + + for level in level_order: + if level in memory_by_level: + lines.append(f"**{level.title()} Level Memory:**") + for item in memory_by_level[level]: + content = item.get("memory", "") or item.get("content", "") + score = item.get("score", 0.0) + lines.append(f"- {content} `({score:.2f})`") + lines.append("") + + lines.append("**Memory Usage Guidelines:**") + lines.append("1. **Conflict Resolution Priority**: When memories contradict each other, follow this strict order:") + lines.append(" - **Primary**: Information appearing EARLIER in the above numbered list takes precedence") + lines.append(" - **Secondary**: Current conversation context overrides historical memory when directly contradicted") + lines.append(" - **Tertiary**: Higher relevance scores indicate more trustworthy information") + lines.append("") + lines.append("2. 
**Memory Integration Best Practices**:") + lines.append(" - Seamlessly weave relevant memories into your responses without explicitly saying \"I remember\", \"based on memory\" or \"based on context\"") + lines.append(" - Use memories to inform your tone, approach, and technical level appropriate for this user") + lines.append(" - Let memories guide your assumptions about user preferences and context") + lines.append("") + lines.append("3. **Level-Specific Considerations**:") + lines.append(" - **tenant**: Organizational constraints and policies (non-negotiable)") + lines.append(" - **user_agent**: Specific interaction dynamics and established workflow patterns") + lines.append(" - **user**: Individual preferences, skills, and historical context") + lines.append(" - **agent**: Your established behavioral patterns and capabilities, usually shared by all users (least important)") + + return "\n".join(lines) + + +def _format_skills_description( + skills: List[Dict[str, str]], + language: str = "zh", +) -> str: + """Format skill descriptions with full 6-step usage process. + + Jinja2 templates have ~50 lines of "技能使用流程" text that must be + included here for semantic equivalence. + """ + if not skills: + return "" + + lines = [] + + # Build the block + skills_block_lines = [""] + for skill in skills: + name = skill.get("name", "") + desc = skill.get("description", "") + skills_block_lines.append(" ") + skills_block_lines.append(f" {name}") + skills_block_lines.append(f" {desc}") + skills_block_lines.append(" ") + skills_block_lines.append("") + skills_block = "\n".join(skills_block_lines) + + if language == "zh": + lines.append("### 可用技能") + lines.append("") + lines.append("你拥有以下技能(Skills)。技能是预定义的专业能力模块,包含详细执行指南和可选的附加脚本。") + lines.append("") + lines.append(skills_block) + lines.append("") + lines.append("**技能使用流程**:") + lines.append("1. 收到用户请求后,首先审视 `` 中每个技能的 description,判断是否有匹配的技能。") + lines.append("2. 
**加载技能**:根据不同场景选择读取方式:") + lines.append(" - **首次加载**:调用 `read_skill_md(\"skill_name\")` 读取技能的完整执行指南(默认读取 SKILL.md)") + lines.append(" - **精确读取**:如只需特定文件(如示例、参考文档),可指定 additional_files:") + lines.append(" ") + lines.append(" skill_content = read_skill_md(\"skill_name\", [\"examples.md\", \"reference/api_doc\"])") + lines.append(" print(skill_content)") + lines.append(" ") + lines.append(" 注意:当 additional_files 非空时,默认不再自动读取 SKILL.md,如需同时读取请显式指定。") + lines.append("") + lines.append(" - **加载技能配置**:如果技能需要读取配置变量,可先调用 `read_skill_config(\"skill_name\")` 读取配置字符串,通过 `json.loads` 方法转化为配置字典,再从中获取所需值:") + lines.append(" ") + lines.append(" import json") + lines.append(" config = json.loads(read_skill_config(\"skill_name\"))") + lines.append(" # 返回示例: {\"key_a\": {\"key2\": \"value2\"}, \"others\": {...}}") + lines.append(" value = config[\"key1\"][\"key2\"]") + lines.append(" print(value)") + lines.append(" ") + lines.append("") + lines.append("3. **遵循技能指南**:技能内容注入后,严格按其中的步骤执行。不要跳过技能指南中的步骤,也不要用自行编写的代码替代技能定义的流程。") + lines.append("") + lines.append("4. **执行技能脚本**:如果技能指南中引用了附加脚本(形如 ``),使用以下格式调用:") + lines.append(" 代码:") + lines.append(" ") + lines.append(" result = run_skill_script(\"skill_name\", \"script_path\")") + lines.append(" print(result)") + lines.append(" ") + lines.append(" 对于需要附加参数的脚本,需要参照脚本调用说明,将参数直接以字符串形式传递。") + lines.append(" 例如对于希望附加的参数:--param1 value1 --flag,则使用以下格式调用run_skill_script:") + lines.append(" ") + lines.append(" result = run_skill_script(\"skill_name\", \"script_path\", \"--param1 value1 --flag\")") + lines.append(" print(result)") + lines.append(" ") + lines.append(" 注意:只执行技能指南中明确声明的脚本路径,绝不自行构造脚本路径。") + lines.append("") + lines.append("5. **整合输出**:根据技能指南要求的输出格式,结合脚本执行结果生成最终回答。") + lines.append("") + lines.append("6. 
**引用场景处理**:当技能内容中出现引用标记或需要引用其他文件时,需要识别并再次调用 read_skill_md:") + lines.append(" - **引用模板识别**:注意技能内容中形如 `` 或自然语言式的引用声明(如\"详见 examples.md\"、\"请参考 reference/api_doc\")") + lines.append(" - **自动补全**:发现引用后,尝试读取被引用的文件获取更多信息") + lines.append(" - **示例**:") + lines.append(" ") + lines.append(" # 技能内容提示\"请参考 examples.md 获取详细示例\"") + lines.append(" additional_info = read_skill_md(\"skill_name\", [\"examples.md\"])") + lines.append(" print(additional_info)") + lines.append(" ") + else: + lines.append("### Available Skills") + lines.append("") + lines.append("You have the following Skills. Skills are predefined professional capability modules with detailed execution guides and optional additional scripts.") + lines.append("") + lines.append(skills_block) + lines.append("") + lines.append("**Skill Usage Process**:") + lines.append("1. After receiving a user request, first examine the description of each skill in `` to determine if there is a matching skill.") + lines.append("2. **Load Skill**: Choose the appropriate reading method based on the scenario:") + lines.append(" - **First-time load**: Call `read_skill_md(\"skill_name\")` to read the complete execution guide (defaults to reading SKILL.md)") + lines.append(" - **Precise read**: If you only need specific files (like examples, reference docs), specify additional_files:") + lines.append(" ") + lines.append(" skill_content = read_skill_md(\"skill_name\", [\"examples.md\", \"reference/api_doc\"])") + lines.append(" print(skill_content)") + lines.append(" ") + lines.append(" Note: When additional_files is non-empty, SKILL.md is no longer auto-read. 
If you need both, explicitly specify it.") + lines.append("") + lines.append(" - **Load skill config**: If the skill needs configuration variables, call `read_skill_config(\"skill_name\")` to read the config string, convert to dict via `json.loads`, then access values:") + lines.append(" ") + lines.append(" import json") + lines.append(" config = json.loads(read_skill_config(\"skill_name\"))") + lines.append(" # Example: {\"key_a\": {\"key2\": \"value2\"}, \"others\": {...}}") + lines.append(" value = config[\"key1\"][\"key2\"]") + lines.append(" print(value)") + lines.append(" ") + lines.append("") + lines.append("3. **Follow Skill Guide**: After skill content is injected, strictly follow its steps. Do not skip steps or replace with your own code.") + lines.append("") + lines.append("4. **Execute Skill Script**: If the skill guide references additional scripts (like ``), call:") + lines.append(" ") + lines.append(" result = run_skill_script(\"skill_name\", \"script_path\")") + lines.append(" print(result)") + lines.append(" ") + lines.append(" For scripts needing extra params, pass them as a command-line string per the script's calling instructions.") + lines.append(" Example for --param1 value1 --flag:") + lines.append(" ") + lines.append(" result = run_skill_script(\"skill_name\", \"script_path\", \"--param1 value1 --flag\")") + lines.append(" print(result)") + lines.append(" ") + lines.append(" Note: Only execute script paths explicitly declared in the skill guide. Never construct paths yourself.") + lines.append("") + lines.append("5. **Integrate Output**: Generate the final answer based on the skill guide's output format and script execution results.") + lines.append("") + lines.append("6. 
**Handle References**: When the skill content has reference markers or needs to reference other files, identify and call read_skill_md again:") + lines.append(" - **Reference template recognition**: Look for patterns like `` or natural-language references (\"see examples.md\", \"refer to reference/api_doc\")") + lines.append(" - **Auto-complete**: After discovering a reference, try reading the referenced file for more info") + lines.append(" - **Example**:") + lines.append(" ") + lines.append(" # Skill content says \"see examples.md for detailed examples\"") + lines.append(" additional_info = read_skill_md(\"skill_name\", [\"examples.md\"])") + lines.append(" print(additional_info)") + lines.append(" ") + + return "\n".join(lines) + + +def _format_tools_description( + tools: Dict[str, Any], + knowledge_base_summary: Optional[str] = None, + language: str = "zh", + is_manager: bool = True, +) -> str: + """Format tool descriptions with file URL usage guide. + + Jinja2 templates have ~10 lines of "文件链接使用指南" text that must be + included here for semantic equivalence. + + Note: Managed agents use different presigned_url guidance than manager agents. 
+ """ + if not tools: + no_tools_msg = "- 当前没有可用的工具" if language == "zh" else "- No tools are currently available" + return no_tools_msg + + lines = [] + + if language == "zh": + lines.append("- 你只能使用以下工具,不得使用任何其他工具:") + else: + lines.append("- You can only use the following tools and may not use any other tools:") + + for name, tool in tools.items(): + if hasattr(tool, 'description'): + desc = tool.description + inputs = tool.inputs + output_type = tool.output_type + source = getattr(tool, 'source', 'local') + else: + desc = tool.get('description', '') + inputs = tool.get('inputs', '') + output_type = tool.get('output_type', '') + source = tool.get('source', 'local') + + # MCP tools have [MCP] prefix + if source == 'mcp': + if language == "zh": + lines.append(f"- [MCP] {name}: {desc}") + lines.append(f" 接受输入: {inputs}") + lines.append(f" 返回输出类型: {output_type}") + else: + lines.append(f"- [MCP] {name}: {desc}") + lines.append(f" Accepts input: {inputs}") + lines.append(f" Returns output type: {output_type}") + else: + if language == "zh": + lines.append(f"- {name}: {desc}") + lines.append(f" 接受输入: {inputs}") + lines.append(f" 返回输出类型: {output_type}") + else: + lines.append(f"- {name}: {desc}") + lines.append(f" Accepts input: {inputs}") + lines.append(f" Returns output type: {output_type}") + + # Knowledge base summary + if knowledge_base_summary: + if language == "zh": + lines.append("- knowledge_base_search工具只能使用以下知识库索引,请根据用户问题选择最相关的一个或多个知识库索引:") + lines.append(f" {knowledge_base_summary}") + else: + lines.append("- knowledge_base_search tool can only use the following knowledge base indexes, please select the most relevant one or more knowledge base indexes based on the user's question:") + lines.append(f" {knowledge_base_summary}") + + # File URL usage guide + lines.append("") + if language == "zh": + lines.append("### 文件链接使用指南") + lines.append("当处理用户上传的文件时,请根据工具类型选择正确的 URL:") + lines.append("1. 
**调用标记为 [MCP] 的工具**(外部工具,运行在 Nexent 之外):") + if is_manager: + lines.append(" → 使用 **Download URL**(格式:`https://minio.example.com/...?token=xxx`)") + lines.append(" 原因:MCP 工具运行在外部服务,无法访问内部 S3 存储") + else: + lines.append(" → 使用 **presigned_url**(已包含代理前缀,格式:`http://.../api/nb/v1/file/fetch?presigned_url=...`)") + lines.append(" 直接使用用户上传文件信息中提供的 **presigned_url** 字段,无需拼接。") + lines.append("2. **调用其他所有工具**(内部工具,如 analyze_text_file、analyze_image 等):") + lines.append(" → 使用 **S3 URL**(格式:`s3:/nexent/attachments/xxx.pdf`)") + lines.append(" 原因:内部工具运行在 Nexent 内部,可以直接访问 MinIO 存储") + else: + lines.append("### File URL Usage Guide") + lines.append("When processing user-uploaded files, choose the correct URL based on tool type:") + lines.append("1. **Calling tools marked with [MCP]** (external tools that run outside Nexent):") + if is_manager: + lines.append(" → Use **Download URL** (format: `https://minio.example.com/...?token=xxx`)") + lines.append(" Reason: MCP tools run on external services and cannot access internal S3 storage") + else: + lines.append(" → Use **presigned_url** (already includes proxy prefix, format: `http://.../api/nb/v1/file/fetch?presigned_url=...`)") + lines.append(" Directly use the **presigned_url** field provided in the user's uploaded file info. No need to construct or append anything.") + lines.append("2. **Calling all other tools** (internal tools like analyze_text_file, analyze_image):") + lines.append(" → Use **S3 URL** (format: `s3:/nexent/attachments/xxx.pdf`)") + lines.append(" Reason: Internal tools run inside Nexent and can directly access MinIO storage") + + return "\n".join(lines) + + +def _format_managed_agents_description( + managed_agents: Dict[str, Any], + language: str = "zh", +) -> str: + """Format managed sub-agent descriptions with calling specifications. + + Jinja2 templates have ~15 lines of "内部助手调用规范" text that must be + included here for semantic equivalence. 
+ """ + if not managed_agents: + return "" + + lines = [] + + if language == "zh": + lines.append("你可以使用以下内部助手(通过函数调用方式协作):") + for name, agent in managed_agents.items(): + desc = agent.description if hasattr(agent, 'description') else agent.get('description', '') + lines.append(f" - {name}: {desc}") + lines.append("") + lines.append("内部助手调用规范:") + lines.append(" 1. 调用方式:") + lines.append(" - 接受输入:{\"task\": {\"type\": \"string\", \"description\": \"任务描述\"}}") + lines.append(" - 返回输出类型:{\"type\": \"string\", \"description\": \"执行结果\"}") + lines.append(" 2. 使用策略:") + lines.append(" - 任务分解:单次调用中不要让助手一次做过多的事情,任务拆分是你的工作,你需要将复杂任务分解为可管理的子任务") + lines.append(" - 专业匹配:根据助手的专长分配任务") + lines.append(" - 信息整合:整合不同助手的输出生成连贯解决方案") + lines.append(" - 效率优化:避免重复工作") + lines.append(" 3. 协作要求:") + lines.append(" - 评估助手返回的结果") + lines.append(" - 必要时提供额外指导或重新分配任务") + lines.append(" - 在助手结果基础上进行工作,避免重复工作") + lines.append(" - 注意保留子助手回答中的特殊符号,如索引溯源信息等") + else: + lines.append("You can use the following internal agents (via function calls):") + for name, agent in managed_agents.items(): + desc = agent.description if hasattr(agent, 'description') else agent.get('description', '') + lines.append(f" - {name}: {desc}") + lines.append("") + lines.append("Internal agent calling specifications:") + lines.append(" 1. Calling method:") + lines.append(" - Accepts input: {\"task\": {\"type\": \"string\", \"description\": \"task description\"}}") + lines.append(" - Returns output type: {\"type\": \"string\", \"description\": \"execution result\"}") + lines.append(" 2. 
Usage strategy:") + lines.append(" - Task decomposition: Don't let agents do too many things in a single call, task breakdown is your job, you need to decompose complex tasks into manageable subtasks") + lines.append(" - Professional matching: Assign tasks based on agent expertise") + lines.append(" - Information integration: Integrate outputs from different agents to generate coherent solutions") + lines.append(" - Efficiency optimization: Avoid duplicate work") + lines.append(" 3. Collaboration requirements:") + lines.append(" - Evaluate agent returned results") + lines.append(" - Provide additional guidance or reassign tasks when necessary") + lines.append(" - Work based on agent results, avoid duplicate work") + lines.append(" - Pay attention to preserving special symbols in sub-agent answers, such as index traceability information") + + return "\n".join(lines) + + +def _format_external_agents_description( + external_a2a_agents: Dict[str, Any], + language: str = "zh", +) -> str: + """Format external A2A agent descriptions with calling specifications. + + Jinja2 templates have ~5 lines of "外部助手调用规范" text that must be + included here for semantic equivalence. + """ + if not external_a2a_agents: + return "" + + lines = [] + + if language == "zh": + lines.append("你还可以使用以下外部助手(通过 A2A 协议远程调用):") + for agent_id, agent in external_a2a_agents.items(): + name = agent.name if hasattr(agent, 'name') else agent.get('name', '') + desc = agent.description if hasattr(agent, 'description') else agent.get('description', '') + lines.append(f" - {name}: {desc}") + lines.append("") + lines.append("外部助手调用规范:") + lines.append(" 1. 调用格式:`agent_name(task=\"自然语言任务描述\")`,注意:只需要 task 参数,不需要其他参数") + lines.append(" 2. 例如:`tool_assistant(task=\"北京天气怎么样\")`") + lines.append(" 3. 
任务描述使用自然语言,让外部助手自动识别和处理") + else: + lines.append("You can also use the following external agents (called via A2A protocol remotely):") + for agent_id, agent in external_a2a_agents.items(): + name = agent.name if hasattr(agent, 'name') else agent.get('name', '') + desc = agent.description if hasattr(agent, 'description') else agent.get('description', '') + lines.append(f" - {name}: {desc}") + lines.append("") + lines.append("External agent calling specifications:") + lines.append(" 1. Call format: `agent_name(task=\"natural language task description\")`, NOTE: only task parameter is needed, no other parameters") + lines.append(" 2. Example: `tool_assistant(task=\"What's the weather in Beijing?\")`") + lines.append(" 3. Use natural language for task description, let the external agent handle the rest") + + return "\n".join(lines) + + +def _format_skills_usage_requirements( + skills: List[Dict[str, str]], + language: str = "zh", +) -> str: + """Format skills usage requirements section. + + This is the "技能使用要求" section that appears after the skills reference + in the Available Resources section. + """ + if not skills: + no_skills_msg = "- 当前没有可用的技能" if language == "zh" else "- No skills are currently available" + return no_skills_msg + + lines = [] + + if language == "zh": + lines.append("- 你拥有上述 `` 中列出的技能。技能中引用的脚本通过 `run_skill_script()` 函数调用,该函数由平台提供,不需要导入。") + lines.append("") + lines.append("### 技能使用要求") + lines.append("1. **技能优先**:如果用户请求匹配了某个技能的 description,必须先调用 `read_skill_md()` 加载技能指南,再按指南执行。不得跳过技能自行编写代码解决。") + lines.append("2. **忠实执行**:读取技能内容后,严格按技能指南中的步骤操作。不要自行修改流程、跳过步骤或用通用代码替代技能定义的流程。") + lines.append("3. **脚本调用规范**:只使用 `run_skill_script` 工具执行技能指南中明确要求的脚本。传入的 `skill_name` 和 `script_path` 必须与技能指南中的声明完全一致,不要自行拼接或猜测路径。如果需要附加参数,将参数以命令行字符串形式传递给`run_skill_script`。") + lines.append("4. **失败回退**:如果 `read_skill_md` 返回错误或 `run_skill_script` 执行失败,向用户说明情况,并尝试用通用推理模式提供替代方案。") + lines.append("5. 
**技能组合**:如果一个任务需要多个技能配合,按逻辑依赖顺序依次加载和执行,前一个技能的输出可作为后一个技能的输入。") + else: + lines.append("- You have the skills listed in `` above. Scripts referenced in skills are called via the `run_skill_script()` function, which is provided by the platform and does not need to be imported.") + lines.append("") + lines.append("### Skill Usage Requirements") + lines.append("1. **Skill Priority**: If a user request matches a skill's description, you must first call `read_skill_md()` to load the skill guide, then execute per the guide. Do not skip skills and write your own code.") + lines.append("2. **Faithful Execution**: After reading skill content, strictly follow the skill guide's steps. Do not modify the flow, skip steps, or replace with generic code.") + lines.append("3. **Script Calling Specification**: Only use `run_skill_script` to execute scripts explicitly required in the skill guide. The `skill_name` and `script_path` must match the skill guide's declaration exactly. Do not construct or guess paths. For extra params, pass them as a command-line string to `run_skill_script`.") + lines.append("4. **Failure Fallback**: If `read_skill_md` returns an error or `run_skill_script` fails, explain to the user and try to provide an alternative via general reasoning mode.") + lines.append("5. **Skill Combination**: If a task needs multiple skills, load and execute in logical dependency order. 
The output of one skill can be input to the next.") + + return "\n".join(lines) + + +def _format_agent_fallback( + managed_agents: Dict[str, Any], + external_a2a_agents: Dict[str, Any], + language: str = "zh", +) -> str: + """Format fallback message when no agents are available.""" + if managed_agents or external_a2a_agents: + return "" + + return "- 当前没有可用的助手" if language == "zh" else "- No agents are currently available" + + +def _format_app_context(app_name: str, app_description: str, user_id: str, time_str: str) -> str: + """Format application context for system prompt injection.""" + lines = [ + f"Application: {app_name}", + f"Description: {app_description}", + f"Current user: {user_id}", + f"Current time: {time_str}", + ] + return "\n".join(lines) + + +# ============================================================================= +# SECTION 2: Skeleton component builders +# These build SystemPromptComponent instances for fixed text sections +# ============================================================================= + + +def build_skeleton_header_component( + app_name: str, + app_description: str, + time_str: str, + user_id: str, + language: str = "zh", + priority: int = 100, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for the header section. + + Section: "### 基本信息" / "### Basic Information" + Content: Agent identity, app name/description, time, user_id + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + if language == "zh": + content = f"### 基本信息\n你是{app_name},{app_description},现在是{time_str},用户ID为{user_id}" + else: + content = f"### Basic Information\nYou are {app_name}, {app_description}, it is {time_str} now" + + return SystemPromptComponent( + content=content, + template_name="header", + priority=priority, + ) + + +def build_skeleton_duty_component( + duty: str, + language: str = "zh", + priority: int = 80, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for the duty section. 
+ + Section: "### 核心职责" / "### Core Responsibilities" + Content: Agent's primary duty + 5 safety principles + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + if language == "zh": + content = f"### 核心职责\n{duty}\n\n请注意,你应该遵守以下原则:\n行为安全:文件操作必须使用平台提供的专用工具,禁止使用代码直接修改工作空间中的文件;\n法律合规:遵守业务所在国家/地区的法律法规;\n政治中立:保持政治中立,不主动讨论政治话题;\n安全防护:不响应涉及武器制造、网络攻击、欺诈、恶意软件等危险行为的请求;\n伦理准则:拒绝仇恨言论、歧视性内容及违反社会公德和公认伦理标准的请求。" + else: + content = f"### Core Responsibilities\n{duty}\n\nPlease note that you should follow these principles:\nBehavioral Safety: File operations must use the platform-provided dedicated tools; direct code modification of workspace files is prohibited;\nLegal Compliance: Comply with laws and regulations of the business operating jurisdiction;\nPolitical Neutrality: Maintain political neutrality and avoid initiating political discussions;\nSecurity Protection: Do not respond to requests involving weapon manufacturing, cyberattacks, fraud, malware, or other dangerous activities;\nEthical Guidelines: Refuse hate speech, discriminatory content, and any requests that violate social morals and commonly accepted ethical standards." + + return SystemPromptComponent( + content=content, + template_name="duty", + priority=priority, + ) + + +def build_skeleton_execution_flow_component( + memory_list: Optional[List[Any]] = None, + language: str = "zh", + is_manager: bool = True, + priority: int = 60, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for the execution flow section. 
+ + Section: "### 执行流程" / "### Execution Process" + Content: Think/Code loop instructions + output format specs + Note: memory_list affects one line in the Think section (manager only) + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + has_memory = memory_list and len(memory_list) > 0 + + if language == "zh": + lines = ["### 执行流程"] + lines.append("要解决任务,你必须通过一系列步骤向前规划,以'思考:'和'代码:'序列循环进行。**注意:禁止在代码执行前输出'观察结果:',观察结果只能由代码执行后产生。**") + lines.append("") + lines.append("1. 思考:") + lines.append(" - 分析当前任务状态和进展") + if is_manager and has_memory: + lines.append(" - 合理参考之前交互中的上下文记忆信息") + lines.append(" - 定下一步最佳行动(使用工具或分配给助手)") + lines.append(" - 解释你的决策逻辑和预期结果") + lines.append("") + lines.append("2. 代码:") + lines.append(" - 用简单的Python编写代码") + lines.append(" - 遵循python代码规范和python语法") + lines.append(" - 正确调用工具或助手解决问题") + lines.append(" - 考虑到代码执行与展示用户代码的区别,使用'代码'表达运行代码,使用'代码'表达展示代码") + lines.append(" - 注意运行的代码不会被用户看到,所以如果用户需要看到代码,你需要使用'代码'表达展示代码。") + lines.append(" - **重要**:代码执行后,系统会返回 \"Observation:\" 标记的内容(这是真实的执行结果)。请基于这些真实结果继续下一步思考,**不要在代码执行前自行编造观察结果**。") + lines.append("") + lines.append("在思考结束后,当你认为可以回答用户问题,那么可以不生成代码,直接生成最终回答给到用户并停止循环。") + lines.append("") + lines.append("生成最终回答时,你需要遵循以下规范:") + lines.append("1. Markdown格式要求:") + lines.append(" - 使用标准Markdown语法格式化输出,支持标题、列表、表格、代码块、链接等") + lines.append(" - 展示图片和视频使用链接方式,不需要外套代码块,格式:[链接文本](URL),图片格式:![alt文本](图片URL),视频格式:") + lines.append(" - 段落之间使用单个空行分隔,避免多个连续空行") + lines.append(" - 数学公式使用标准Markdown格式:行内公式用 $公式$,块级公式用 $$公式$$") + lines.append("") + lines.append("2. 引用标记规范(仅在使用了检索工具时):") + lines.append(" - 引用标记格式必须严格为:`[[字母+数字]]`,例如:`[[a1]]`、`[[b2]]`、`[[c3]]`") + lines.append(" - 字母部分必须是单个小写字母(a-e),数字部分必须是整数") + lines.append(" - 引用标记的字母和数字必须与检索工具的检索结果一一对应") + lines.append(" - 引用标记应紧跟在相关信息或句子之后,通常放在句末或段落末尾") + lines.append(" - 多个引用标记可以连续使用,例如:`[[a1]][[b2]]`") + lines.append(" - **重要**:仅添加引用标记,不要添加链接、参考文献列表等多余内容") + lines.append(" - 如果检索结果中没有匹配的引用,则不显示该引用标记") + lines.append("") + lines.append("3. 
格式细节要求:") + lines.append(" - 避免在Markdown中使用HTML标签,优先使用Markdown原生语法") + lines.append(" - 代码块中的代码应保持原始格式,不要添加额外的转义字符") + lines.append(" - 若未使用检索工具,则不添加任何引用标记") + else: + lines = ["### Execution Process"] + lines.append("To solve tasks, you must plan forward through a series of steps in a loop of 'Think:' and 'Code:' sequences. **IMPORTANT: You must NOT output 'Observe Results:' before code execution. Observation results can ONLY be generated after code execution.**") + lines.append("") + lines.append("1. Think:") + lines.append(" - Analyze current task status and progress") + if is_manager and has_memory: + lines.append(" - Reference relevant contextual memories from previous interactions when applicable") + lines.append(" - Determine the best next action (use tools or delegate to agents)") + lines.append(" - Explain your decision logic and expected results") + lines.append("") + lines.append("2. Code:") + lines.append(" - Write code in simple Python") + lines.append(" - Follow Python coding standards and Python syntax") + lines.append(" - Correctly call tools or agents to solve problems") + lines.append(" - To distinguish between code execution and displaying user code, use 'code' for executing code and 'code' for displaying code") + lines.append(" - Note that executed code is not visible to users. If users need to see the code, use 'code' for displaying code.") + lines.append(" - **IMPORTANT**: After code execution, the system will return content with \"Observation:\" marker (this is the real execution result). Please continue your next thinking based on these real results. **Do NOT fabricate observation results before code execution.**") + lines.append("") + lines.append("After thinking, when you believe you can answer the user's question, you can generate a final answer directly to the user without generating code and stop the loop.") + lines.append("") + lines.append("When generating the final answer, you need to follow these specifications:") + lines.append("1. 
**Markdown Format Requirements**:") + lines.append(" - Use standard Markdown syntax to format your output, supporting headings, lists, tables, code blocks, and links.") + lines.append(" - Display images and videos using links instead of wrapping them in code blocks. Use `[link text](URL)` for links, `![alt text](image URL)` for images, and `` for videos.") + lines.append(" - Use a single blank line between paragraphs, avoid multiple consecutive blank lines") + lines.append(" - Mathematical formulas use standard Markdown format: inline formulas use $formula$, block formulas use $$formula$$") + lines.append("") + lines.append("2. **Reference Mark Specifications** (only when retrieval tools are used):") + lines.append(" - Reference mark format must strictly be: `[[letter+number]]`, for example: `[[a1]]`, `[[b2]]`, `[[c3]]`") + lines.append(" - The letter part must be a single lowercase letter (a-e), the number part must be an integer") + lines.append(" - The letters and numbers of reference marks must correspond one-to-one with the retrieval results of retrieval tools") + lines.append(" - Reference marks should be placed immediately after relevant information or sentences, usually at the end of sentences or paragraphs") + lines.append(" - Multiple reference marks can be used consecutively, for example: `[[a1]][[b2]]`") + lines.append(" - **Important**: Only add reference marks, do not add links, reference lists, or other extraneous content") + lines.append(" - If there is no matching reference in the retrieval results, do not display that reference mark") + lines.append("") + lines.append("3. 
**Format Detail Requirements**:") + lines.append(" - Avoid using HTML tags in Markdown, prioritize native Markdown syntax") + lines.append(" - Code in code blocks should maintain original format, do not add extra escape characters") + lines.append(" - If no retrieval tools are used, do not add any reference marks") + + content = "\n".join(lines) + + return SystemPromptComponent( + content=content, + template_name="execution_flow", + priority=priority, + ) + + +def build_skeleton_constraint_component( + constraint: str, + language: str = "zh", + priority: int = 30, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for the constraint section. + + Section: "### 资源使用要求" / "### Resource Usage Requirements" + Content: User-defined constraint text + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + if language == "zh": + content = f"### 资源使用要求\n{constraint}" + else: + content = f"### Resource Usage Requirements\n{constraint}" + + return SystemPromptComponent( + content=content, + template_name="constraint", + priority=priority, + ) + + +def build_skeleton_code_norms_component( + language: str = "zh", + is_manager: bool = True, + priority: int = 20, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for the Python code norms section. + + Section: "### python代码规范" / "### Python Code Specifications" + Content: 12 fixed code rules (11 for managed agents) + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + if language == "zh": + lines = ["### python代码规范"] + lines.append("1. 如果认为是需要执行的代码,使用'代码'格式;如果是不需要执行仅用于展示的代码,使用'代码'格式,其中语言类型例如python、java、javascript等;") + lines.append("2. 只使用已定义的变量,变量将在多次调用之间持续保持;") + lines.append("3. 使用\"print()\"函数让下一次的模型调用看到对应变量信息;") + lines.append("4. 正确使用工具/助手的入参,使用关键字参数,不要用字典形式;") + lines.append("5. 避免在一轮对话中进行过多的工具/助手调用,这会导致输出格式难以预测;") + lines.append("6. 只在需要时调用工具/助手,不重复相同参数的调用;") + lines.append("7. 
使用变量名保存函数调用结果,在每个中间步骤中,您可以使用\"print()\"来保存您需要的任何重要信息。被保存的信息在代码执行之间保持。print()输出的内容应被视为字符串,不要对其进行字典相关操作如.get()、[]等,避免类型错误;") + lines.append("9. 示例中的代码避免出现**if**、**for**等逻辑,仅调用工具/助手,示例中的每一次的行动都是确定事件。如果有不同的条件,你应该给出不同条件下的示例;") + lines.append("10. 工具调用使用关键字参数,如:tool_name(param1=\"value1\", param2=\"value2\");") + if is_manager: + lines.append("11. 助手调用必须使用task参数,如:assistant_name(task=\"任务描述\");") + lines.append("12. 不要放弃!你负责解决任务,而不是提供解决方向。") + else: + lines = ["### Python Code Specifications"] + lines.append("1. If it is considered to be code that needs to be executed, use 'code'. If the code does not need to be executed for display only, use 'code', where language_type can be python, java, javascript, etc;") + lines.append("2. Only use defined variables, variables will persist between multiple calls;") + lines.append("3. Use \"print()\" function to let the next model call see corresponding variable information;") + lines.append("4. Use tool/agent input parameters correctly, use keyword arguments, not dictionary format;") + lines.append("5. Avoid making too many tool/agent calls in one round of conversation, as this will make the output format unpredictable;") + lines.append("6. Only call tools/agents when needed, do not repeat calls with the same parameters;") + lines.append("7. Use variable names to save function call results. In each intermediate step, you can use \"print()\" to save any important information you need. The saved information persists between code executions. The content printed by print() should be treated as a string, do not perform dictionary-related operations such as .get(), [] etc., to avoid type errors;") + lines.append("8. Avoid **if**, **for** and other logic in example code, only call tools/agents. Each action in the example is a deterministic event. If there are different conditions, you should provide examples under different conditions;") + lines.append("9. 
Tool calls use keyword arguments, such as: tool_name(param1=\"value1\", param2=\"value2\");") + if is_manager: + lines.append("10. Agent calls must use task parameter, such as: agent_name(task=\"task description\");") + lines.append("11. Don't give up! You are responsible for solving the task, not providing solution directions.") + + content = "\n".join(lines) + + return SystemPromptComponent( + content=content, + template_name="code_norms", + priority=priority, + ) + + +def build_skeleton_footer_component( + few_shots: str, + language: str = "zh", + priority: int = 10, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for the footer section. + + Section: "### 示例模板" + ending + Content: few_shots + "$1M reward" ending + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + if language == "zh": + content = f"### 示例模板\n{few_shots}\n\n现在开始!如果你正确解决任务,你将获得100万美元的奖励。" + else: + content = f"### Example Templates\n{few_shots}\n\nNow start! If you solve the task correctly, you will receive a reward of 1 million dollars." + + return SystemPromptComponent( + content=content, + template_name="footer", + priority=priority, + ) + + +# ============================================================================= +# SECTION 3: Piecewise component builders (existing, enhanced) +# ============================================================================= + + +def build_tools_component( + tools: Dict[str, Any], + knowledge_base_summary: Optional[str] = None, + language: str = "zh", + is_manager: bool = True, + priority: int = 50, +) -> "ToolsComponent": + """Build ToolsComponent from tool configurations. 
+ + Args: + tools: Dict of tool name -> ToolConfig or tool dict + knowledge_base_summary: Summary text from knowledge bases + language: Language code ('zh' or 'en') + is_manager: Whether this is a manager agent + priority: Component priority for selection + + Returns: + ToolsComponent instance + """ + from nexent.core.agents.agent_model import ToolsComponent + + tool_list = [] + for name, tool in tools.items(): + if hasattr(tool, 'description'): + tool_dict = { + "name": name, + "description": tool.description, + "inputs": getattr(tool, 'inputs', ''), + "output_type": getattr(tool, 'output_type', ''), + "source": getattr(tool, 'source', 'local'), + } + else: + tool_dict = { + "name": name, + "description": tool.get('description', ''), + "inputs": tool.get('inputs', ''), + "output_type": tool.get('output_type', ''), + "source": tool.get('source', 'local'), + } + tool_list.append(tool_dict) + + formatted_desc = _format_tools_description( + tools, + knowledge_base_summary=knowledge_base_summary, + language=language, + is_manager=is_manager, + ) + return ToolsComponent( + tools=tool_list, + formatted_description=formatted_desc, + priority=priority, + ) + + +def build_skills_component( + skills: List[Dict[str, str]], + language: str = "zh", + priority: int = 70, +) -> "SkillsComponent": + """Build SkillsComponent from skill configurations. 
+ + Args: + skills: List of skill dicts with name and description + language: Language code ('zh' or 'en') + priority: Component priority for selection + + Returns: + SkillsComponent instance + """ + from nexent.core.agents.agent_model import SkillsComponent + + formatted_desc = _format_skills_description(skills, language=language) + return SkillsComponent( + skills=skills, + formatted_description=formatted_desc, + priority=priority, + ) + + +def build_memory_component( + memory_list: List[Any], + search_query: Optional[str] = None, + language: str = "zh", + priority: int = 90, +) -> "MemoryComponent": + """Build MemoryComponent from memory search results. + + Args: + memory_list: List of memory search results + search_query: Query used to search memory + language: Language code ('zh' or 'en') + priority: Component priority for selection + + Returns: + MemoryComponent instance + """ + from nexent.core.agents.agent_model import MemoryComponent + + memories = [] + for mem in memory_list: + if isinstance(mem, dict): + memories.append({ + "content": mem.get('memory', '') or mem.get('content', ''), + "memory_type": mem.get('memory_type', 'user'), + "metadata": mem.get('metadata', {}), + }) + elif isinstance(mem, str): + memories.append({ + "content": mem, + "memory_type": "user", + "metadata": {}, + }) + + formatted_content = _format_memory_context(memory_list, language=language) + return MemoryComponent( + memories=memories, + formatted_content=formatted_content, + search_query=search_query, + priority=priority, + ) + + +def build_knowledge_base_component( + knowledge_base_summary: str, + kb_ids: Optional[List[str]] = None, + priority: int = 10, +) -> "KnowledgeBaseComponent": + """Build KnowledgeBaseComponent from knowledge base summary. 
+ + Args: + knowledge_base_summary: Summary text from knowledge bases + kb_ids: List of knowledge base IDs used + priority: Component priority for selection + + Returns: + KnowledgeBaseComponent instance + """ + from nexent.core.agents.agent_model import KnowledgeBaseComponent + + return KnowledgeBaseComponent( + summary=knowledge_base_summary, + kb_ids=kb_ids or [], + priority=priority, + ) + + +def build_managed_agents_component( + managed_agents: Dict[str, Any], + language: str = "zh", + priority: int = 45, +) -> "ManagedAgentsComponent": + """Build ManagedAgentsComponent from managed sub-agent configurations. + + Args: + managed_agents: Dict of agent name -> AgentConfig + language: Language code ('zh' or 'en') + priority: Component priority for selection + + Returns: + ManagedAgentsComponent instance + """ + from nexent.core.agents.agent_model import ManagedAgentsComponent + + agent_list = [] + for name, agent in managed_agents.items(): + if hasattr(agent, 'description'): + agent_dict = { + "name": name, + "description": agent.description, + "tools": [], + } + if hasattr(agent, 'tools'): + agent_dict["tools"] = [t.name for t in agent.tools if hasattr(t, 'name')] + else: + agent_dict = { + "name": name, + "description": agent.get('description', ''), + "tools": [], + } + agent_list.append(agent_dict) + + formatted_desc = _format_managed_agents_description(managed_agents, language=language) + return ManagedAgentsComponent( + agents=agent_list, + formatted_description=formatted_desc, + priority=priority, + ) + + +def build_external_agents_component( + external_a2a_agents: Dict[str, Any], + language: str = "zh", + priority: int = 44, +) -> "ExternalAgentsComponent": + """Build ExternalAgentsComponent from external A2A agent configurations. 
+ + Args: + external_a2a_agents: Dict of agent_id -> ExternalA2AAgentConfig + language: Language code ('zh' or 'en') + priority: Component priority for selection + + Returns: + ExternalAgentsComponent instance + """ + from nexent.core.agents.agent_model import ExternalAgentsComponent + + agent_list = [] + for agent_id, agent in external_a2a_agents.items(): + if hasattr(agent, 'agent_id'): + agent_dict = { + "agent_id": str(agent.agent_id), + "name": agent.name, + "description": agent.description, + "url": getattr(agent, 'url', ''), + } + else: + agent_dict = { + "agent_id": str(agent_id), + "name": agent.get('name', ''), + "description": agent.get('description', ''), + "url": agent.get('url', ''), + } + agent_list.append(agent_dict) + + formatted_desc = _format_external_agents_description(external_a2a_agents, language=language) + return ExternalAgentsComponent( + agents=agent_list, + formatted_description=formatted_desc, + priority=priority, + ) + + +def build_system_prompt_component( + content: str, + template_name: Optional[str] = None, + priority: int = 100, +) -> "SystemPromptComponent": + """Build SystemPromptComponent with rendered content. + + Args: + content: Rendered system prompt content + template_name: Source template name for reference + priority: Component priority (highest by default) + + Returns: + SystemPromptComponent instance + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + return SystemPromptComponent( + content=content, + template_name=template_name, + priority=priority, + ) + + +def build_skills_usage_component( + skills: List[Dict[str, str]], + language: str = "zh", + priority: int = 40, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for skills usage requirements. + + This is a skeleton-like component but its content depends on + whether skills exist, so it's built dynamically. 
+ + Args: + skills: List of skill dicts + language: Language code ('zh' or 'en') + priority: Component priority + + Returns: + SystemPromptComponent instance + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + content = _format_skills_usage_requirements(skills, language=language) + return SystemPromptComponent( + content=content, + template_name="skills_usage", + priority=priority, + ) + + +def build_agent_fallback_component( + managed_agents: Dict[str, Any], + external_a2a_agents: Dict[str, Any], + language: str = "zh", + priority: int = 5, +) -> "SystemPromptComponent": + """Build SystemPromptComponent for agent fallback message. + + Only emits content when no agents are available. + + Args: + managed_agents: Dict of managed agents + external_a2a_agents: Dict of external agents + language: Language code + priority: Component priority + + Returns: + SystemPromptComponent instance (may have empty content) + """ + from nexent.core.agents.agent_model import SystemPromptComponent + + content = _format_agent_fallback(managed_agents, external_a2a_agents, language=language) + return SystemPromptComponent( + content=content, + template_name="agent_fallback", + priority=priority, + ) + + +# ============================================================================= +# SECTION 4: Main assembly function - build_context_components +# ============================================================================= + + +def build_context_components( + # Raw params for piecewise assembly (NEW in Goal 3) + duty: Optional[str] = None, + constraint: Optional[str] = None, + few_shots: Optional[str] = None, + app_name: Optional[str] = None, + app_description: Optional[str] = None, + time_str: Optional[str] = None, + user_id: Optional[str] = None, + language: str = "zh", + is_manager: bool = True, + # Piecewise data sources + tools: Optional[Dict[str, Any]] = None, + skills: Optional[List[Dict[str, str]]] = None, + managed_agents: Optional[Dict[str, Any]] = 
None, + external_a2a_agents: Optional[Dict[str, Any]] = None, + memory_list: Optional[List[Any]] = None, + memory_search_query: Optional[str] = None, + knowledge_base_summary: Optional[str] = None, + kb_ids: Optional[List[str]] = None, + # Legacy param for fallback (removed short-circuit in Goal 3) + system_prompt: Optional[str] = None, + # Inclusion flags (kept for backward compatibility) + include_tools: bool = True, + include_skills: bool = True, + include_memory: bool = True, + include_knowledge_base: bool = True, + include_managed_agents: bool = True, + include_external_agents: bool = True, + include_app_context: bool = True, +) -> List["ContextComponent"]: + """Build list of ContextComponents from agent configuration data. + + Piecewise assembly: Each semantic section is emitted as a dedicated + ContextComponent, assembled in the exact order matching Jinja2 templates. + + Assembly order (13 sections): + 1. Header (基本信息) + 2. Memory (上下文记忆) - if memory_list exists + 3. Duty (核心职责 + 安全准则) + 4. Skills (可用技能 + 6步流程) - if skills exist + 5. Execution Flow (执行流程 + 输出规范) + 6. Tools (可用资源/1. 工具 + 文件链接指南) + 7. Managed Agents (可用资源/2. 助手) - if managed_agents exist + 8. External Agents (外部助手) - if external_a2a_agents exist + 9. Agent Fallback (当前没有可用的助手) - if no agents + 10. Skills Usage (可用资源/3. 技能 + 使用要求) + 11. Constraint (资源使用要求) + 12. Code Norms (python代码规范) + 13. Footer (示例模板 + 结尾) + + Note: The a330d815 short-circuit (if system_prompt: return [single]) + has been REMOVED. All callers must provide raw params for piecewise assembly. + The system_prompt param is kept for future fallback use but not currently + used in the piecewise path. 
+ + Args: + duty: Agent's primary duty text + constraint: Resource usage constraint text + few_shots: Example templates text + app_name: Application name + app_description: Application description + time_str: Current time string + user_id: Current user ID + language: Language code ('zh' or 'en') + is_manager: Whether this is a manager agent + tools: Dict of tool name -> ToolConfig + skills: List of skill dicts with name and description + managed_agents: Dict of agent name -> AgentConfig + external_a2a_agents: Dict of agent_id -> ExternalA2AAgentConfig + memory_list: List of memory search results + memory_search_query: Query used to search memory + knowledge_base_summary: Summary text from knowledge bases + kb_ids: List of knowledge base IDs + system_prompt: (Legacy) Pre-rendered system prompt - NOT USED in piecewise path + include_*: Flags for backward compatibility + + Returns: + List of ContextComponent instances ready for ContextManager + """ + components: List = [] + + # 1. Header + if app_name and app_description and time_str and user_id: + components.append( + build_skeleton_header_component( + app_name=app_name, + app_description=app_description, + time_str=time_str, + user_id=user_id, + language=language, + ) + ) + + # 2. Memory (if exists) + if include_memory and memory_list: + components.append( + build_memory_component( + memory_list=memory_list, + search_query=memory_search_query, + language=language, + ) + ) + + # 3. Duty + Safety Principles + if duty: + components.append( + build_skeleton_duty_component( + duty=duty, + language=language, + ) + ) + + # 4. Skills (if exists) - includes 6-step process + if include_skills and skills: + components.append( + build_skills_component( + skills=skills, + language=language, + ) + ) + + # 5. Execution Flow + components.append( + build_skeleton_execution_flow_component( + memory_list=memory_list, + language=language, + is_manager=is_manager, + ) + ) + + # 6. 
Tools + File URL Guide + if include_tools and tools: + components.append( + build_tools_component( + tools=tools, + knowledge_base_summary=knowledge_base_summary, + language=language, + is_manager=is_manager, + ) + ) + + # 7. Managed Agents (if exists) - manager only + if is_manager and include_managed_agents and managed_agents: + components.append( + build_managed_agents_component( + managed_agents=managed_agents, + language=language, + ) + ) + + # 8. External Agents (if exists) - manager only + if is_manager and include_external_agents and external_a2a_agents: + components.append( + build_external_agents_component( + external_a2a_agents=external_a2a_agents, + language=language, + ) + ) + + # 9. Agent Fallback (if no agents available) - manager only + if is_manager and not managed_agents and not external_a2a_agents: + fallback_comp = build_agent_fallback_component( + managed_agents=managed_agents or {}, + external_a2a_agents=external_a2a_agents or {}, + language=language, + ) + if fallback_comp.content: # Only add if has content + components.append(fallback_comp) + + # 10. Skills Usage Requirements + if include_skills: + components.append( + build_skills_usage_component( + skills=skills or [], + language=language, + ) + ) + + # 11. Constraint + if constraint: + components.append( + build_skeleton_constraint_component( + constraint=constraint, + language=language, + ) + ) + + # 12. Code Norms + components.append( + build_skeleton_code_norms_component( + language=language, + is_manager=is_manager, + ) + ) + + # 13. Footer + if few_shots: + components.append( + build_skeleton_footer_component( + few_shots=few_shots, + language=language, + ) + ) + + return components + + +def build_app_context_string( + app_name: str, + app_description: str, + user_id: str, +) -> str: + """Build app context string for template injection. 
+ + Args: + app_name: Application name + app_description: Application description + user_id: Current user ID + + Returns: + Formatted app context string + """ + time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return _format_app_context(app_name, app_description, user_id, time_str) \ No newline at end of file diff --git a/sdk/benchmark/README.md b/sdk/benchmark/README.md new file mode 100644 index 000000000..2254ecc0d --- /dev/null +++ b/sdk/benchmark/README.md @@ -0,0 +1,324 @@ +# Agent Context Compression Benchmark + +## Objectives + +Evaluate whether the compressed Agent can still function properly: + +- **Continuation**: Can the agent continue the task after compression? +- **Memory Retention**: Can the agent remember key states after compression? +- **Token Reduction**: Does the token count effectively decrease? + + +--- + +## Three Evaluation Paths + +``` +benchmark/ +├── manual_cases/ # Handcrafted cases, complete evaluation pipeline +├── acon_eval/ # QA evaluation based on ACON dataset +├── eventqa_eval/ # Long-text memory evaluation based on EventQA dataset +└── paths.py # Shared path resolution +``` + +### 1. manual_cases — Handcrafted Case Evaluation + +Handcrafted test cases running the complete evaluation pipeline (continuation, probe, static inspection). 
+ +``` +manual_cases/ +├── cases/ # test_benchmark.py input +│ └── / +│ ├── case.json # queries, probes, checks, config +│ └── history.json # conversation history +├── inspections/ # summary_inspector.py input (standalone run) +│ └── / +│ ├── history.json +│ ├── checks.json # [{"description": "...", "must_contain": [...]}] +│ ├── _result.json # output: inspection results +│ └── _summary.txt # output: raw summary text (--save-summary) +├── reports/ # test_benchmark.py output +│ ├── .json # single-case complete report +│ └── summary.json # cross-case aggregate metrics +├── agent_runner.py # agent run + tracing utilities +├── eval_utils.py # keyword evaluation +├── summary_inspector.py # standalone summary inspection (low cost, no agent run) +└── test_benchmark.py # complete benchmark runner +``` + +`case.json` format: + +```json +{ + "id": "example_infra", + "history_file": "history.json", + "queries": [], + "probes": [], + "summary_checks": [], + "task_checks": [], + "compressed_config": {} +} +``` + +- `id`: unique case identifier, also used as report filename +- `history_file`: conversation history file, relative to case directory (default `history.json`) +- `queries`: continuation queries +- `probes`: memory probe questions +- `summary_checks`: static summary inspections +- `task_checks`: task result inspections +- `compressed_config`: compression config overrides + +`history.json` format: + +```json +[ + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."} +] +``` + + +#### Evaluation Metrics + +Each case runs two groups: + +1. **baseline** (no compression) +2. **compressed** (compression enabled) + +Core metrics: + +```python +task_success_retention = compressed_task_score / baseline_task_score + +probe_retention = compressed_probe_score / baseline_probe_score + +token_reduction = 1 - compressed_tokens / baseline_tokens +``` + +--- + +**Continuation Evaluation** +Continuation queries simulate real multi-turn Agent interactions. 
+ +Allowed: + +- history growth +- continuous compression occurrence +- ContextManager reuse across turns + +This is a **stateful** evaluation. + + +**Probe Evaluation** +Probes check whether the compressed agent can **utilize** residual information to answer questions. + +Important rules: + +- freeze the compressed history snapshot (deep copy per probe) +- each probe runs independently +- probes cannot modify the original history (isolated via deep copy) +- probes cannot share context with each other + +Compression happens once, all probes reuse the result: + +1. Get summary + compression_boundary from the compressed run's `export_summary()` +2. Build precompressed history with `build_precompressed_history()`: + - compressed pairs replaced with a single (user=summary, assistant=ack) + - retained tail pairs preserved verbatim +3. Each probe runs with precompressed history + compression disabled +4. Avoid redundant compression LLM calls per probe (same input → same compression result, no need to call LLM repeatedly) + + +### 2. acon_eval — Dataset-driven QA Evaluation + +Uses ACON's `nq_multi_8` dataset (multi-objective questions + Wikipedia search) to evaluate compression's impact on QA accuracy. + +Unlike manual_cases, this **does not use** handcrafted probes or continuation queries, but directly compares baseline vs compressed **task accuracy** (EM/F1) on a standardized dataset. 
+ +``` +acon_eval/ +├── data/nq_multi_8/ # ACON dataset (JSONL) +│ ├── train.jsonl +│ ├── test.jsonl +│ └── folds/ # few-shot fold data +├── outputs/ # results per mode +│ ├── baseline/test/ +│ │ ├── predictions.jsonl +│ │ └── summary.json +│ └── context_manager/test/ +│ ├── predictions.jsonl +│ └── summary.json +├── agent_runner.py # agent run + tracing +├── dataset.py # ACON dataset loader +├── eval_utils.py # EM/F1 scoring +├── run_acon_qa.py # main entry point +└── tools.py # wikipedia_search + final_answer tools +``` + +Usage: + +```bash +# First start ACON retriever service (see ACON README) https://github.com/microsoft/acon/blob/main/experiments/smolagents/README.md +# python retriever_server.py --index_path database/wikipedia/bm25/ --corpus_path database/wikipedia/wiki-18.jsonl +# The retriever_server.py above has been modified (see this directory's version). Also need to manually download bm25 index files and wiki-18 dataset +# bm25: https://huggingface.co/datasets/PeterJinGo/wiki-18-bm25-index/tree/main/bm25 +# wiki-18: https://huggingface.co/datasets/PeterJinGo/wiki-18-corpus/tree/main +python run_acon_qa.py \ + --data_folder ./data/nq_multi_8 \ + --split test \ + --mode baseline \ + --num_objectives 4 \ + --limit 1 + +python run_acon_qa.py \ + --data_folder ./data/nq_multi_8 \ + --split test \ + --mode context_manager \ + --num_objectives 4 \ + --token_threshold 6000 \ + --keep_recent_steps 4 \ + --enable_reload \ + --limit 1 + +``` + +**Modes**: `baseline` (no compression) vs `context_manager` (nexent built-in compression). +**Note**: The conversation history structure here differs from manual_cases. This test scenario has no previous history, only multi-step within the current session. + +--- + +### 3. eventqa_eval — EventQA Long-text Memory Evaluation + +Uses MemoryAgentBench's EventQA dataset (5 novels, each 390K–530K tokens, 100 "what happens next" six-choice MCQs per book) to evaluate compression's impact on **ultra-long document memory**. 
+ +Like acon_eval, this is dataset-driven but with a different scenario: the entire novel as history to be compressed, MCQs directly serve as memory probes—questions come with prior events, naturally asking "given the compressed summary, what happens next", no need for additional probe construction. + +``` +eventqa_eval/ +├── data/ # novels downloaded by download_data.py +│ └── eventqa_full.jsonl +├── outputs/ # results per book +│ └── / +│ │ ├── predictions.jsonl # per-question baseline vs compressed comparison +│ │ └── summary.json # single-book metrics +│ └── summary.json # cross-book aggregate +├── download_data.py # download EventQA data from HuggingFace +├── dataset.py # EventQA loader + six-choice MCQ parser +├── eval_utils.py # six-choice accuracy scoring +└── run_eventqa.py # main entry point +``` + +**Two evaluation arms** (same model, clean retention ratio): + +| Arm | Compression | Novel Context | +|---|---|---| +| Baseline | Disabled | Entire novel truncated to model window then fed whole (questions beyond window will fail) | +| Compressed | Enabled | Novel chunked and fed in multiple turns, real ContextManager incremental compression; MCQs as probes | + +Both arms answer **the same 100 questions**, so the retention ratio is clean: + +```python +memory_retention = compressed_accuracy / baseline_accuracy + +token_reduction = 1 - last_compressed_tokens / last_uncompressed_tokens +``` + +No Continuation evaluation—EventQA MCQs are independent, no multi-turn task continuation. 
+ +Usage: + +```bash +# One-time: download 5 novels (~13MB, written to data/) +python download_data.py + +# Smoke test: 1 book, 1 question, novel truncated to 48K chars (trigger compression) +python run_eventqa.py --book_limit 1 --limit 1 \ + --max_ingest_chars 48000 --chunk_chars 12000 \ + --token_threshold 3000 --keep_recent_pairs 1 + +# Full run: 5 books × 100 questions +python run_eventqa.py +``` + +**Note**: `eventqa_full` novels are 1.7M–3.2M characters long, and no model can ingest an entire book without compression, so the baseline uses "truncate to window" as the no-compression control (`--baseline_context_chars` controls the truncation length). The dataset also has pre-truncated `eventqa_65536` / `eventqa_131072` variants, but their questions differ from those of `eventqa_full`, so their results cannot be compared directly with `eventqa_full` results. + +--- + +## Supplementary Notes + +### Probe Construction Principle: Only Target Compressed Content + +The core purpose of probes is to detect memory retention, i.e., "whether the agent can answer questions about information that was compressed away". +Therefore **probes should only ask about content in the compressed region**, not information retained in the tail steps. + +The compression boundary is temporal: `keep_recent_pairs=N` means the last N pairs are preserved verbatim, and everything before them enters the summary. Therefore: + +- **Probes should only ask about details in the early pairs (history first half)** +- If a probe asks about information in recent pairs, the agent can answer without the summary, so the probe fails as a measurement: it cannot measure memory retention + +When constructing probes, there is no need to know exactly what the compressor retained; just ensure that the information each probe depends on comes from early history (the region that will definitely be compressed). + +**Verify probe design**: Use `export_summary()`'s `compression_boundary` field to confirm which pairs were compressed vs retained. 
If the probe's answer isn't in the summary at all, that's a compressor problem (belongs to Static Inspection layer), not an agent problem. + +--- + +### Static Summary Inspection vs Probe Eval + +Both test different failure modes: + +| | Probe Eval | Static Summary Inspection | +|--|-----------|--------------------------| +| Input | Complete compressed context (summary + retained tail steps + system prompt) | Summary text only | +| Execution | Let agent answer questions (run LLM) | Directly inspect summary text for key information | +| What it tests | Whether agent **can utilize** residual information | Whether compressor **chose to retain** key information | +| Failure meaning | Summary has it but agent didn't use it → retrieval/utilization capability issue | Summary doesn't have it → compressor lost it | + +**Two different failure modes**: +1. Compressor retained, but agent didn't utilize → **Probe Eval** catches this, Inspection won't +2. Compressor didn't retain at all → Both catch this, but should attribute to Inspection layer + +--- + +### Static Summary Inspection + +Directly inspect whether the compressed summary still contains key information. 
+ +#### Online Approach + +After agent run, export compression state: + +```python +compressed_state = shared_cm.export_summary() +# compressed_state contains: +# previous_summary / current_summary: compressed summary text +# compression_boundary: which pairs/steps were compressed vs retained +# previous_cache_info / current_cache_info: cache metadata + +for check in summary_checks: + eval_text(compressed_state["previous_summary"], check) +``` + +#### Offline Approach + +Run compression on pure text pairs without agent, using the same prompt and schema: + +```python +from nexent.core.agents.agent_context import compress_history_offline + +result = compress_history_offline( + pairs=[("What user said", "What assistant did"), ...], + model=llm_model, + config=ContextManagerConfig(), +) +# result["summary"]: compressed summary +# result["is_incremental"]: whether incremental compression was used +# result["is_fallback"]: whether LLM failed and used fallback +# result["input_text"]: raw text fed to LLM (for debugging) + +eval_text(result["summary"], {"must_contain": ["key_filename"]}) +``` + +Offline approach advantages: +- No need to run agent, just one LLM call for compression +- No dependency on AgentMemory, ActionStep and other runtime objects +- Suitable for batch evaluation of different prompt/schema impacts on compression quality \ No newline at end of file diff --git a/sdk/benchmark/acon_eval/.gitignore b/sdk/benchmark/acon_eval/.gitignore new file mode 100644 index 000000000..ac377248b --- /dev/null +++ b/sdk/benchmark/acon_eval/.gitignore @@ -0,0 +1,9 @@ +# Source datasets (regenerate via dataset.py / external download) +data/ + +# Generated benchmark results (regenerated by each run) +outputs/ + +# Runtime artifacts +__pycache__/ +nexent_context_metrics.log diff --git a/sdk/benchmark/acon_eval/README.md b/sdk/benchmark/acon_eval/README.md new file mode 100644 index 000000000..33f74aaea --- /dev/null +++ b/sdk/benchmark/acon_eval/README.md @@ -0,0 +1,281 @@ +# 
ACON QA 评估 + +基于 [ACON](https://github.com/microsoft/acon) 的 `nq_multi_8` 数据集(多目标问题 + Wikipedia 搜索),评估 nexent 上下文压缩对 QA 准确率的影响。 + +## 目的 + +对比 **baseline**(不压缩)与 **context_manager**(nexent 内置压缩)在标准化数据集上的任务准确率(EM/F1)、token 消耗和压缩成本。 + +与 `manual_cases` 不同,这里不使用手工构造的 probe 或 continuation query,而是直接在标准化数据集上衡量:上下文压缩介入后,agent 是否仍能正确回答多跳问题。 + +## 目录结构 + +``` +acon_eval/ +├── data/nq_multi_8/ # ACON 数据集(JSONL) +│ ├── train.jsonl +│ ├── test.jsonl +│ └── folds/ # few-shot 折叠数据 +├── outputs/ # 各模式结果 +│ ├── baseline/test/ +│ │ ├── predictions.jsonl # 逐样本预测 + 得分 +│ │ └── summary.json # 汇总 EM/F1/token 指标 +│ └── context_manager/test/ +│ ├── predictions.jsonl +│ └── summary.json +├── run_acon_qa.py # 主入口 +├── dataset.py # JSONL 加载器 + QAExample 数据类 +├── eval_utils.py # SQuAD 风格 EM 和 F1 评分 +├── tools.py # wikipedia_search + final_answer 工具 +└── retriever_sesrver.py # 本地 FastAPI 检索引擎(BM25 over wiki-18) +``` + +## 前置准备 + +### 1. 启动 ACON Retriever 服务 + +下载 BM25 索引(约 2.2GB)和 wiki-18 语料(2018 年英文 Wikipedia 全量快照,约 14GB,约 500 万篇条目),然后启动检索引擎: + +```bash +# 下载 BM25 索引 +# https://huggingface.co/datasets/PeterJinGo/wiki-18-bm25-index/tree/main/bm25 +# 下载 wiki-18 语料 +# https://huggingface.co/datasets/PeterJinGo/wiki-18-corpus/tree/main +# export OPENAI_API_KEY="xxx" 程序默认需要非空的OPENAI_API_KEY,但是实际上用不到,这里需要占位 + +python retriever_server.py \ + --index_path database/wikipedia/bm25/ \ + --corpus_path database/wikipedia/wiki-18.jsonl +``` + +服务监听在 `http://127.0.0.1:8005/retrieve`。 + +### 2. 数据集 + +将 `nq_multi_8` 数据集放入 `data/nq_multi_8/`。数据来源于 Natural Questions,每条样本包含 8 个子问题,需要 agent 通过 Wikipedia 搜索逐一回答。 + +**数据格式**(JSONL,每行一条): + +```json +{ + "id": "nq_multi8_test_2200", + "question": "where is the food stored in a yam plant?; who plays lefou in beauty and the beast 1991?; ...", + "answer": [ + ["an edible tuber"], + ["Jesse Corti", "Venezuelan voice actor Jesse Corti"], + ... 
+ ] +} +``` + +字段说明: + +| 字段 | 说明 | +|---|---| +| `id` | 样本唯一标识 | +| `question` | 8 个子问题,用 `; ` 拼接。可通过 `--num_objectives` 截断使用前 N 个 | +| `answer` | 长度为 8 的列表,每个元素是一个 **gold answer 变体列表**(同义词/别名均视为正确) | + +`QALoader`(`dataset.py`)负责解析 JSONL,自动兼容 `id`/`qid`/`question_id`、`question`/`query`、`answer`/`answers`/`final_answer` 等多种字段名。 + +## Agent 工具 + +`tools.py` 定义了两个 smolagents `Tool` 子类,供 nexent agent 调用。 + +### wikipedia_search + +通过 HTTP POST 调用本地 retriever 服务,在 2018 Wikipedia BM25 索引中进行**关键词检索**(BM25 是基于词频和逆文档频率的词汇匹配算法,非语义搜索)。 + +**关键约束**:agent 必须通过 `wikipedia_search` 获取答案,**禁止依赖模型自有知识直接作答**。原因:(1) 数据集标注以 2018 Wikipedia 内容为准,模型训练数据可能过时或缺失相关条目;(2) 多跳搜索过程会产生 agent steps 累积,正是压缩评估所依赖的场景。 + +```python +class WikipediaSearchTool(Tool): + name = "wikipedia_search" + inputs = { + "query": {"type": "string"}, + "n_results": {"type": "integer", "nullable": True}, # 3~10,默认 3 + } + output_type = "string" + + def forward(self, query: str, n_results: int = 3) -> str: + # POST http://127.0.0.1:{port}/retrieve + # payload: {"queries": [query], "topk": n_results, "return_scores": True} + # 返回 "Retrieved documents:\n\n[Document 0]\n<内容>..." 
+``` + +- `n_results` 自动钳位到 [3, 10] +- 返回值是拼接好的文档文本,agent 以 Observation 形式接收 +- 端口通过 `--retriever_port` 指定(默认 8005) + +### final_answer + +提交最终答案,结束当前任务。 + +```python +class FinalAnswerTool(Tool): + name = "final_answer" + inputs = {"answer": {"type": "any"}} + output_type = "any" + + def forward(self, answer: Any) -> Any: + return answer +``` + +### 工具注册 + +`register_acon_tools()` 将两个类注入到 `nexent.core.tools` 和 `nexent.core.agents.nexent_agent` 模块的命名空间,使 `NexentAgent.create_local_tool()` 能通过 `globals()` 找到它们。`get_acon_tool_configs(port)` 返回对应的 `ToolConfig` 列表。 + +```python +from tools import register_acon_tools, get_acon_tool_configs +register_acon_tools() +tools = get_acon_tool_configs(port="8005") +``` + +## 用法 + +```bash +# Baseline(不压缩) +python run_acon_qa.py \ + --data_folder ./data/nq_multi_8 \ + --split test \ + --mode baseline \ + --num_objectives 4 \ + --limit 10 + +# Context manager(开启压缩) +python run_acon_qa.py \ + --data_folder ./data/nq_multi_8 \ + --split test \ + --mode context_manager \ + --num_objectives 4 \ + --token_threshold 6000 \ + --keep_recent_pairs 1 \ + --keep_recent_steps 4 \ + --limit 10 +``` + +### 关键参数 + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--mode` | `baseline` | `baseline`(不压缩)或 `context_manager`(开启压缩) | +| `--num_objectives` | `8` | 每个样本使用的子问题数(1-8) | +| `--token_threshold` | `7200` | 触发压缩的 token 阈值 | +| `--keep_recent_pairs` | `1` | 保留不压缩的最近消息对数 | +| `--keep_recent_steps` | `4` | 保留不压缩的最近 agent 步数 | +| `--max_steps` | `30` | 每个样本的最大 agent 步数 | +| `--retriever_port` | `8005` | Retriever 服务端口 | +| `--limit` | 无 | 限制样本数量 | +| `--id_list_file` | 无 | 按指定 ID 列表过滤样本 | + +## 评估流程 + +1. **加载数据集** — `QALoader` 读取 JSONL,生成 `QAExample` 对象(id、question、answer) +2. **构建 agent** — nexent `CoreAgent` 配备 `wikipedia_search` + `final_answer` 工具,以及自定义 QA system prompt,强制按顺序回答子问题并使用 ANSWER_Q 标记 +3. **逐样本运行** — agent 回答所有子问题;最终答案按 `;` 拆分后进行逐子问题评分 +4. **评分** — SQuAD 风格的归一化 EM 和 max-F1,与 gold answer 变体对比 +5. 
**汇总指标** — token 消耗、步数,以及(context_manager 模式下)压缩 token 成本 + +### Context Manager 模式细节 + +在 `context_manager` 模式下,共享的 `ContextManager` 追踪对话 token 数,超过阈值时触发压缩。压缩使用自定义 JSON schema 追踪每个子问题的进度(status、search_counts、answers),确保 agent 不会丢失"哪些子问题已回答/已耗尽"的状态。 + +> **说明**:该测试场景下不存在 previous history,只有 current 场景下的多步累积。压缩发生在 agent 步数增长的过程中。 + +## 评分指标 + +评分逻辑见 `eval_utils.py`,采用 SQuAD 风格的归一化 EM 和 F1。 + +### 答案归一化(`_normalize_answer`) + +在比较前,预测和 gold answer 都会经过以下归一化流水线: + +1. **小写化** — 全部转为小写 +2. **去标点** — 移除所有英文标点符号 +3. **去冠词** — 移除 `a`/`an`/`the` +4. **空白归一** — 多个连续空白合并为一个空格 +5. **复数归一** — 长度 > 3 且以 `s` 结尾(非 `ss` 结尾)的单词,去掉末尾 `s`,统一单复数形式 + +例如 `"the Cats"` → `"cat"`,`"September 1980"` → `"september 1980"`。 + +### Exact Match (EM) + +```python +em = any(normalize(pred) == normalize(gold_variant) for gold_variant in gold_list) +``` + +- 预测与 **任一** gold 变体归一化后完全一致即判为正确(True/False) +- 每个子问题独立计算,最终 `avg_em = sum(em_list) / n_sub` + +### F1 + +```python +pred_tokens = normalize(pred).split() +gold_tokens = normalize(gold).split() + +precision = overlap / len(pred_tokens) +recall = overlap / len(gold_tokens) +f1 = 2 * precision * recall / (precision + recall) +``` + +- 在 token 级别计算 precision/recall,取调和平均 +- 对每个 gold 变体分别计算 F1,**取最大值**(`f1_max`) +- 每个子问题独立计算,最终 `avg_f1 = sum(f1_list) / n_sub` + +### 最终得分 + +```python +n_sub = len(gold_answer_list) # 子问题数 +em_score = sum(em_list) / n_sub # 0.0 ~ 1.0 +f1_score = sum(f1_list) / n_sub # 0.0 ~ 1.0 +``` + +如果预测的子答案数量不足,缺失部分补空字符串;超出则截断,始终与 gold 子问题数对齐。 + +## 输出格式 + +### `predictions.jsonl`(每行一个 JSON 对象) + +```json +{ + "id": "example_id", + "question": "子问题1; 子问题2; ...", + "answer": [["gold1_v1", "gold1_v2"], ["gold2"]], + "prediction": ["pred1", "pred2"], + "pred_raw": "pred1; pred2", + "em": 0.5, + "f1": 0.67, + "em_list": [true, false], + "f1_list": [0.8, 0.54], + "step_count": 12, + "errors": [], + "total_input_tokens": 45000, + "total_output_tokens": 1200, + "cm_stats": {...}, + "cm_token_counts": {...} +} +``` + +### `summary.json` + 
+```json +{ + "total": 100, + "avg_em": 0.42, + "avg_f1": 0.58, + "mode": "context_manager", + "split": "test", + "num_objectives": 4, + "avg_input_tokens": 38000, + "avg_output_tokens": 1100, + "total_compression_input_tokens": 120000, + "total_compression_output_tokens": 8000, + "timestamp": "2026-05-25T..." +} +``` + +## 设计要点 + +- **无 prior history** — 与 `manual_cases` 不同,没有预存对话历史。压缩在 agent 步数累积过程中发生。 +- **自定义 summary schema** — 摘要追踪每个子问题的状态(answers、status、search_counts),而非通用对话摘要,因为 agent 的任务是结构化的多问题 QA。 +- **逐子问题评分** — 预测按 `;` 拆分,每个子答案独立评分后取平均,可细粒度地检测多跳链中哪一环在压缩下断裂。 diff --git a/sdk/benchmark/acon_eval/acon_eval_res.md b/sdk/benchmark/acon_eval/acon_eval_res.md new file mode 100644 index 000000000..e4e5ab521 --- /dev/null +++ b/sdk/benchmark/acon_eval/acon_eval_res.md @@ -0,0 +1,57 @@ +# ACON Multi-Objective QA Evaluation Results + +## Experiment Setup + +- **Data**: nq_multi_8/test, 100 samples, 8 sub-questions per sample, max_steps=40 +- **baseline**: `token_threshold=10^9`, compression never triggers, agent sees full conversation history +- **context_manager**: `token_threshold=7200`, triggers compression when exceeded, `keep_recent_pairs=1`, `keep_recent_steps=4` + +## Results + +| Metric | baseline | context_manager | Delta | +|---|---|---|---| +| Avg EM | **38.25%** | 34.88% | -3.37pp | +| Avg F1 | **49.46%** | 46.15% | -3.31pp | +| Avg Input Tokens | 188,232 | 92,294 | **-51.0%** | +| Avg Output Tokens | 2,294 | 2,209 | -3.7% | +| Avg Steps | 22.7 | 21.0 | -1.7 | + +## Compression Overhead + +context_manager incurs additional LLM calls for compression: + +- Avg compression input per sample: **9,715 tokens** +- Avg compression output per sample: **511 tokens** +- Only ~10% of total input, cost is well justified + +## Compression Strategy Analysis + +### Why `keep_recent_steps=4` is reasonable + +- Each sub-question consumes 1-3 steps (1-3 searches), so a 4-step window covers the full trajectory of the current sub-question +- Global state (answers, status, 
search counts, next action) is carried by the summary JSON, serving as long-term memory +- The agent never "forgets" completed answers — the summary explicitly requires: "Treat ANSWER_Q marker as authoritative; never replace with null or Unknown" +- `agent_context.py:613` has a safety mechanism: if the boundary splits a tool_call + observation pair, it auto-extends to `keep_n + 1` + +### Summary JSON schema + +The summary tracks per-question state machine: + +- `answers[]` — canonical answer for each sub-question (or null) +- `status[]` — one of: unstarted, searching, answered, exhausted +- `search_counts[]` — count of wikipedia_search calls per question +- `current_q` — next question to solve +- `pending_q` — questions still unstarted or searching +- `next_action` — specific mechanical next step + +## Possible Causes of 3pp EM Drop + +`keep_recent_steps=4` is well-designed; the gap is more likely from summary quality than window size: + +1. **Summary LLM fidelity**: the LLM generating the summary may mis-record answer text or status, permanently losing information +2. **Cross-question search context loss**: the summary preserves only answer strings, not raw search observations — cross-question reuse of earlier search results is inherently lost with summarization +3. **Incremental update drift**: after 20+ incremental updates, the summary state may drift from the true trajectory + +## Summary + +Trading **51% token savings** for **3pp quality drop**. The compression window configuration is sound; optimization headroom lies in summary fidelity rather than window size. diff --git a/sdk/benchmark/acon_eval/dataset.py b/sdk/benchmark/acon_eval/dataset.py new file mode 100644 index 000000000..ce3280381 --- /dev/null +++ b/sdk/benchmark/acon_eval/dataset.py @@ -0,0 +1,79 @@ + +"""Dataset loader for ACON's 8-objective QA benchmark (nq_multi_8). + +Adapted from ACON's experiments/smolagents/dataset.py. +Supports JSONL format with fields: id, question, answer. 
def _normalize(self, raw: Dict[str, Any]) -> QAExample:
    """Convert one raw dataset record into a ``QAExample``.

    Several field spellings are accepted so that differently formatted
    dumps load without preprocessing:

    * id:       ``id`` / ``qid`` / ``question_id``
    * question: ``question`` / ``query``
    * answer:   ``answer`` / ``answers`` / ``final_answer``
    * contexts: ``contexts`` / ``supporting_facts``

    Args:
        raw: One decoded JSON object from the data file.

    Returns:
        A ``QAExample``; fields that are absent become ``""`` (id,
        question, answer) or ``None`` (contexts).
    """
    qid = ""
    for key in ("id", "qid", "question_id"):
        value = raw.get(key)
        # Only None / "" mean "missing". A plain `or` chain would also
        # discard a legitimate numeric id of 0 because it is falsy.
        if value is not None and value != "":
            qid = str(value)
            break

    question = raw.get("question") or raw.get("query") or ""

    answer = raw.get("answer")
    if answer is None:
        answer = raw.get("answers") or raw.get("final_answer") or ""

    contexts = raw.get("contexts") or raw.get("supporting_facts") or None
    return QAExample(id=qid, question=question, answer=answer, contexts=contexts)
def _normalize_answer(s: str) -> str:
    """Return the SQuAD-style normalized form of *s*, with plural folding.

    The steps run in this order: lowercase, strip ASCII punctuation, drop
    the articles ``a``/``an``/``the``, collapse whitespace, and finally
    strip a trailing ``s`` from words longer than three characters that do
    not end in ``ss``.
    """
    def lower(text: str) -> str:
        return text.lower()

    def remove_punc(text: str) -> str:
        return text.translate(str.maketrans('', '', string.punctuation))

    def remove_articles(text: str) -> str:
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text: str) -> str:
        return " ".join(text.split())

    def normalize_plurals(text: str) -> str:
        """Strip trailing 's' from words longer than 3 chars to unify singular/plural."""
        return " ".join(
            word[:-1] if len(word) > 3 and word.endswith("s") and not word.endswith("ss") else word
            for word in text.split()
        )

    return normalize_plurals(white_space_fix(remove_articles(remove_punc(lower(s)))))


def _f1_score(prediction: str, ground_truth: str) -> float:
    """Return the token-level F1 between two strings after normalization.

    Two empty token lists score 1.0; exactly one empty list scores 0.0.
    """
    pred_tokens = _normalize_answer(prediction).split()
    gold_tokens = _normalize_answer(ground_truth).split()
    if len(pred_tokens) == 0 and len(gold_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0

    # Multiset intersection: every predicted token can be matched at most
    # as many times as it occurs in the prediction.
    common: dict[str, int] = {}
    for t in pred_tokens:
        common[t] = common.get(t, 0) + 1
    overlap = 0
    for t in gold_tokens:
        if common.get(t, 0) > 0:
            overlap += 1
            common[t] -= 1
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


def exact_match(pred: Any, gold: Any) -> bool:
    """SQuAD-style normalized exact match.

    Args:
        pred: Predicted answer. A list/tuple is reduced to its first
            element (``""`` when empty); anything else goes through ``str``.
        gold: One gold answer, or a list/tuple of acceptable variants.

    Returns:
        True when the normalized prediction equals the normalized gold
        answer (or any gold variant). An empty variant list yields False.
    """
    def norm_one(x: Any) -> str:
        if isinstance(x, (list, tuple)):
            x = x[0] if x else ""
        return _normalize_answer(str(x))

    p = norm_one(pred)
    if isinstance(gold, (list, tuple)):
        # any() rather than max(): max() over an empty generator raises
        # ValueError, whereas f1_max already tolerates an empty gold list.
        return any(p == norm_one(g) for g in gold)
    return p == norm_one(gold)


def f1_max(pred: Any, gold: Any) -> float:
    """Max F1 over gold answer variants.

    ``pred`` of None is treated as the empty string. When *gold* is an
    empty list/tuple the score is 0.0.
    """
    p = str(pred) if pred is not None else ""
    if isinstance(gold, (list, tuple)):
        return max((_f1_score(p, str(g)) for g in gold), default=0.0)
    return _f1_score(p, str(gold))
def read_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at EOF) are skipped instead
        # of being handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]
class BaseRetriever:
    """Common interface for the retrievers defined in this module.

    The constructor copies the retrieval method, default top-k and the
    index/corpus paths from *config*. The public ``search`` and
    ``batch_search`` methods forward to ``_search`` / ``_batch_search``,
    which raise ``NotImplementedError`` here and must be overridden.
    """

    def __init__(self, config):
        # config must expose retrieval_method, retrieval_topk, index_path
        # and corpus_path attributes.
        self.config = config
        self.retrieval_method = config.retrieval_method
        self.topk = config.retrieval_topk

        self.index_path = config.index_path
        self.corpus_path = config.corpus_path

    def _search(self, query: str, num: Optional[int], return_score: bool):
        """Retrieve documents for one query; implemented by subclasses."""
        raise NotImplementedError

    def _batch_search(self, query_list: List[str], num: Optional[int], return_score: bool):
        """Retrieve documents for several queries; implemented by subclasses."""
        raise NotImplementedError

    def search(self, query: str, num: Optional[int] = None, return_score: bool = False):
        """Forward to ``_search``; ``num`` is passed through unchanged (None included)."""
        return self._search(query, num, return_score)

    def batch_search(self, query_list: List[str], num: Optional[int] = None, return_score: bool = False):
        """Forward to ``_batch_search``; ``num`` is passed through unchanged (None included)."""
        return self._batch_search(query_list, num, return_score)
LuceneSearcher + + if not os.path.exists(self.index_path): + raise FileNotFoundError(f"BM25 index path not found: {self.index_path}") + + self.searcher = LuceneSearcher(self.index_path) + self.contain_doc = self._check_contain_doc() + + if not self.contain_doc: + if not os.path.exists(self.corpus_path): + raise FileNotFoundError(f"Corpus file not found: {self.corpus_path}") + self.corpus = load_corpus(self.corpus_path) + + self.max_process_num = 8 + + def _check_contain_doc(self): + try: + doc = self.searcher.doc("0") or self.searcher.doc(0) + return doc is not None and doc.raw() is not None + except Exception: + return False + + def _search(self, query: str, num: int = None, return_score: bool = False): + if not query or not query.strip(): + return ([], []) if return_score else [] + + num = num or self.topk + hits = self.searcher.search(query, num) + + if not hits: + return ([], []) if return_score else [] + + scores = [hit.score for hit in hits] + + if len(hits) < num: + warnings.warn(f"Only retrieved {len(hits)} documents, fewer than requested topk={num}") + + if self.contain_doc: + results = [] + for hit in hits: + try: + raw = self.searcher.doc(hit.docid).raw() + obj = json.loads(raw) + content = obj.get("contents", "") + + lines = content.split("\n") + title = lines[0].strip("\"") if lines else "" + text = "\n".join(lines[1:]) if len(lines) > 1 else content + + results.append({ + "title": title, + "text": text, + "contents": content + }) + except Exception as e: + results.append({ + "title": "", + "text": "", + "contents": "", + "error": f"Failed to parse docid={hit.docid}: {str(e)}" + }) + else: + results = load_docs(self.corpus, [hit.docid for hit in hits]) + + return (results, scores) if return_score else results + + def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False): + if isinstance(query_list, str): + query_list = [query_list] + + results = [] + scores = [] + + for query in query_list: + item_result, item_score = 
class DenseRetriever(BaseRetriever):
    """Embedding-based retriever backed by a faiss index.

    Queries are embedded with ``Encoder`` and looked up in the index read
    from ``index_path``; the returned row ids are resolved to documents
    through the corpus loaded from ``corpus_path``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.index = faiss.read_index(self.index_path)
        if config.faiss_gpu:
            # Shard the index across all visible GPUs with fp16 storage.
            co = faiss.GpuMultipleClonerOptions()
            co.useFloat16 = True
            co.shard = True
            self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)

        self.corpus = load_corpus(self.corpus_path)
        self.encoder = Encoder(
            model_name=self.retrieval_method,
            model_path=config.retrieval_model_path,
            pooling_method=config.retrieval_pooling_method,
            max_length=config.retrieval_query_max_length,
            use_fp16=config.retrieval_use_fp16,
        )
        self.topk = config.retrieval_topk
        self.batch_size = config.retrieval_batch_size

    def _search(self, query: str, num: Optional[int] = None, return_score: bool = False):
        """Retrieve the top ``num`` documents for one query.

        ``num`` of None falls back to ``self.topk``. Returns the document
        list, or ``(documents, scores)`` when *return_score* is true.
        """
        if num is None:
            num = self.topk
        query_emb = self.encoder.encode(query)
        scores, idxs = self.index.search(query_emb, k=num)
        # encode() wraps a single string into a batch of one; unwrap row 0.
        # NOTE(review): faiss is documented to pad missing neighbours with
        # label -1, which load_docs would read as corpus[-1] — confirm
        # whether that can happen with this index.
        idxs = idxs[0]
        scores = scores[0]
        results = load_docs(self.corpus, idxs)
        if return_score:
            return results, scores.tolist()
        return results

    def _batch_search(self, query_list: List[str], num: Optional[int] = None, return_score: bool = False):
        """Retrieve the top ``num`` documents for every query in *query_list*.

        A bare string is treated as a one-element list and ``num`` of None
        falls back to ``self.topk``. Queries are encoded and searched in
        chunks of ``self.batch_size``. Returns one document list per query,
        or ``(documents, scores)`` when *return_score* is true.
        """
        if isinstance(query_list, str):
            query_list = [query_list]
        if num is None:
            num = self.topk

        results = []
        scores = []
        for start_idx in tqdm(range(0, len(query_list), self.batch_size), desc='Retrieval process: '):
            query_batch = query_list[start_idx:start_idx + self.batch_size]
            batch_emb = self.encoder.encode(query_batch)
            batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
            batch_scores = batch_scores.tolist()
            batch_idxs = batch_idxs.tolist()

            # Resolve all ids of the batch in one load_docs call. The flatten
            # is a single linear pass; sum(batch_idxs, []) re-copies the
            # accumulated list for every row and is quadratic.
            flat_idxs = [idx for row in batch_idxs for idx in row]
            batch_results = load_docs(self.corpus, flat_idxs)
            # Chunk the flat document list back into one slice per query.
            batch_results = [batch_results[i * num:(i + 1) * num] for i in range(len(batch_idxs))]

            results.extend(batch_results)
            scores.extend(batch_scores)

            # Drop per-batch references and release cached GPU memory before
            # the next batch is encoded.
            del batch_emb, batch_scores, batch_idxs, query_batch, flat_idxs, batch_results
            torch.cuda.empty_cache()

        if return_score:
            return results, scores
        return results
@app.post("/retrieve")
def retrieve_endpoint(request: QueryRequest):
    """
    Input:
    {
      "queries": ["What is Python?"],
      "topk": 3,
      "return_scores": true
    }

    Output: {"result": [...]} with one entry per query. With
    "return_scores" each entry is a list of {"document", "score"} objects,
    otherwise a list of documents. Failures are reported as
    {"result": [], "error": "<message>"}.
    """
    queries = request.queries
    if not queries:
        return {"result": [], "error": "queries cannot be empty"}

    # A missing (or zero) topk falls back to the server-wide default.
    limit = request.topk or config.retrieval_topk

    try:
        if not request.return_scores:
            docs_only = retriever.batch_search(
                query_list=queries,
                num=limit,
                return_score=False
            )
            return {"result": docs_only}

        per_query_docs, per_query_scores = retriever.batch_search(
            query_list=queries,
            num=limit,
            return_score=True
        )
        # Pair every document with its score, query by query.
        paired = [
            [{"document": doc, "score": score} for doc, score in zip(docs, doc_scores)]
            for docs, doc_scores in zip(per_query_docs, per_query_scores)
        ]
        return {"result": paired}

    except Exception as e:
        return {
            "result": [],
            "error": str(e)
        }
--num_objectives 2 to use only the first 2 sub-questions). + +Usage: + # Start ACON retriever server first: + # cd acon/experiments/smolagents/search && python retriever_server.py + # (or download the corpus and start it per ACON README) + + python run_acon_qa.py \ + --data_folder data/nq_multi_8 \ + --split test \ + --mode baseline \ + --num_objectives 4 \ + --limit 5 + +Results saved to outputs///summary.json + predictions.jsonl +""" +import argparse +import asyncio +import json +import os +import sys +import threading +from datetime import datetime +from typing import Optional + +# ---- Path setup ---- +# Robust path resolution via paths.py (.git discovery) — works regardless of file location +# 1. Add benchmark/ to sys.path so paths.py can be found +# 2. import paths triggers setup_paths() which adds sdk/, backend/ to sys.path +# 3. Add this directory for local module imports (dataset, eval_utils, tools) +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import paths # noqa: F401 — side-effect: adds sdk/, backend/ to sys.path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# ---- Register ACON tools into nexent namespace before any agent creation ---- +from tools import register_acon_tools, get_acon_tool_configs +register_acon_tools() + +from dataset import QALoader +from eval_utils import exact_match, f1_max + +from agent_runner import ( + build_agent_run_info_with_custom_prompt, + run_agent_with_tracking, + AgentRunResult, + ContextManagerConfig, +) + +from nexent.core.agents.agent_model import AgentHistory +from nexent.core.agents.agent_context import ContextManager + + +# ---- QA-specific system prompt builder ---- + +def build_qa_system_prompt(num_objectives: int) -> str: + answer_slots = "; ".join(f"answer{i}" for i in range(1, num_objectives + 1)) + + return f"""You are a multi-hop QA agent. The input contains multiple sub-questions separated by "; ". 
+Answer them sequentially by actually calling `wikipedia_search`, then call `final_answer`. + +# Tools +- `wikipedia_search(query: str, n_results: int = 3)` — searches the local 2018 Wikipedia retriever. +- `final_answer(answer: str)` — submits the final answer. + +# Mandatory Tool-Use Protocol +For every search, you must use a real code block: + + +result = wikipedia_search(query="...", n_results=3) +print(result) + + +Only an actual Observation produced after a `` block counts as evidence. +Do not write fake Search/Result text. + +# Core Rules +For each sub-question, in order: +1. Run one `wikipedia_search` call. +2. Read the actual Observation. +3. If the Observation clearly answers the sub-question, register the canonical answer and move to the next sub-question. +4. Do not run confirmation searches after finding a clear answer. +5. Use at most 3 searches per sub-question. +6. If the first 2 searches fail, the 3rd query must be broader and centered on the main entity/topic. +7. If 3 searches are exhausted, commit to the best candidate from observed results and move on. + +# Anti-Loop & Exhaustion Rules (CRITICAL — overriding priority) +- Track the exact count of wikipedia_search calls for the current sub-question. +- When count reaches 3, STOP searching immediately. Output ANSWER_Q: and move to the next question. No exceptions, no additional searches. +- If the last 2 searches returned completely irrelevant results (no mention of the target entity), the query angle is wrong. Do NOT search a third time with minor wording tweaks of the same query. Instead, search the main entity broadly (e.g. "Formula One history" instead of "chain F1"), or if already at 3, infer the best answer from any indirect clues in the observations and output ANSWER_Q. +- Self-check: if you catch yourself writing "I'm not finding it", "Perhaps", "Let me search for" or similar frustration phrases, you have already done enough searching. Output ANSWER_Q with your best inference immediately. 
+- After 3 searches, you already have your answer. Do NOT write "However", "But", "I'm not sure", "I'm not entirely sure", "Let me try one more", "Let me check directly", or any similar hesitation phrase. These words mean you have a candidate answer but are delaying. Output that candidate as ANSWER_Q right now and move on. Uncertainty is expected and acceptable — your best guess IS the answer. +- If the conversation contains a user message starting with "Summary of earlier steps in this task:", that message is an authoritative checkpoint of your progress. Before each search, check its JSON fields: "status", "search_counts", "pending_q", "next_action". If pending_q is empty and next_action says to call final_answer, call final_answer immediately — do not search again. If a question is marked "exhausted" in the summary, do not search it further. + +# Query Rules +- Prefer entity-focused queries, e.g. "Asha Bhosle Guinness", not "most prolific singer ever". +- Each query must be meaningfully different. +- Use `n_results=3` by default. + +# Answer Rules +- Use concise canonical answers: Wikipedia-title-like names or one-line factual answers. +- Keep modifiers only when needed for correctness. +- Do not include explanations, citations, dates, chapter/verse references, or extra context. +- Final answers must be separated by "; " in the original sub-question order. + +# Answer Registration — mandatory +Before moving from one question to the next, output exactly one plain-text marker: + +ANSWER_Q: + + +JUST Examples: +ANSWER_Q1: Eva Lund +ANSWER_Q2: September 1980 + +Rules: +- The marker is plain text, not a code block. +- If an Observation clearly answers Q, output `ANSWER_Q: `. +- After 3 searches, if there is any usable candidate in the Observations, output `ANSWER_Q: `. +- Never move to the next question without an ANSWER_Q marker for the current question. +- Use the registered ANSWER_Q markers to construct the final answer. 
+ +# Final Answer +Before calling `final_answer`, count your answers. +The final answer must contain exactly one answer per sub-question. +Never submit a partial answer. + +Use a real code block: + + +final_answer(answer="{answer_slots}") + + +Start answering the real questions, starting with obtaining ANSWER_Q1. +""" + +def _sanitize_for_path(name: str) -> str: + return ''.join(ch if ch.isalnum() or ch in ('-', '_', '.') else '-' for ch in name) + + +async def run_sample( + ex, + max_steps: int, + retriever_port: str, + mode: str, + cm_config: Optional[ContextManagerConfig], + debug: bool, + system_prompt: str, +) -> dict: + """Run a single QA example through the nexent agent.""" + tools = get_acon_tool_configs(port=retriever_port) + + agent_run_info = build_agent_run_info_with_custom_prompt( + query=ex.question, + system_prompt=system_prompt, + history=[], + tools=tools, + max_steps=max_steps, + agent_name="acon_qa_agent", + agent_description="ACON multi-objective QA agent", + language="en", + context_manager_config=cm_config, + temperature=0 + ) + + # Attach shared ContextManager if mode is context_manager + shared_cm = None + if mode == "context_manager" and cm_config and cm_config.enabled: + shared_cm = ContextManager(config=cm_config, max_steps=max_steps) + agent_run_info.context_manager = shared_cm + + result = await run_agent_with_tracking(agent_run_info, debug=debug) + pred_raw = result.final_answer or "" + + # Score: split prediction by semicolons, compare to gold answer list + pred_list = [p.strip() for p in pred_raw.split(";")] + + # Pad or truncate predictions to match number of gold sub-answers + n_sub = len(ex.answer) + while len(pred_list) < n_sub: + pred_list.append("") + pred_list = pred_list[:n_sub] + + em_list = [exact_match(p, a) for p, a in zip(pred_list, ex.answer)] + f1_list = [f1_max(p, a) for p, a in zip(pred_list, ex.answer)] + + em_score = sum(em_list) / n_sub if n_sub else 0.0 + f1_score = sum(f1_list) / n_sub if n_sub else 0.0 + + 
def _build_context_manager_config(
    mode: str,
    token_threshold: int,
    keep_recent_pairs: int,
    keep_recent_steps: int,
    max_observation_length: int,
):
    """Return the ContextManagerConfig for *mode*.

    Any mode other than "context_manager" gets a disabled config (baseline).
    In "context_manager" mode the config carries a QA-specific checkpoint
    schema and the two summary system prompts defined below.
    """
    if mode != "context_manager":
        # baseline: no compression
        return ContextManagerConfig(enabled=False, token_threshold=10**9)

    # Custom summary JSON schema that emphasizes task progress tracking
    custom_summary_schema = {
        "n_questions": "Total number of sub-questions.",
        "answers": (
            "Ordered list of final-answer candidates. Length must equal n_questions. "
            "Each item is either an exact canonical answer string or 'Unknown'. "
        ),
        "status": (
            "Array of length n_questions. Each item must be one of: "
            "'unstarted', 'searching', 'answered', 'exhausted'. "
            # Wording aligned with the STATE RULES of the summary prompt below.
            "'answered' requires a non-null answer other than 'Unknown'. "
            "'exhausted' requires an answer inferred from the observations, or 'Unknown'."
        ),
        "search_counts": (
            "Array of integers of length n_questions. "
            "Count only actual wikipedia_search calls."
        ),
        "current_q": (
            "The 1-based index of the next question to solve. "
            "Usually the first index whose status is not 'answered' or 'exhausted'."
        ),
        "pending_q": (
            "List of question numbers whose status is 'unstarted' or 'searching'. "
            "Do not include answered or exhausted questions."
        ),
        "next_action": (
            "One direct mechanical next step. Example: "
            "'Run wikipedia_search for Q5: Ash Wednesday ashes palm leaves'."
        ),
    }

    # Custom summary system prompt that emphasizes multi-question task tracking
    custom_incremental_summary_system_prompt = (
        "Update the compact QA checkpoint based on the latest agent action. "
        "Output only strict JSON matching the schema. No markdown.\n\n"
        # The trailing newline keeps this sentence from fusing with the heading.
        "Treat ANSWER_Q: ... marker as authoritative; never replace with null or Unknown.\n"
        "INCREMENTAL UPDATE RULES:\n"
        "- Preserve all answered values; never downgrade them to null or 'Unknown'.\n"
        "- If the latest action executed wikipedia_search, increment only that question's search_counts entry.\n"
        "- If the latest observation clearly answers the current question, write the canonical answer into answers and set status to 'answered'.\n"
        "- ENFORCEMENT: If any search_counts reaches >=3, its status MUST be 'exhausted' (NEVER 'searching'). "
        "Set its answer to the best observed candidate, or 'Unknown' if nothing was useful. "
        "An exhausted question must be REMOVED from pending_q.\n"
        "- current_q must advance past any exhausted question to the next unstarted/searching question.\n"
        "- If ALL questions are answered or exhausted, set next_action to 'Call final_answer with the collected answers'.\n"
        "- NEVER set next_action to search a question whose search_counts is already >=3.\n"
        "- Otherwise, leave answer as null and status as 'searching'.\n"
        "- pending_q must contain exactly the question numbers with status 'unstarted' or 'searching'.\n"
        "- Overwrite the old state completely. Do not append logs, snippets, or history."
    )

    custom_summary_system_prompt = (
        "You are creating a compact execution checkpoint for a sequential multi-question QA agent. "
        "Output only strict JSON matching the schema. No markdown, greetings, or backticks.\n\n"
        "Treat ANSWER_Q: ... marker as authoritative; never replace an ANSWER_Q value with null or Unknown.\n"
        "STATE RULES:\n"
        "- Preserve exact canonical answer strings when explicitly available.\n"
        "- answers, status, and search_counts must all have length n_questions.\n"
        "- status must be consistent with answers and search_counts:\n"
        " * unstarted => answer is null, search_counts is 0\n"
        " * searching => answer is null, search_counts is 1 or 2\n"
        " * answered => answer is non-null canonical string, search_counts is 1-3\n"
        " * exhausted => search_counts is >=3, answer is best inference or 'Unknown'\n"
        "- A question with search_counts >=3 must have status 'exhausted', never 'searching'.\n"
        "- pending_q must contain exactly the question numbers with status 'unstarted' or 'searching'.\n"
        "- current_q should be the first question in pending_q.\n"
        "- If all questions are answered or exhausted, set next_action to 'Call final_answer'.\n\n"

        "COMPACTION RULES:\n"
        "- Strip raw search logs, snippets, long reasons, file status, and failed query history.\n"
        "- Count every wikipedia_search call visible in the trajectory for each question.\n"
        "- Keep the checkpoint short and stable. Do not append history."
    )

    return ContextManagerConfig(
        enabled=True,
        token_threshold=token_threshold,
        keep_recent_pairs=keep_recent_pairs,
        keep_recent_steps=keep_recent_steps,
        max_observation_length=max_observation_length,
        summary_json_schema=custom_summary_schema,
        summary_system_prompt=custom_summary_system_prompt,
        incremental_summary_system_prompt=custom_incremental_summary_system_prompt,
    )


async def main(
    data_folder: str,
    split: str,
    mode: str,
    max_steps: int,
    limit: Optional[int],
    retriever_port: str,
    token_threshold: int,
    keep_recent_pairs: int,
    keep_recent_steps: int,
    max_observation_length: int,
    debug: bool,
    output_dir: Optional[str],
    id_list_file: Optional[str],
    num_objectives: int,
):
    """Run the QA benchmark over one split and write the results to disk.

    Loads ``train.jsonl`` or ``test.jsonl`` from *data_folder*, optionally
    filters by the IDs in *id_list_file*, truncates each sample to
    *num_objectives* sub-questions, runs every sample through run_sample, and
    writes ``summary.json`` plus ``predictions.jsonl`` under
    ``<output root>/<mode>/<split>``.

    Args:
        data_folder: Directory containing train.jsonl / test.jsonl.
        split: Split name; "dev", "validation" and "val" are treated as "test",
            and anything other than "train" reads test.jsonl.
        mode: "context_manager" enables the ContextManager; any other value
            runs with compression disabled.
        max_steps: Step budget passed to each agent run.
        limit: Maximum number of examples (None means all).
        retriever_port: Port of the retriever server, passed to the tools.
        token_threshold: ContextManager token threshold (context_manager mode).
        keep_recent_pairs: ContextManager setting (context_manager mode).
        keep_recent_steps: ContextManager setting (context_manager mode).
        max_observation_length: ContextManager setting (context_manager mode).
        debug: Forwarded to the agent runner.
        output_dir: Output root; defaults to ``outputs`` next to this file.
        id_list_file: Optional file with one example ID per line; blank lines
            and lines starting with '#' are skipped.
        num_objectives: Number of sub-questions kept per sample (must be >= 1).

    Returns:
        None. Returns early, after printing an error, when num_objectives is
        invalid or the data file does not exist.
    """
    # A value below 1 would silently produce empty questions (0) or drop
    # sub-questions from the end (negative slice), so reject it up front.
    if num_objectives < 1:
        print(f"ERROR: --num_objectives must be >= 1, got {num_objectives}")
        return

    # Resolve data path
    split_key = (split or "test").lower()
    if split_key in {"dev", "validation", "val"}:
        split_key = "test"
    fname = "train.jsonl" if split_key == "train" else "test.jsonl"
    data_path = os.path.join(data_folder, fname)

    if not os.path.exists(data_path):
        print(f"ERROR: Data file not found: {data_path}")
        print(" Make sure to point --data_folder to ACON's nq_multi_8 directory,")
        print(" e.g., D:/path/to/acon/experiments/smolagents/data/nq_multi_8")
        return

    loader = QALoader(data_path)

    # Optional ID filtering
    filter_ids = None
    if id_list_file:
        if os.path.exists(id_list_file):
            with open(id_list_file, "r", encoding="utf-8") as f:
                stripped_lines = (line.strip() for line in f)
                filter_ids = {s for s in stripped_lines if s and not s.startswith("#")}
        else:
            # Still run unfiltered (as before), but make the fallback visible.
            print(f"WARNING: --id_list_file not found, running without ID filter: {id_list_file}")

    # Materialize the examples to evaluate
    if filter_ids is not None:
        examples = [ex for ex in loader.iter(limit=None) if ex.id in filter_ids]
        if limit is not None:
            examples = examples[:limit]
    else:
        examples = list(loader.iter(limit=limit))
    total_count = len(examples)

    # Truncate sub-questions if num_objectives < 8
    if num_objectives < 8:
        for ex in examples:
            q_parts = [q.strip() for q in ex.question.split(";")]
            ex.question = "; ".join(q_parts[:num_objectives])
            ex.answer = ex.answer[:num_objectives]

    # Build QA-specific system prompt with dynamic answer slots
    qa_system_prompt = build_qa_system_prompt(num_objectives)

    # ContextManager config based on mode
    is_cm_mode = mode == "context_manager"
    cm_config = _build_context_manager_config(
        mode=mode,
        token_threshold=token_threshold,
        keep_recent_pairs=keep_recent_pairs,
        keep_recent_steps=keep_recent_steps,
        max_observation_length=max_observation_length,
    )

    # Output directory
    if output_dir is None:
        acon_eval_dir = os.path.dirname(os.path.abspath(__file__))
        outputs_root = os.path.join(acon_eval_dir, "outputs")
    else:
        outputs_root = output_dir

    mode_part = _sanitize_for_path(mode)
    split_part = _sanitize_for_path(split_key)
    out_dir = os.path.join(outputs_root, mode_part, split_part)
    os.makedirs(out_dir, exist_ok=True)

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"ACON {num_objectives}-Objective QA Evaluation (nexent agent)")
    print(rule)
    print(f" Data: {data_path}")
    print(f" Split: {split_key}")
    print(f" Mode: {mode}")
    print(f" Num objectives: {num_objectives}")
    print(f" Max steps: {max_steps}")
    print(f" Limit: {limit or 'all'}")
    print(f" Total: {total_count}")
    print(f" Retriever: 127.0.0.1:{retriever_port}")
    if is_cm_mode:
        print(f" CM config: threshold={token_threshold}, keep_recent_pairs={keep_recent_pairs}, "
              f"keep_recent_steps={keep_recent_steps}, max_obs_len={max_observation_length}")
    print(f" Output: {out_dir}")
    print(f"{rule}\n")

    n = 0
    em_sum = 0.0
    f1_sum = 0.0
    all_rows = []

    for ex in examples:
        print(f"[{n+1}/{total_count}] {ex.id[:40]}...", end=" ", flush=True)

        try:
            sample_result = await run_sample(
                ex=ex,
                max_steps=max_steps,
                retriever_port=retriever_port,
                mode=mode,
                cm_config=cm_config,
                debug=debug,
                system_prompt=qa_system_prompt,
            )
            em_score = sample_result["em_score"]
            f1_score = sample_result["f1_score"]
            print(f"EM={em_score:.2f} F1={f1_score:.2f} steps={sample_result['step_count']}")
        except Exception as e:
            # Best effort: a failing sample is scored as zero so the rest of
            # the benchmark still runs; the error text is kept in the row.
            print(f"ERROR: {e}")
            em_score = 0.0
            f1_score = 0.0
            sample_result = {
                "pred_raw": "",
                "pred_list": [],
                "em_score": 0.0,
                "f1_score": 0.0,
                "em_list": [],
                "f1_list": [],
                "step_count": 0,
                "errors": [str(e)],
                "total_input_tokens": 0,
                "total_output_tokens": 0,
                "cm_stats": None,
                "cm_token_counts": None,
            }

        em_sum += em_score
        f1_sum += f1_score
        n += 1

        all_rows.append({
            "id": ex.id,
            "question": ex.question,
            "answer": ex.answer,
            "prediction": sample_result["pred_list"],
            "pred_raw": sample_result["pred_raw"],
            "em": em_score,
            "f1": f1_score,
            "em_list": sample_result["em_list"],
            "f1_list": sample_result["f1_list"],
            "step_count": sample_result["step_count"],
            "errors": sample_result["errors"],
            "total_input_tokens": sample_result["total_input_tokens"],
            "total_output_tokens": sample_result["total_output_tokens"],
            "cm_stats": sample_result.get("cm_stats"),
            "cm_token_counts": sample_result.get("cm_token_counts"),
        })

    # Token aggregates
    total_input_tokens = sum(row["total_input_tokens"] for row in all_rows)
    total_output_tokens = sum(row["total_output_tokens"] for row in all_rows)
    avg_input_tokens = (total_input_tokens / n) if n else 0.0
    avg_output_tokens = (total_output_tokens / n) if n else 0.0

    # Compression cost aggregate (context_manager mode only)
    total_compression_input_tokens = 0
    total_compression_output_tokens = 0
    for row in all_rows:
        cm_stats = row.get("cm_stats")
        if cm_stats:
            total_compression_input_tokens += cm_stats.get("total_input_tokens", 0)
            total_compression_output_tokens += cm_stats.get("total_output_tokens", 0)
    avg_compression_input_tokens = (total_compression_input_tokens / n) if n else 0.0
    avg_compression_output_tokens = (total_compression_output_tokens / n) if n else 0.0

    # Summary (context-manager-only fields are None in baseline mode)
    summary = {
        "total": n,
        "avg_em": (em_sum / n) if n else 0.0,
        "avg_f1": (f1_sum / n) if n else 0.0,
        "mode": mode,
        "split": split_key,
        "num_objectives": num_objectives,
        "data_path": data_path,
        "max_steps": max_steps,
        "token_threshold": token_threshold if is_cm_mode else None,
        "keep_recent_pairs": keep_recent_pairs if is_cm_mode else None,
        "keep_recent_steps": keep_recent_steps if is_cm_mode else None,
        "avg_input_tokens": avg_input_tokens,
        "avg_output_tokens": avg_output_tokens,
        "total_input_tokens": total_input_tokens,
        "total_output_tokens": total_output_tokens,
        "total_compression_input_tokens": total_compression_input_tokens if is_cm_mode else None,
        "total_compression_output_tokens": total_compression_output_tokens if is_cm_mode else None,
        "avg_compression_input_tokens": avg_compression_input_tokens if is_cm_mode else None,
        "avg_compression_output_tokens": avg_compression_output_tokens if is_cm_mode else None,
        "timestamp": datetime.now().isoformat(),
    }

    # Save results
    with open(os.path.join(out_dir, "summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    with open(os.path.join(out_dir, "predictions.jsonl"), "w", encoding="utf-8") as f:
        for row in all_rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"\n{rule}")
    print("Results Summary")
    print(rule)
    print(f" Mode: {mode}")
    print(f" Total: {n}")
    print(f" Avg EM: {em_sum/n*100:.1f}% ({em_sum:.2f}/{n})" if n else " Avg EM: N/A")
    print(f" Avg F1: {f1_sum/n:.3f}" if n else " Avg F1: N/A")
    print(f" Avg Input Tokens: {avg_input_tokens:,.0f}")
    print(f" Avg Output Tokens: {avg_output_tokens:,.0f}")
    if is_cm_mode:
        print(f" Avg Compression Input Tokens: {avg_compression_input_tokens:,.0f}")
        print(f" Avg Compression Output Tokens: {avg_compression_output_tokens:,.0f}")
    print(f" Output: {out_dir}")
    print(f"{rule}\n")
class WikipediaSearchTool(Tool):
    """Tool that queries the local retriever server's ``/retrieve`` endpoint.

    The tool posts a single query to ``http://127.0.0.1:<port>/retrieve`` and
    formats the returned documents as one text block.
    """

    name = "wikipedia_search"
    description = (
        "Uses semantic search to retrieve the parts of 2018 wikipedia "
        "that could be most relevant to answer your query."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": (
                "The query to perform. This should be semantically close to "
                "your target documents. Use the affirmative form rather than "
                "a question."
            ),
        },
        "n_results": {
            "type": "integer",
            "nullable": True,
            "description": "The number of results to return. Minimum is 3. Maximum is 10.",
        },
    }
    output_type = "string"

    # Bounds advertised in the `n_results` description above.
    _MIN_RESULTS = 3
    _MAX_RESULTS = 10
    # Upper bound on one retriever call so an unresponsive server cannot hang
    # a benchmark run forever. NOTE(review): 60s is a guess - tune to the index.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, port: str = "8005", **kwargs):
        """Store the retriever port and derive the endpoint URL.

        Args:
            port: Port of the retriever server on 127.0.0.1.
            **kwargs: Accepted for call compatibility; not used.
        """
        super().__init__()
        self.port = port
        self.url = f"http://127.0.0.1:{self.port}/retrieve"

    def forward(self, query: str, n_results: int = 3) -> str:
        """Search the retriever and return the hits as formatted text.

        Args:
            query: Search query string.
            n_results: Requested number of documents; None means the minimum,
                and the value is clamped to the range 3..10.

        Returns:
            "Retrieved documents:" followed by one "[Document i]" section per
            hit (0-based), each holding the hit's ``document.contents``.

        Raises:
            TypeError: If *query* is not a string.
            requests.HTTPError: If the server answers with an error status.
            RuntimeError: If the server returns no result list for the query.
        """
        # Validate explicitly; an `assert` would vanish under `python -O`.
        if not isinstance(query, str):
            raise TypeError("Your search query must be a string")

        # `n_results` is declared nullable, so an explicit None must not reach
        # the comparisons below.
        if n_results is None:
            n_results = self._MIN_RESULTS
        n_results = max(self._MIN_RESULTS, min(self._MAX_RESULTS, n_results))

        payload = {
            "queries": [query],
            "topk": n_results,
            "return_scores": True,
        }

        response = requests.post(
            self.url, json=payload, timeout=self._REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()

        retrieved_data = response.json()
        per_query_results = retrieved_data.get("result") or []
        if not per_query_results:
            # The retriever server reports failures with HTTP 200, an empty
            # "result" and an "error" string; surface that message instead of
            # failing with a bare IndexError.
            error_text = retrieved_data.get("error", "no result returned")
            raise RuntimeError(f"Retriever returned no results: {error_text}")
        docs = per_query_results[0]

        return "Retrieved documents:" + "".join(
            f"\n\n[Document {str(i)}]\n" + doc["document"]["contents"]
            for i, doc in enumerate(docs)
        )
def get_acon_tool_configs(port: str = "8005") -> list[ToolConfig]:
    """Assemble the tool configs used by the ACON QA benchmark.

    Args:
        port: Forwarded to build_wikipedia_search_tool_config.

    Returns:
        A two-item list: the wikipedia_search config followed by the
        final_answer config.
    """
    search_config = build_wikipedia_search_tool_config(port=port)
    answer_config = build_final_answer_tool_config()
    return [search_config, answer_config]
Message-stream processing and statistics +""" +import sys +import io +import json +import os +import re +from datetime import datetime +from typing import AsyncIterator, Callable, Optional + +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +from jinja2 import Template, StrictUndefined +from smolagents.utils import BASE_BUILTIN_MODULES +from dotenv import load_dotenv +import string + +# ============ Environment Setup ============ +# Add parent directory to sys.path so paths.py can be found, then import it. +# paths.py resolves PROJECT_ROOT/SDK_DIR/BACKEND_DIR via .git discovery and +# injects them into sys.path automatically — no manual path manipulation needed. +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import paths # noqa: F401 — side-effect: adds sdk/, backend/ to sys.path + +from utils.prompt_template_utils import get_agent_prompt_template +from nexent.core.agents.agent_model import ( + AgentRunInfo, AgentConfig, ModelConfig, AgentHistory, ToolConfig +) + + + +from nexent.core.agents.run_agent import agent_run +from nexent.core.utils.observer import MessageObserver +from nexent.core.agents.agent_context import ContextManagerConfig +import logging +logging.getLogger("smolagents").setLevel(logging.WARNING) +import random +load_dotenv() + +# ============ Global Configuration ============ +LLM_API_KEY = os.getenv("LLM_API_KEY") +LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME") +LLM_API_URL = os.getenv("LLM_API_URL") + +# Disable model thinking for benchmark runs. Both vendor dialects are kept in +# one payload so the same agent_runner.py works against either backend without +# code changes: Qwen-on-vLLM/SGLang reads `chat_template_kwargs.enable_thinking` +# and ignores `thinking`; Anthropic reads `thinking.type` and ignores +# `chat_template_kwargs`. Unknown keys are silently dropped by each provider. 
+THINKING_OFF_EXTRA_BODY = { + "chat_template_kwargs": {"enable_thinking": False}, + "thinking": {"type": "disabled"}, +} + +APP_NAME = os.getenv("APP_NAME", "Nexent") +APP_DESCRIPTION = os.getenv("APP_DESCRIPTION", "Nexent is an open-source agent SDK and platform") + +# ============ Default Prompt Templates ============ +DEFAULT_DUTY_PROMPT = """You are an intelligent assistant focused on helping users solve problems. You need to: +1. Understand the user's needs and provide accurate answers +2. Maintain a friendly and professional attitude +3. Remember key information from the conversation""" + +DEFAULT_CONSTRAINT_PROMPT = """1. Do not generate harmful content +2. Comply with laws and regulations +3. Be honest with users when uncertain""" + +DEFAULT_FEW_SHOTS_PROMPT = "" + +DEFAULT_FALLBACK_PROMPT = """You are a helpful AI assistant that can help users solve various problems. Please remember important information from the conversation.""" + +# ============ Message Type Constants ============ +TRACKED_MESSAGE_TYPES = { + "agent_new_run", # task start + "step_count", # step count + "model_output_thinking", # thinking process + "model_output", # model output + "code_output", # code execution result + "final_answer", # final answer + "error", # error + "token_count", # per-step token usage stats +} + + +# ============ Prompt Construction Functions ============ + +def build_system_prompt( + duty: str = "", + constraint: str = "", + few_shots: str = "", + tools: list = None, + managed_agents: list = None, + memory_list: list = None, + knowledge_base_summary: str = "", + language: str = "zh", + is_manager: bool = False, + user_id: str = "", + skills: list = None +) -> str: + """ + Build System Prompt + + Args: + duty: Duty description + constraint: Constraints + few_shots: Few-shot examples + tools: Tool list + managed_agents: Managed sub-agent list + memory_list: Memory list + knowledge_base_summary: Knowledge base summary + language: Language (zh/en) + is_manager: 
def build_agent_run_info(
    query: str,
    history: list[AgentHistory],
    duty_prompt: str = "",
    constraint_prompt: str = "",
    few_shots_prompt: str = "",
    fallback_prompt: str = "",
    tools: list = None,
    managed_agents: list = None,
    max_steps: int = 10,
    temperature: float = 0.1,
    agent_name: str = "test_agent",
    agent_description: str = "Test Agent",
    language: str = "zh",
    is_manager: bool = False,
    context_manager_config: Optional[ContextManagerConfig] = None,
    user_id: str = "",
    skills: list = None,
    max_tokens: Optional[int] = None,
) -> AgentRunInfo:
    """
    Assemble an AgentRunInfo whose system prompt is rendered from the template.

    Args:
        query: User query
        history: Conversation history
        duty_prompt: Duty prompt (empty falls back to DEFAULT_DUTY_PROMPT)
        constraint_prompt: Constraint prompt (empty falls back to DEFAULT_CONSTRAINT_PROMPT)
        few_shots_prompt: Few-shot prompt (empty falls back to DEFAULT_FEW_SHOTS_PROMPT)
        fallback_prompt: Fallback prompt (empty falls back to DEFAULT_FALLBACK_PROMPT)
        tools: Tool list (None is treated as an empty list)
        managed_agents: Managed sub-agent list (None is treated as an empty list)
        max_steps: Max execution steps
        temperature: Temperature parameter passed to the model config
        agent_name: Agent name
        agent_description: Agent description
        language: Language used for the prompt template and the observer
        is_manager: Whether this is a manager agent
        context_manager_config: Context manager config, passed through unchanged
        user_id: User ID rendered into the system prompt
        skills: Skill list rendered into the system prompt
        max_tokens: Per-call completion output cap forwarded to the main LLM.
                    None leaves the provider default; benchmarks that want to
                    bound runaway outputs set this explicitly (e.g. 4096).

    Returns:
        AgentRunInfo object with a fresh MessageObserver and stop event
    """
    import threading

    # Empty inputs fall back to the module-level defaults.
    effective_duty = duty_prompt if duty_prompt else DEFAULT_DUTY_PROMPT
    effective_constraint = constraint_prompt if constraint_prompt else DEFAULT_CONSTRAINT_PROMPT
    effective_few_shots = few_shots_prompt if few_shots_prompt else DEFAULT_FEW_SHOTS_PROMPT
    effective_fallback = fallback_prompt if fallback_prompt else DEFAULT_FALLBACK_PROMPT
    tool_list = tools or []
    sub_agents = managed_agents or []

    main_model = ModelConfig(
        cite_name="main_model",
        api_key=LLM_API_KEY,
        model_name=LLM_MODEL_NAME,
        url=LLM_API_URL,
        temperature=temperature,
        ssl_verify=False,
        extra_body=THINKING_OFF_EXTRA_BODY,
        max_tokens=max_tokens,
    )

    # NOTE(review): the defaults are applied before this check, so as long as
    # DEFAULT_DUTY_PROMPT is non-empty the fallback branch cannot be reached.
    # Kept as-is to preserve behavior - confirm the intended semantics.
    no_prompt_sections = not (effective_duty or effective_constraint or effective_few_shots)
    if no_prompt_sections:
        rendered_prompt = effective_fallback
    else:
        rendered_prompt = build_system_prompt(
            duty=effective_duty,
            constraint=effective_constraint,
            few_shots=effective_few_shots,
            tools=tool_list,
            managed_agents=sub_agents,
            memory_list=[],
            knowledge_base_summary="",
            language=language,
            is_manager=is_manager,
            user_id=user_id,
            skills=skills
        )

    templates = build_prompt_templates(
        rendered_prompt,
        language=language,
        is_manager=is_manager
    )

    config = AgentConfig(
        name=agent_name,
        description=agent_description,
        tools=tool_list,
        max_steps=max_steps,
        model_name="main_model",
        prompt_templates=templates,
        managed_agents=sub_agents,
        context_manager_config=context_manager_config
    )

    return AgentRunInfo(
        query=query,
        model_config_list=[main_model],
        observer=MessageObserver(lang=language),
        agent_config=config,
        mcp_host=None,
        history=history,
        stop_event=threading.Event(),
    )
def build_agent_run_info_with_custom_prompt(
    query: str,
    system_prompt: str,
    history: list[AgentHistory],
    tools: list = None,
    managed_agents: list = None,
    max_steps: int = 10,
    temperature: float = 0.1,
    agent_name: str = "test_agent",
    agent_description: str = "Test Agent",
    language: str = "en",
    is_manager: bool = False,
    context_manager_config: Optional[ContextManagerConfig] = None,
    max_tokens: Optional[int] = None,
) -> AgentRunInfo:
    """
    Build AgentRunInfo with a pre-rendered system prompt string.

    Unlike build_agent_run_info, which renders the system prompt via the
    Jinja2 template, this function installs the given system prompt directly
    into the prompt templates, bypassing template rendering. Use this for
    benchmark scenarios that need a specialized prompt without the standard
    platform scaffolding.

    Args:
        query: User query
        system_prompt: Pre-rendered system prompt string (used as-is)
        history: Conversation history
        tools: Tool list (None is treated as an empty list)
        managed_agents: Managed sub-agents (None is treated as an empty list)
        max_steps: Max execution steps
        temperature: Temperature parameter
        agent_name: Agent name
        agent_description: Agent description
        language: Language; also used for the MessageObserver
        is_manager: Whether this is a manager agent
        context_manager_config: Context manager config, passed through as-is
        max_tokens: Optional per-call completion output cap for the main
            model, mirroring build_agent_run_info. When None (default) the
            argument is not passed to ModelConfig at all, so ModelConfig's own
            default applies exactly as before this parameter existed.

    Returns:
        AgentRunInfo object
    """
    import threading

    tools = tools or []
    managed_agents = managed_agents or []

    # Forward the cap only when the caller asked for one, so existing callers
    # keep whatever default ModelConfig declares for max_tokens.
    optional_model_kwargs = {}
    if max_tokens is not None:
        optional_model_kwargs["max_tokens"] = max_tokens

    model_config = ModelConfig(
        cite_name="main_model",
        api_key=LLM_API_KEY,
        model_name=LLM_MODEL_NAME,
        url=LLM_API_URL,
        temperature=temperature,
        ssl_verify=False,
        extra_body=THINKING_OFF_EXTRA_BODY,
        **optional_model_kwargs,
    )

    prompt_templates = build_prompt_templates(
        system_prompt,
        language=language,
        is_manager=is_manager,
    )

    agent_config = AgentConfig(
        name=agent_name,
        description=agent_description,
        tools=tools,
        max_steps=max_steps,
        model_name="main_model",
        prompt_templates=prompt_templates,
        managed_agents=managed_agents,
        context_manager_config=context_manager_config,
    )

    return AgentRunInfo(
        query=query,
        model_config_list=[model_config],
        observer=MessageObserver(lang=language),
        agent_config=agent_config,
        mcp_host=None,
        history=history,
        stop_event=threading.Event(),
    )
def process_agent_message(chunk: str) -> tuple[str, str]:
    """
    Parse JSON message returned by agent_run

    The chunk is expected to be a JSON object carrying ``"type"`` and
    ``"content"`` keys. Anything else is treated as raw text and returned
    unchanged with an empty type, so callers never have to special-case
    malformed chunks.

    Args:
        chunk: JSON string

    Returns:
        (message_type, message_content) tuple. ``("", chunk)`` when the chunk
        is not valid JSON or decodes to something other than a JSON object.
        A missing or ``null`` type / content is returned as ``""``.
    """
    try:
        data = json.loads(chunk)
    except json.JSONDecodeError:
        return "", chunk

    # Valid JSON is not necessarily an object: "[1, 2]", "42" or "null" decode
    # to values without .get(); treat them like undecodable raw text.
    if not isinstance(data, dict):
        return "", chunk

    msg_type = data.get("type", "")
    msg_content = data.get("content", "")
    # An explicit JSON null would otherwise leak None to callers that apply
    # len() / string concatenation to the result.
    return (
        "" if msg_type is None else msg_type,
        "" if msg_content is None else msg_content,
    )


class AgentRunResult:
    """Agent run result wrapper"""

    def __init__(self):
        # Content of the most recent final_answer message.
        self.final_answer: str = ""
        # Concatenation of message content collected during the run.
        self.full_response: str = ""
        # Message type -> number of messages seen with that type.
        self.message_type_count: dict = {}
        self.step_count: int = 0
        # Content of every error message, in arrival order.
        self.errors: list = []
        # Token usage summed over token_count messages.
        self.total_input_tokens: int = 0
        self.total_output_tokens: int = 0

    def __repr__(self):
        return f"AgentRunResult(final_answer_len={len(self.final_answer)}, " \
               f"steps={self.step_count}, types={self.message_type_count})"
def _token_usage_from_payload(payload) -> tuple:
    """Return (input_tokens, output_tokens) read from a token_count payload.

    Accepts either the JSON text of an object or an already-decoded dict.
    Anything unusable (invalid JSON, non-object JSON, non-numeric values)
    contributes 0 instead of raising, so token accounting stays best-effort.
    """
    if isinstance(payload, (str, bytes, bytearray)):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            return 0, 0
    if not isinstance(payload, dict):
        return 0, 0

    def _numeric(value):
        # Missing / null / non-numeric entries count as zero.
        return value if isinstance(value, (int, float)) else 0

    return (
        _numeric(payload.get("step_input_tokens", 0)),
        _numeric(payload.get("step_output_tokens", 0)),
    )


async def run_agent_with_tracking(
    agent_run_info: AgentRunInfo,
    on_final_answer: Optional[Callable[[str], None]] = None,
    on_error: Optional[Callable[[str], None]] = None,
    debug: bool = False
) -> AgentRunResult:
    """
    Run Agent and track message statistics

    Args:
        agent_run_info: Agent run info
        on_final_answer: Callback when final_answer is received
        on_error: Callback when error is received
        debug: Whether to print debug info (to stderr)

    Returns:
        AgentRunResult object containing final result and statistics

    Example:
        >>> result = await run_agent_with_tracking(agent_run_info)
        >>> print(result.final_answer)
        >>> print(result.message_type_count)
    """
    result = AgentRunResult()

    async for chunk in agent_run(agent_run_info):
        if not chunk:
            continue

        msg_type, msg_content = process_agent_message(chunk)

        if debug:
            print(f"[DEBUG] Type={msg_type}, Content Length={len(msg_content)}",
                  file=sys.stderr, flush=True)

        # Count message types
        if msg_type in TRACKED_MESSAGE_TYPES:
            result.message_type_count[msg_type] = result.message_type_count.get(msg_type, 0) + 1

        # Both step_count and final_answer messages advance the step counter.
        if msg_type in ["step_count", "final_answer"]:
            result.step_count += 1

        # Handle final answer
        if msg_type == "final_answer":
            result.final_answer = msg_content
            result.full_response += msg_content
            if on_final_answer:
                on_final_answer(msg_content)

        # Handle error
        elif msg_type == "error":
            result.errors.append(msg_content)
            if on_error:
                on_error(msg_content)

        # Handle token_count — accumulate real main-LLM token usage.
        # Parsing is delegated so that a malformed payload (non-object JSON,
        # already-decoded dict, non-numeric counts) can neither abort the run
        # nor be dropped when it is actually usable.
        elif msg_type == "token_count":
            step_input, step_output = _token_usage_from_payload(msg_content)
            result.total_input_tokens += step_input
            result.total_output_tokens += step_output

    # Fallback when no final answer
    if not result.final_answer:
        result.final_answer = result.full_response if result.full_response else "(No response received)"

    return result


def parse_conversation_to_history(file_path: str) -> list[AgentHistory]:
    """
    Parse a JSON conversation file into a list of AgentHistory objects.

    Expected format: [{"role": "user"|"assistant", "content": "..."}, ...]

    Args:
        file_path: Path to a .json conversation file.

    Returns:
        List of AgentHistory objects in conversation order.

    Raises:
        ValueError: If file is not a .json file.
    """
    if not file_path.endswith(".json"):
        raise ValueError(
            f"Only .json conversation files are supported, got: {file_path}"
        )

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return [AgentHistory(role=entry["role"], content=entry["content"]) for entry in data]
+ """ + if not file_path.endswith(".json"): + raise ValueError( + f"Only .json conversation files are supported, got: {file_path}" + ) + + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + + return [AgentHistory(role=entry["role"], content=entry["content"]) for entry in data] \ No newline at end of file diff --git a/sdk/benchmark/eventqa_eval/.gitignore b/sdk/benchmark/eventqa_eval/.gitignore new file mode 100644 index 000000000..7de82d1cc --- /dev/null +++ b/sdk/benchmark/eventqa_eval/.gitignore @@ -0,0 +1,9 @@ +# EventQA novels are large (~13MB); regenerate with download_data.py +data/ + +# Generated benchmark results (regenerated by each run) +outputs/ + +# Runtime artifacts +__pycache__/ +nexent_context_metrics.log diff --git a/sdk/benchmark/eventqa_eval/README.md b/sdk/benchmark/eventqa_eval/README.md new file mode 100644 index 000000000..eb4774b8b --- /dev/null +++ b/sdk/benchmark/eventqa_eval/README.md @@ -0,0 +1,197 @@ +# eventqa_eval — EventQA Long-text Memory Evaluation + +Based on **EventQA** dataset from MemoryAgentBench, evaluate the impact of **context compression** on ultra-long document memory: an entire novel as history to be compressed, can it still correctly answer "what happens next" questions? + +> Evaluation methods and dimensions follow the rest of `sdk/benchmark`: **baseline (no compression) vs compressed (compression)** comparison. This file covers **how to run** and **what each parameter means**. + +--- + +## Dataset + +EventQA comes from ∞-Bench's 5 novels (Gone with the Wind, Les Misérables, The Count of Monte Cristo, David Copperfield, Anna Karenina), each 390K–530K tokens. Each book has 100 six-choice MCQs: given prior events that have occurred, select the true continuation from 6 candidates (1 true + 5 GPT-4o distractors). + +Data is in HuggingFace `ai-hyz/MemoryAgentBench`'s `Accurate_Retrieval` split, rows with `metadata.source == "eventqa_full"` are the full novel versions. 
+ +--- + +## Prerequisites + +- Use backend's venv: `nexent/backend/.venv/bin/python` (requires `huggingface_hub`, `pyarrow`) +- LLM credentials in repo root `nexent/.env`: `LLM_API_KEY` / `LLM_MODEL_NAME` / `LLM_API_URL` +- Commands below assume you're in this directory (`sdk/benchmark/eventqa_eval/`) + +--- + +## Two Steps + +### Step 1: Download Data + +```bash +python download_data.py +``` + +Download `Accurate_Retrieval` split from HuggingFace, extract 5 `eventqa_full` rows, write to `data/eventqa_full.jsonl` (~13MB, already `.gitignore`, not committed). + +| Parameter | Default | Meaning | +|---|---|---| +| `--source` | `eventqa_full` | Which variant: `eventqa_full` (entire), `eventqa_65536` (truncated to 64K tokens), `eventqa_131072` (truncated to 128K tokens). Note truncated variants have **different questions** than full | +| `--output_dir` | `./data` | Output directory | + +### Step 2: Run Evaluation + +```bash +# Smoke test: 1 book, 1 question, novel truncated to 48K chars +python run_eventqa.py --book_limit 1 --limit 1 \ + --max_ingest_chars 48000 --chunk_chars 12000 \ + --token_threshold 3000 --keep_recent_pairs 1 + +# Full run: 5 books × 100 questions +python run_eventqa.py +``` + +--- + +## `run_eventqa.py` Parameter Details + +### Evaluation Scope + +| Parameter | Default | Meaning | +|---|---|---| +| `--data_file` | `data/eventqa_full.jsonl` | Data file produced by `download_data.py` | +| `--book_limit` | All (5) | Only evaluate first N books. For smoke test set `1` | +| `--limit` | All (100) | Only run first N questions per book. For smoke test set `1` | + +### Compressed Arm: ContextManager Configuration + +The entire novel will be chunked and fed in multiple turns, triggering real ContextManager incremental compression. + +| Parameter | Default | Meaning | +|---|---|---| +| `--token_threshold` | `12000` | ContextManager compression trigger threshold. When cumulative context exceeds this token count, compression triggers. 
**Lower = earlier, more aggressive compression** | +| `--keep_recent_pairs` | `2` | How many chunks to retain uncompressed at tail (rest enters summary). **Total chunks must > this value for compression to actually occur** | +| `--keep_recent_steps` | `4` | ContextManager retains how many steps in current turn uncompressed | +| `--max_observation_length` | `20000` | ContextManager single observation max character count | +| `--chunk_chars` | `20000` | Character count per novel chunk. Total chars / this value = chunk turns. **Recommended ≲ token_threshold equivalent chars**, so each turn's incremental compression input stays within budget, uses fast incremental path; too large degrades to full re-compression | +| `--max_ingest_chars` | `0` (entire) | Compressed arm only takes first N chars of novel. **For smoke testing**—set small value (e.g., `48000`) to drastically shorten one book's ingest time. `0` means use entire novel | +| `--ingest_max_steps` | `2` | Max steps per ingest (acknowledge) agent run. Ingest agent only triggers compression, small step count sufficient | +| `--summary_schema` | `default` | Which summary template compressed arm uses: `default` / `narrative` / `both`, see below | + +### Two Summary Schemas (`--summary_schema`) + +ContextManager's default summary schema targets agent tasks (`active_task` / `completed_work` / `relevant_files` …). When compressing narrative novels, ~9 of 10 fields become "None", entire plot squeezed into single `critical_context` field (also capped ≤300 words)—will lose much plot detail, artificially lowering compressed scores. 
+ +Therefore evaluation provides two schemas: + +| Schema | Fields | What it tests | +|---|---|---| +| `default` | active_task / completed_work / relevant_files … (10, agent-task oriented) | "Production ContextManager as-is" performance on narrative documents | +| `narrative` | events_so_far / characters / recent_events / unresolved_threads / setting (5, narrative oriented) | Whether compression **mechanism** with adapted template can retain narrative memory | + +`narrative` still uses **real ContextManager class + same incremental compression code path**, only replacing summary template (prompts + JSON schema, both are `ContextManagerConfig` fields). + +`--summary_schema both` lets compressed arm run both schemas. Difference between them can isolate loss sources: + +- `default` vs `narrative` gap → how much loss from **schema mismatch** +- `narrative` vs baseline gap → how much loss from **compression ratio itself** + +Note: `both` makes compressed arm (ingest + probes) run twice, ~doubling time. + +### Baseline Arm + +`eventqa_full` novels are 1.7M–3.2M chars, **no model can ingest entire book without compression**, so baseline uses "truncate to model window" as no-compression control. + +| Parameter | Default | Meaning | +|---|---|---| +| `--baseline_context_chars` | `480000` | Novel character count fed to baseline arm (truncate from start). Set to your model's context window capacity. 
Questions about events beyond window, baseline will fail—this is exactly what we're testing | + +### Probe (Probe) Execution + +| Parameter | Default | Meaning | +|---|---|---| +| `--probe_max_steps` | `3` | Max steps per MCQ probe agent run | + +### Skip One Arm / Debugging + +| Parameter | Default | Meaning | +|---|---|---| +| `--skip_baseline` | No | Skip baseline arm (use when iterating compressed arm only) | +| `--skip_compressed` | No | Skip compressed arm (use when iterating baseline only) | +| `--debug` | No | Print agent debug output | + +--- + +## Smoke Command Item-by-item Explanation + +```bash +python run_eventqa.py --book_limit 1 --limit 1 \ + --max_ingest_chars 48000 --chunk_chars 12000 \ + --token_threshold 3000 --keep_recent_pairs 1 +``` + +- `--book_limit 1`: Only evaluate 1 book (not all 5) +- `--limit 1`: This book only runs 1 question (not all 100) +- `--max_ingest_chars 48000`: Compressed arm only takes first 48K chars, not entire book—speeds up smoke test +- `--chunk_chars 12000`: Each chunk 12K chars → `48000 / 12000 = 4` chunks +- `--token_threshold 3000`: Cumulative context exceeds 3000 tokens triggers compression (small value, ensures compression triggers during smoke) +- `--keep_recent_pairs 1`: Tail only retains 1 chunk uncompressed → 4 chunks, first 3 enter compression region + +Overall effect: With minimal novel and question count, ensure **compression actually triggers**, end-to-end flow completes. + +--- + +## Evaluation Dimensions and Output + +Both arms answer **the same questions**, so retention ratio is clean: + +``` +memory_retention = compressed_accuracy / baseline_accuracy +token_reduction = 1 - last_compressed_tokens / last_uncompressed_tokens +``` + +**`token_reduction` same method as `manual_cases`**: Take compressed arm's **last ingest turn**'s `ContextManager.get_token_counts()`, calculate `1 - last_compressed / last_uncompressed` (corresponds to `manual_cases/test_benchmark.py` main algorithm). 
`acon_eval` doesn't measure token_reduction. Note this is "last turn" single-point sampling—if two schemas' last turns happen to have same token count, `token_reduction` will be same, this is inherent behavior of this method, not anomaly. + +No Continuation evaluation—EventQA MCQs are independent. + +Output written to `outputs/` (compressed metrics grouped by schema, `--summary_schema both` includes both): + +``` +outputs/ +├── / +│ ├── predictions.jsonl # Per-question: baseline vs each schema's compressed comparison +│ └── summary.json # Single-book metrics + each schema's compression info/summary +└── summary.json # Cross-book aggregate, includes per_schema grouped metrics +``` + +--- + +## Full Run Time Estimation + +Based on DeepSeek-v4-flash smoke test (Les Misérables entire book, single-step latency): + +| Stage | Unit Time (measured, approximate) | Notes | +|---|---|---| +| Ingest turn | ~20 s/turn | Chunk feed-in + one incremental compression LLM call | +| Compressed probe | ~60 s/question | Compressed context small, but model reasoning output long | +| Baseline probe | ~110 s/question | Entire novel fed in (400K–740K tokens), agent ~2 steps | + +- **Ingest turns = novel chars ÷ chunk_chars**. Default `chunk_chars=20000` means 5 books total ~590 turns. Ingest is **fixed cost, unrelated to `--limit`** (entire book must be compressed). +- Baseline probes are the time bottleneck: each question feeds entire book, agent often runs ~2 steps, each step re-sends entire book. + +**Full run (5 books × 100 questions, default params) rough estimate:** + +| Stage | Count | Estimated Time | +|---|---|---| +| Ingest | ~590 turns × 20s | ~3.3 h | +| Compressed probes | 500 questions × 60s | ~8.3 h | +| Baseline probes | 500 questions × 110s | ~15 h | +| **Total** | | **~25–30 hours** | + +**Sampled run (`--limit 20`, 5 books × 20 questions) rough estimate:** Ingest fixed ~3.3 h + probes ~5 h ≈ **8–9 hours**. 
+ +Recommendations: + +- First use `--limit` sampling (e.g., `--limit 20`) to confirm results reasonable before expanding. +- To speed up ingest, increase `--chunk_chars` (turns halved, time ~halved), trade-off is larger per-turn compression input. +- When iterating one arm only, use `--skip_baseline` / `--skip_compressed`—baseline is time bottleneck. + +> Note: Smoke test confirmed **DeepSeek V4 (1M window) can ingest entire Les Misérables** (3,171,853 chars ≈ 743,179 tokens, single call without truncation, no error), all 5 books can be fully ingested for baseline arm. \ No newline at end of file diff --git a/sdk/benchmark/eventqa_eval/RUNBOOK.md b/sdk/benchmark/eventqa_eval/RUNBOOK.md new file mode 100644 index 000000000..20b47fe2f --- /dev/null +++ b/sdk/benchmark/eventqa_eval/RUNBOOK.md @@ -0,0 +1,408 @@ +# EventQA Execution Runbook + +Operation steps: From switching LLM credentials, smoke testing, running full 100 questions, to importing trace into Langfuse. +For parameter details see README.md in same directory. + +--- + +## 0. Prerequisites + +Daily use (environment already set up): + +- venv: `nexent/backend/.venv/bin/python` +- Data: One-time `python download_data.py` (13MB, written to `data/eventqa_full.jsonl`, already .gitignore) +- LLM credentials: Repo root `nexent/.env`'s `LLM_API_KEY` / `LLM_MODEL_NAME` / `LLM_API_URL` +- LLM optional environment variables (repo root `nexent/.env`, same section as LLM_* above): + - `LLM_ENABLE_THINKING` — `false` disables thinking for Qwen3-like models (see §8.1) + - `LLM_EXTRA_BODY` — Generic version, directly pass a JSON to `chat.completions.create`'s `extra_body` +- Langfuse (optional, for trace visualization): Self-hosted at `http://localhost:3100`; credentials see `sdk/ctx_debugger/langfuse/.env` + +### Fresh Environment from Scratch + +Clean machine (after `git clone`) follow below to install. + +#### A. 
Python Dependencies + +```bash +# 1) Install nexent SDK itself (editable, convenient for source changes to take effect) +cd nexent/sdk +uv pip install -e . + +# 2) backend dependencies (versions pinned by uv.lock) + benchmark extra (pyarrow / langfuse / huggingface_hub together) +cd ../backend +uv sync --extra benchmark +``` + +#### B. Langfuse (Optional — only install when need trace visualization) + +Prerequisite: Docker installed (Linux install docker engine; Windows install Docker Desktop and enable WSL2 integration). + +**Step 1 — Generate `sdk/ctx_debugger/langfuse/.env`** (gitignored, must create on new machine): + +```bash +cat > sdk/ctx_debugger/langfuse/.env < **Pitfall avoidance**: Previous glm-5 (dashscope) would reject classic novels with "inappropriate content"— +> If internal DeepSeek has similar content moderation, first use Step 2 smoke test to probe, otherwise running 100 questions will all fail. + +--- + +## 2. Quick Smoke Test (~3–5 minutes) + +Confirm internal DeepSeek reachable, doesn't block content, window large enough: + +```bash +cd /home/feiran/nexent/sdk/benchmark/eventqa_eval +../../backend/.venv/bin/python run_eventqa.py \ + --book_index 0 --limit 1 \ + --max_ingest_chars 200000 --chunk_chars 100000 \ + --token_threshold 200000 \ + --summary_schema narrative \ + --baseline_context_chars 200000 +``` + +Expected: Terminal finally prints `RESULT: baseline_acc=... | narrative: acc=... ... token_reduction=...`, +no `Error code: 400`, `inappropriate`, `Traceback` appear. + +--- + +## 3. 
Full Run: 1 Book × 100 Questions (**Main Command**) + +Run book 0 Gone with the Wind entire book + all 100 questions, narrative schema, production `token_threshold=200000`: + +```bash +cd /home/feiran/nexent/sdk/benchmark/eventqa_eval +../../backend/.venv/bin/python run_eventqa.py \ + --book_index 0 \ + --token_threshold 200000 --chunk_chars 100000 \ + --summary_schema narrative \ + --baseline_context_chars 800000 +``` + +- Remove `--limit` = run all 100 questions +- Remove `--max_ingest_chars` = ingest entire book (~23 chunks) +- Estimated time **~1.5–2.5 hours** (depends on internal DeepSeek speed; baseline probes are bottleneck: 100 times × 860K chars fed) + +Results land at: + +``` +outputs/eventqa_full_book0/ +├── predictions.jsonl # Per-question baseline vs compressed answers +└── summary.json # Single-book metrics + complete narrative summary +outputs/summary.json # Cross-book aggregate +``` + +### Common Switches for Cost/Time Savings + +| Want to | Add Parameter | +|---|---| +| Only run compressed arm (when tuning compression params, baseline is time bottleneck) | `--skip_baseline` | +| Only run baseline | `--skip_compressed` | +| Sample 20 questions first to see trend | `--limit 20` | +| Run both default and narrative for comparison | `--summary_schema both` (compressed arm time doubles) | +| Switch book (0–4 = Gone with the Wind / Les Misérables / Count of Monte Cristo / David Copperfield / Anna Karenina) | `--book_index ` | + +--- + +## 4. (Optional) Capture trace with ctx_debugger + Import to Langfuse + +Only go this path when **need visualization of each step's context/compression** (adds trace write overhead, each run produces independent trace). 
+ +### 4.1 Run Test While Capturing Trace + +Replace the above Step 3 command's **entry point**, run from `ctx_debugger` directory: + +```bash +cd /home/feiran/nexent/sdk/ctx_debugger +NEXENT_CONTEXT_DEBUG=/tmp/eventqa_book0_narr.jsonl \ + ../../backend/.venv/bin/python example_with_eventqa.py \ + --book_index 0 \ + --token_threshold 200000 --chunk_chars 100000 \ + --summary_schema narrative \ + --baseline_context_chars 800000 +``` + +Parameters same as `run_eventqa.py`, forwarded unchanged. Trace written to `$NEXENT_CONTEXT_DEBUG`. + +**This demo's command** (1 book 1 question, entire book ingest): + +```bash +cd /home/feiran/nexent/sdk/ctx_debugger +NEXENT_CONTEXT_DEBUG=/tmp/eventqa_narr_trace.jsonl \ + ../../backend/.venv/bin/python example_with_eventqa.py \ + --book_index 0 --limit 1 \ + --token_threshold 200000 --chunk_chars 100000 \ + --summary_schema narrative \ + --baseline_context_chars 800000 +``` + +### 4.2 Import to Langfuse + +```bash +cd /home/feiran/nexent/sdk +set -a; source ctx_debugger/langfuse/.env; set +a +LANGFUSE_HOST=http://localhost:3100 \ +LANGFUSE_PUBLIC_KEY="$LANGFUSE_INIT_PROJECT_PUBLIC_KEY" \ +LANGFUSE_SECRET_KEY="$LANGFUSE_INIT_PROJECT_SECRET_KEY" \ + ../backend/.venv/bin/python -m ctx_debugger.langfuse_export \ + /tmp/eventqa_book0_narr.jsonl \ + --session-id book0-narrative-full +``` + +**Change `--session-id` for each run** (e.g., `book0-narr-thr150k`, `book0-narr-chunk60k`), +that's a new session, convenient for side-by-side comparison in Langfuse. Already created session names: +`nexent-ctx-demo`, `eventqa-demo`, `eventqa-narrative` (this demo). + +In Langfuse project `nexent-context`, click corresponding session to view: each turn nested expands +ingest turns / compression spans / main LLM calls / tool calls / token usage. + +### 4.3 Offline Preview Mapping Structure + +```bash +cd /home/feiran/nexent/sdk +../backend/.venv/bin/python -m ctx_debugger.langfuse_export \ + /tmp/eventqa_book0_narr.jsonl --dry-run +``` + +--- + +## 5. 
Parameter Quick Reference (Details in README) + +| Parameter | This demo's value | Meaning | +|---|---|---| +| `--book_index` | `0` | 0–4, 5 novels | +| `--limit` | Default=100 / smoke use 1 | Questions per book | +| `--question_start` | Default `0` | Skip first N questions (for interrupted run recovery, see §7) | +| `--token_threshold` | `200000` | Compression trigger threshold, mimics glm-5 200K window production config | +| `--chunk_chars` | `100000` | Novel chunk granularity (~23k tokens/chunk, entire book ~23 chunks) | +| `--summary_schema` | `narrative` | `default` / `narrative` / `both` | +| `--baseline_context_chars` | `800000` | Baseline truncation length (~186k tokens, ~200K window production scenario) | +| `--keep_recent_pairs` | Default `2` | Tail retain chunk count | +| `--max_ingest_chars` | Default `0` (entire) / smoke use 200000 | Ingest truncation (0=no truncation) | +| `--skip_baseline` / `--skip_compressed` | Default No | Skip one arm (for recovery, see §7) | + +--- + +## 6. Troubleshooting + +| Symptom | Cause / Action | +|---|---| +| `Error code: 400 ... inappropriate content` | LLM endpoint has content moderation blocking classic literature. Switch model/endpoint (DeepSeek direct has no issue). | +| Output large amounts of ``, random chars, `扫码失败` | LLM producing degraded gibberish (OpenRouter `:free` seen this). Switch model. | +| `Still exceeds threshold after compression: X > Y` | Warning, not fatal. Means retained tail + current chunk already exceeds token_threshold; can reduce `--keep_recent_pairs` or `--chunk_chars`, or increase `--token_threshold`. | +| `compressed_pairs=0` (trace shows compression not triggered) | Ingest cumulative tokens didn't exceed `--token_threshold`. Increase `--max_ingest_chars`, reduce `--token_threshold`, or reduce `--chunk_chars`. | +| Langfuse import blank | `--dry-run` check if trace non-empty; confirm `LANGFUSE_HOST`/keys correct; `curl -s http://localhost:3100/api/public/health` check service. 
| +| `data file not found` | First run `python download_data.py`. | +| Large amount of `no_answer` (baseline ≥50%) | Most likely model thinking mode eating up `max_tokens`, so `content` runs out of budget before a complete `final_answer(...)` code block is generated. See §8. | + +--- + +## 7. Interrupted Recovery / Salvage + +EventQA entire book + 100 questions + dual arms run occasionally gets killed by network disconnect/SSH disconnect/timeout. This section provides a **no data loss** recovery flow. + +Prerequisite: You **ran with ctx_debugger capturing trace** (see §4.1)—trace saved each probe's input, model reply, final_answer. Without trace capture, pure `run_eventqa.py` interrupted can only restart from beginning. + +Recovery pipeline three steps: + +``` + trace.jsonl ──(1. salvage)──> outputs/<book_id>_salvage/ + │ + │ Knows baseline ran to qid N-1 then broke + ▼ + run_eventqa.py --skip_compressed --question_start N + ──(2. resume)──> outputs/<book_id>/ + │ + ▼ + (3. merge) + outputs/<book_id>/ + (overwrite with merged version) +``` + +### 7.1 Salvage Existing Probe Results from Trace + +```bash +cd /home/feiran/nexent/sdk/benchmark/eventqa_eval +../../backend/.venv/bin/python salvage_trace.py \ + /tmp/nexent_eventqa_trace.jsonl \ + --book_index 0 --schema narrative +``` + +Writes to `outputs/eventqa_full_book0_salvage/`: +- `summary.json` — Contains compressed accuracy, baseline partial accuracy, compression info (previous_summary, token_counts, num_chunks) +- `predictions_compressed.jsonl` — Compressed arm per-question results +- `predictions_baseline.jsonl` — Baseline arm already-run partial results (e.g., 0–43) + +Print will tell you where baseline broke ("qids 0..43 done, 56 remaining"). + +**How to map trace turns to qid**: By trace internal turn order. Compressed arm's k-th `eventqa_answerer` turn = items[k]; baseline similarly. Prerequisite is **probes run sequentially, no retries**—current `run_probes` does exactly this. If retries added in future, need redesign here.
+ +### 7.2 Resume Missing Baseline Part + +Following above "qids 0..43 done", remaining qids 44..99 = 56 questions. But for safety **restart from 43** (breakpoint question likely incomplete), i.e., 57 questions: + +```bash +cd /home/feiran/nexent/sdk/benchmark/eventqa_eval +../../backend/.venv/bin/python run_eventqa.py \ + --book_index 0 --skip_compressed \ + --question_start 43 \ + --token_threshold 200000 --chunk_chars 100000 \ + --summary_schema narrative \ + --baseline_context_chars 800000 +``` + +Key: +- `--skip_compressed` skip ingest + compressed probes (preserve salvage's existing compressed data) +- `--question_start 43` skip first 43 questions (qids 0..42), so the last qid that §7.1 salvage reported as done (43) is re-run for safety +- Other parameters **must match exactly interrupted run**—especially `--token_threshold` / `--chunk_chars` / `--summary_schema` / `--baseline_context_chars`, otherwise merged data not comparable + +Writes to `outputs/eventqa_full_book0/{summary.json, predictions.jsonl}`, at this point **only contains qid 43..99 baseline** (compressed empty dict). + +### 7.3 Merge + +```bash +cd /home/feiran/nexent/sdk/benchmark/eventqa_eval +../../backend/.venv/bin/python merge_partial.py \ + --book_id eventqa_full_book0 \ + --schema narrative \ + --resume_start_qid 43 +``` + +Reads `outputs/<book_id>_salvage/` and `outputs/<book_id>/` (after resumed run), merges them, and writes the result back to `outputs/<book_id>/{summary.json, predictions.jsonl}`, which contains: +- compressed 100 questions (from salvage) +- baseline 100 questions (0..42 from salvage, 43..99 from resumed run) +- recalculated accuracy / retention / token_reduction +- `_merge_provenance` field recording data source (which qids from salvage, which from resumed) + +Merged `outputs/<book_id>/` format is completely identical to the output of a complete run from scratch—subsequent tools (Langfuse, merge after dry-run etc.) all work normally.
+ +### 7.4 Prevent Interruption + +Next time running long task use `tmux` / `nohup` / `setsid` protection, avoid SSH disconnect/terminal close killing process: + +```bash +tmux new -s eventqa +# In tmux run command +# Ctrl+B then D detach; next time tmux attach -t eventqa +``` + +Note tmux only protects against SSH disconnect; LLM endpoint jitter/timeout still causes individual agent step failures, in that case `run_agent_with_tracking`'s fallback will default to `no_answer`, won't kill entire run. + +--- + +## 8. Known Limitations + +### 8.1 Qwen3 etc. Thinking Model Impact + +Qwen3 (`qwen36` etc.) has "thinking" mode: model first in `reasoning_content` channel reasons, then produces final answer in `content`. `nexent`'s `OpenAIModel` already captures both channels separately (`openai_llm.py:148-154`), so `content` **won't** have `<think>` etc. pollution. + +**But** thinking still impacts EventQA: +- Tokens emitted by thinking count toward `max_tokens` budget, **`content` may run out of budget before producing complete `final_answer(...)` code block** → smolagents parse failure → `no_answer` +- With large context (baseline feeds ~186k tokens) the thinking output gets longer and messier, so compared to compressed (~70k) it more easily exhausts budget +- Measured one run (qwen36 / entire book 0 / narrative / token_threshold=200000): + - baseline `no_answer` rate **66%** (29/44) + - compressed `no_answer` rate 21% (21/100) + - retention = compressed_acc/baseline_acc = **1.76** (compressed beats baseline, because baseline is heavily penalized by thinking, not because compression is actually better) + +**Mitigation**: Pass `extra_body={"chat_template_kwargs":{"enable_thinking":false}}` to disable thinking, so the entire `max_tokens` budget goes to `content`.
Two entry points: + +Via `.env` (recommended, takes effect globally): +```bash +# Either works; the former is more generic +LLM_EXTRA_BODY={"chat_template_kwargs":{"enable_thinking":false}} +LLM_ENABLE_THINKING=false +``` + +Via Python, constructing `OpenAIModel` directly: +```python +OpenAIModel(..., extra_body={"chat_template_kwargs":{"enable_thinking": False}}) +``` + +The code changes touch three places in the SDK (the `agent_model.ModelConfig.extra_body` field, the `openai_llm.OpenAIModel.extra_body` param, and the `nexent_agent.create_model` pass-through) plus the env read in the benchmark-side `agent_runner.py`. They have already landed, and the default behavior is unchanged (unset = not passed = same as before). + +**Thinking-off and thinking-on results are two separate datasets that are not directly comparable**—if you want a comparison, run twice: once with the default (thinking on) and once with `LLM_ENABLE_THINKING=false`, take each through the §3 flow separately, and tell them apart by session id (e.g., `eventqa-narr-thinkON` / `eventqa-narr-thinkOFF`). + +### 8.2 Salvage Mechanism Boundaries + +§7's `salvage_trace.py` **maps turns to `book.items[k]` by their order inside the trace**, which relies on `run_probes` running the probes sequentially and without retries. The current implementation issues exactly one `run_agent_with_tracking` call per item; note, however, that `run_probes` can fan probes out concurrently (`--probe_concurrency`), so the ordering assumption should only be trusted when probes run one at a time. If probe-level retries are added in the future (one item, multiple agent_init), salvage's "by order" assumption breaks and a more robust qid matching strategy is needed (e.g., matching by question text—but ctx_debugger's message truncation makes prefix matching error-prone too: fuzzy matching has been seen to assign several qids, whose accumulated prior events share a prefix, all to qid=1). + +### 8.3 token_reduction is Single-point Sampling + +As the README explains, `token_reduction` takes the `get_token_counts()` of the **last ingest turn** (same method as `manual_cases`). If the last turns of two schemas happen to hit the same token counts, their `token_reduction` will be identical; this is normal sampling behavior.
+ +### 8.4 Content Moderation Blocking + +Classic literature (19th century Western novels) triggers the content moderation of some domestic LLM endpoints (measured: glm-5 / dashscope reject the first chunk of Gone with the Wind outright with a 400 `inappropriate content` error). This isn't something the benchmark can bypass—you need to switch to endpoints without literature moderation (DeepSeek direct, self-deployed Qwen3, etc.). + +### 8.5 baseline_context_chars vs Model Window Balance + +`--baseline_context_chars 800000` (~186k tokens) already approaches the limit of a 200K-window model—adding the system prompt and the question can easily overflow the window; if the model's actual effective context is shorter than the nominal one ("lost in the middle"), baseline accuracy drops further, but this is **the model's real performance at this window size**, which is what the benchmark should reflect, not a bug. \ No newline at end of file diff --git a/sdk/benchmark/eventqa_eval/dataset.py b/sdk/benchmark/eventqa_eval/dataset.py new file mode 100644 index 000000000..6a87a0226 --- /dev/null +++ b/sdk/benchmark/eventqa_eval/dataset.py @@ -0,0 +1,130 @@ +"""Dataset loader for EventQA (MemoryAgentBench). + +Loads the ``eventqa_full.jsonl`` produced by ``download_data.py``. Each line is +one novel: the full text plus 100 six-choice "what happens next" questions. + +Each raw question string embeds the candidate events as a Python list literal: + + These are the events that have already occurred: + + 1. + + Below is a list of possible subsequent events: + + ['event A', 'event B', ..., 'event F'] + + Your task is to choose from the above events which event happens next ... + +This module parses that structure into EventQAItem objects so the runner can +feed the raw question to the agent and score the answer against the gold option. +""" +import ast +import json +from dataclasses import dataclass, field +from typing import List + +# Markers that delimit the three parts of a raw EventQA question.
_PRIOR_MARKER = "These are the events that have already occurred:"
_OPTIONS_MARKER = "Below is a list of possible subsequent events:"
_TASK_MARKER = "Your task is to choose"

# Every exception ast.literal_eval is documented to raise on malformed input.
# Catching only ValueError/SyntaxError lets e.g. an unhashable dict key
# (TypeError) abort the whole dataset load because of one bad question.
_LITERAL_EVAL_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)


@dataclass
class EventQAItem:
    """A single six-choice "what happens next" question."""
    qid: str
    question: str  # raw question text, fed verbatim to the agent
    options: List[str]  # the six candidate subsequent events
    gold: str  # exact text of the correct option
    prior_events: str = ""  # the "events that have already occurred" block


@dataclass
class EventQABook:
    """One novel with its 100 EventQA questions."""
    book_index: int
    book_id: str
    book_title: str
    context: str  # full novel text
    items: List[EventQAItem] = field(default_factory=list)


def _parse_question(raw: str) -> tuple[str, List[str]]:
    """Extract the prior-events block and the candidate options.

    Args:
        raw: The raw question string as stored in the dataset.

    Returns:
        ``(prior_events_text, options)``. ``prior_events_text`` is empty when
        either delimiting marker is missing; ``options`` is empty when the
        options marker is missing or the bracketed literal after it cannot be
        evaluated as a list/tuple.
    """
    prior = ""
    options: List[str] = []

    prior_idx = raw.find(_PRIOR_MARKER)
    opts_idx = raw.find(_OPTIONS_MARKER)
    task_idx = raw.find(_TASK_MARKER)

    if prior_idx != -1 and opts_idx != -1:
        prior = raw[prior_idx + len(_PRIOR_MARKER):opts_idx].strip()

    if opts_idx == -1:
        return prior, options

    # The list literal sits between the options marker and the task marker
    # (or the end of the string when the task marker is absent).
    seg_end = task_idx if task_idx != -1 else len(raw)
    segment = raw[opts_idx + len(_OPTIONS_MARKER):seg_end]
    lb = segment.find("[")
    rb = segment.rfind("]")
    if lb != -1 and rb > lb:
        try:
            parsed = ast.literal_eval(segment[lb:rb + 1])
        except _LITERAL_EVAL_ERRORS:
            parsed = None
        if isinstance(parsed, (list, tuple)):
            options = [str(x) for x in parsed]

    return prior, options


def _gold_answer(raw_answer) -> str:
    """Normalize the answer field to the gold option's text.

    The dataset stores answers as a one-element list, e.g. ['the correct event'].
    A missing answer (``None`` or an empty list) yields ``""`` rather than the
    literal string ``"None"``.
    """
    if raw_answer is None:
        return ""
    if isinstance(raw_answer, (list, tuple)):
        return str(raw_answer[0]) if raw_answer else ""
    return str(raw_answer)


def load_books(jsonl_path: str) -> List[EventQABook]:
    """Load all EventQA books from a downloaded ``*.jsonl`` file.

    Args:
        jsonl_path: Path to a file with one JSON object (one book) per line;
            blank lines are skipped.

    Returns:
        One ``EventQABook`` per non-blank line, in file order. Missing answers
        produce an empty gold string; missing question ids fall back to
        ``"<book_id>_no<i>"``.
    """
    books: List[EventQABook] = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)

            questions = row.get("questions") or []
            answers = row.get("answers") or []
            qa_ids = row.get("qa_pair_ids") or []

            items: List[EventQAItem] = []
            for i, raw_q in enumerate(questions):
                prior, options = _parse_question(str(raw_q))
                gold = _gold_answer(answers[i]) if i < len(answers) else ""
                qid = qa_ids[i] if i < len(qa_ids) else f"{row.get('book_id', 'book')}_no{i}"
                items.append(EventQAItem(
                    qid=str(qid),
                    question=str(raw_q),
                    options=options,
                    gold=gold,
                    prior_events=prior,
                ))

            books.append(EventQABook(
                book_index=row.get("book_index", len(books)),
                book_id=row.get("book_id", f"book{len(books)}"),
                book_title=row.get("book_title", f"book{len(books)}"),
                context=row.get("context") or "",
                items=items,
            ))

    return books
HF_REPO = "ai-hyz/MemoryAgentBench"
HF_FILE = "data/Accurate_Retrieval-00000-of-00001.parquet"

# Map a context prefix to a human-readable novel title. The five EventQA books
# always appear in this order in the parquet, but matching on the prefix keeps
# the labels correct even if the row order ever changes.
_BOOK_TITLES = [
    ("Part One \nCHAPTER I \nDEBBIE", "Gone with the Wind"),
    ("VOLUME I\nMIRACLE", "Les Miserables"),
    ("Chapter 1\nMarseilles", "The Count of Monte Cristo"),
    ("Whether I shall turn out to be the hero", "David Copperfield"),
    ("PART ONE\nChapter 1\nHappy families", "Anna Karenina"),
]


def _book_title(context: str, fallback_index: int) -> str:
    """Return the novel title whose known prefix starts *context*.

    Leading whitespace is ignored on both sides of the comparison. When no
    prefix matches, ``"book<fallback_index>"`` is returned.
    """
    head = context.lstrip()
    for prefix, title in _BOOK_TITLES:
        if head.startswith(prefix.lstrip()):
            return title
    return f"book{fallback_index}"


def _available_sources(rows) -> list[str]:
    """Return the sorted, distinct ``metadata.source`` values found in *rows*.

    Rows without metadata or without a source are ignored: sorting a set that
    mixes ``None`` with strings raises ``TypeError``, which would mask the
    "no rows with this source" message this list is built for.
    """
    found = {(r.get("metadata") or {}).get("source") for r in rows}
    found.discard(None)
    return sorted(str(s) for s in found)


def main(source: str, output_dir: str):
    """Download the parquet split and write the *source* rows as JSONL.

    Args:
        source: Value of ``metadata.source`` to extract (e.g. ``eventqa_full``).
        output_dir: Directory that receives ``<source>.jsonl``; created if missing.

    Raises:
        SystemExit: If the split contains no row with the requested source.
    """
    # Imported lazily so the module can be imported without these packages.
    from huggingface_hub import hf_hub_download
    import pyarrow.parquet as pq

    print(f"Downloading {HF_FILE} from {HF_REPO} ...")
    path = hf_hub_download(HF_REPO, HF_FILE, repo_type="dataset")
    print(f" cached at: {path}")

    rows = pq.read_table(path).to_pylist()
    books = [r for r in rows if (r.get("metadata") or {}).get("source") == source]
    if not books:
        sources = _available_sources(rows)
        raise SystemExit(f"No rows with source={source!r}. Available sources: {sources}")

    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, f"{source}.jsonl")

    with open(out_path, "w", encoding="utf-8") as f:
        for i, row in enumerate(books):
            context = row.get("context") or ""
            md = row.get("metadata") or {}
            record = {
                "book_index": i,
                "book_id": f"{source}_book{i}",
                "book_title": _book_title(context, i),
                "source": source,
                "context": context,
                "questions": row.get("questions") or [],
                "answers": row.get("answers") or [],
                "previous_events": md.get("previous_events") or [],
                "qa_pair_ids": md.get("qa_pair_ids") or [],
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            print(f" book {i}: {record['book_title']:<28} "
                  f"ctx_chars={len(context):>9d} questions={len(record['questions'])}")

    print(f"\nWrote {len(books)} books to {out_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download EventQA data from MemoryAgentBench")
    parser.add_argument(
        "--source",
        type=str,
        default="eventqa_full",
        choices=["eventqa_full", "eventqa_65536", "eventqa_131072"],
        help="Which EventQA variant to extract (default: eventqa_full)",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"),
        help="Directory to write the .jsonl file",
    )
    args = parser.parse_args()
    main(source=args.source, output_dir=args.output_dir)
+ +Matching strategy (most to least reliable): + 1. exact — normalized answer equals a normalized option + 2. containment — a normalized option is a substring of the normalized answer + (or vice versa); the agent wrapped the option in extra words + 3. fuzzy — highest token-F1 option, used only as a last resort +""" +import re +import string +from dataclasses import dataclass +from typing import List + + +@dataclass +class MCQResult: + correct: bool + score: float # 1.0 if correct, else 0.0 + selected_index: int # index of the option the agent chose, -1 if none + selected: str # text of the chosen option ("" if none) + gold_index: int # index of the gold option, -1 if gold not in options + match_type: str # "exact" | "containment" | "fuzzy" | "none" + + +def _normalize(s: str) -> str: + """Lowercase, drop punctuation and articles, collapse whitespace.""" + s = s.lower() + s = s.translate(str.maketrans("", "", string.punctuation)) + s = re.sub(r"\b(a|an|the)\b", " ", s) + return " ".join(s.split()) + + +def _token_f1(pred: str, gold: str) -> float: + """SQuAD-style token-overlap F1 between two normalized strings.""" + pred_tokens = pred.split() + gold_tokens = gold.split() + if not pred_tokens or not gold_tokens: + return 0.0 + common: dict[str, int] = {} + for t in pred_tokens: + common[t] = common.get(t, 0) + 1 + overlap = 0 + for t in gold_tokens: + if common.get(t, 0) > 0: + overlap += 1 + common[t] -= 1 + if overlap == 0: + return 0.0 + precision = overlap / len(pred_tokens) + recall = overlap / len(gold_tokens) + return 2 * precision * recall / (precision + recall) + + +def score_mcq(answer: str, options: List[str], gold: str) -> MCQResult: + """Map a free-text answer to one of the six options and score it.""" + gold_index = options.index(gold) if gold in options else -1 + + norm_answer = _normalize(answer or "") + norm_options = [_normalize(o) for o in options] + + selected_index = -1 + match_type = "none" + + if norm_answer: + # 1. Exact normalized match. 
+ for i, norm_opt in enumerate(norm_options): + if norm_opt and norm_opt == norm_answer: + selected_index = i + match_type = "exact" + break + + # 2. Containment — prefer the longest contained option to avoid + # matching a short option that is a prefix of the intended one. + if selected_index == -1: + best_len = -1 + for i, norm_opt in enumerate(norm_options): + if not norm_opt: + continue + if norm_opt in norm_answer or norm_answer in norm_opt: + if len(norm_opt) > best_len: + best_len = len(norm_opt) + selected_index = i + match_type = "containment" + + # 3. Fuzzy — highest token-F1 option (last resort). + if selected_index == -1: + best_f1 = 0.0 + for i, norm_opt in enumerate(norm_options): + f1 = _token_f1(norm_answer, norm_opt) + if f1 > best_f1: + best_f1 = f1 + selected_index = i + match_type = "fuzzy" + + correct = selected_index != -1 and selected_index == gold_index + return MCQResult( + correct=correct, + score=1.0 if correct else 0.0, + selected_index=selected_index, + selected=options[selected_index] if selected_index != -1 else "", + gold_index=gold_index, + match_type=match_type, + ) diff --git a/sdk/benchmark/eventqa_eval/merge_partial.py b/sdk/benchmark/eventqa_eval/merge_partial.py new file mode 100644 index 000000000..29e51d9f9 --- /dev/null +++ b/sdk/benchmark/eventqa_eval/merge_partial.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +"""Merge a salvaged trace + a resumed run into the canonical book output. + +After an interrupted EventQA run, the pipeline becomes: + + 1. salvage_trace.py -> outputs/_salvage/{summary,predictions_*}.jsonl + 2. run_eventqa.py --skip_compressed --question_start N + -> outputs//{summary,predictions}.jsonl (NEW partial) + 3. merge_partial.py -> outputs//{summary,predictions}.jsonl (UNIFIED) + +The merge takes: + - All 100 compressed-arm probe results from the salvage. + - Baseline probe results from the salvage for qids 0..N-1. 
def _qnum(qid: str) -> int:
    """Return the trailing ``no<N>`` number of a question id, or -1 if absent."""
    m = re.search(r"no(\d+)$", qid or "")
    return int(m.group(1)) if m else -1


def _read_json(path):
    """Load one JSON document, closing the file and reading it as UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _read_jsonl(path):
    """Load a JSON-lines file into a list, skipping blank lines."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def main(argv=None):
    """Merge the salvaged outputs with the resumed run's outputs.

    Reads ``<outputs>/<book_id>_salvage/`` and ``<outputs>/<book_id>/`` and
    overwrites ``summary.json`` and ``predictions.jsonl`` in the latter.

    Args:
        argv: Optional argument list; ``None`` makes argparse read ``sys.argv``.
    """
    ap = argparse.ArgumentParser(description="Merge salvaged + resumed EventQA outputs.")
    ap.add_argument("--book_id", default="eventqa_full_book0")
    ap.add_argument("--schema", default="narrative")
    ap.add_argument("--resume_start_qid", type=int, default=43,
                    help="The qid number at which the resumed run started.")
    ap.add_argument("--outputs_dir", default=None,
                    help="Parent outputs dir; default = eventqa_eval/outputs/")
    args = ap.parse_args(argv)

    base = args.outputs_dir or os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "outputs"
    )
    salvage_dir = os.path.join(base, f"{args.book_id}_salvage")
    resume_dir = os.path.join(base, args.book_id)
    if not os.path.isdir(salvage_dir):
        sys.exit(f"salvage dir not found: {salvage_dir}")
    if not os.path.isdir(resume_dir):
        sys.exit(f"resumed-run dir not found: {resume_dir}")

    # --- salvage ---
    salvage_sum = _read_json(os.path.join(salvage_dir, "summary.json"))
    salvage_comp = _read_jsonl(os.path.join(salvage_dir, "predictions_compressed.jsonl"))
    salvage_base = _read_jsonl(os.path.join(salvage_dir, "predictions_baseline.jsonl"))

    # --- resumed run ---
    # The resumed summary's content is not used; it is still parsed so that a
    # missing or corrupt file stops the merge before anything is overwritten.
    _read_json(os.path.join(resume_dir, "summary.json"))
    resume_preds = _read_jsonl(os.path.join(resume_dir, "predictions.jsonl"))

    # Compressed arm: all rows from salvage.
    # Baseline arm: salvage qids 0..(resume_start_qid-1), then the resumed
    # run's qids from resume_start_qid onward.
    base_by_qid = {}
    for r in salvage_base:
        n = _qnum(r["qid"])
        if 0 <= n < args.resume_start_qid:
            base_by_qid[r["qid"]] = r
    for r in resume_preds:
        b = r.get("baseline")
        if not b:
            continue
        n = _qnum(r["qid"])
        if n >= args.resume_start_qid:
            base_by_qid[r["qid"]] = {
                "qid": r["qid"], "gold": r.get("gold"),
                "answer": b.get("answer"), "selected": b.get("selected"),
                "correct": b.get("correct"), "score": 1.0 if b.get("correct") else 0.0,
                "match_type": b.get("match_type"),
            }

    # Build unified predictions in run_eventqa format
    by_qid = {}
    for r in salvage_comp:
        by_qid.setdefault(r["qid"], {"qid": r["qid"], "gold": r["gold"]})
        by_qid[r["qid"]].setdefault("compressed", {})[args.schema] = {
            "answer": r["answer"], "selected": r["selected"],
            "correct": r["correct"], "match_type": r["match_type"],
        }
    for qid, r in base_by_qid.items():
        by_qid.setdefault(qid, {"qid": qid, "gold": r.get("gold")})
        by_qid[qid]["baseline"] = {
            "answer": r["answer"], "selected": r["selected"],
            "correct": r["correct"], "match_type": r["match_type"],
        }
    predictions = sorted(by_qid.values(), key=lambda x: _qnum(x["qid"]))

    # Aggregate metrics
    base_results = [(_qnum(r["qid"]), r) for r in base_by_qid.values()]
    base_results.sort(key=lambda x: x[0])
    comp_results = sorted(salvage_comp, key=lambda r: _qnum(r["qid"]))

    bacc = sum(1.0 if r["correct"] else 0.0 for _, r in base_results) / max(len(base_results), 1)
    cacc = sum(r["score"] for r in comp_results) / max(len(comp_results), 1)
    retention = cacc / bacc if bacc > 0 else 0.0

    # Pull compression metadata from the salvage summary's compressed/<schema> entry
    comp_meta = salvage_sum["compressed"][args.schema]["compression"]
    token_reduction = salvage_sum["compressed"][args.schema].get("token_reduction")

    summary = {
        "book_id": args.book_id,
        "book_title": salvage_sum.get("book_title"),
        "novel_chars": salvage_sum.get("novel_chars"),
        "num_questions": salvage_sum.get("num_questions"),
        "baseline": {"accuracy": bacc, "n": len(base_results)},
        "compressed": {
            args.schema: {
                "accuracy": cacc,
                "n": len(comp_results),
                "memory_retention": retention,
                "token_reduction": token_reduction,
                "compression": comp_meta,
            }
        },
        "_merge_provenance": {
            "salvage_dir": salvage_dir,
            "resume_dir": resume_dir,
            "resume_start_qid": args.resume_start_qid,
            "baseline_from_salvage": sum(1 for n, _ in base_results if n < args.resume_start_qid),
            "baseline_from_resume": sum(1 for n, _ in base_results if n >= args.resume_start_qid),
        },
    }

    # Write to the canonical book outputs dir
    out_dir = os.path.join(base, args.book_id)
    with open(os.path.join(out_dir, "summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2, default=str)
    with open(os.path.join(out_dir, "predictions.jsonl"), "w", encoding="utf-8") as f:
        for p in predictions:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"Merged to {out_dir}")
    print(f" baseline N={len(base_results)} acc={bacc:.3f}")
    print(f" compressed[{args.schema}] N={len(comp_results)} acc={cacc:.3f}")
    print(f" retention={retention:.3f} token_reduction={token_reduction}")
    print(f" provenance: baseline {summary['_merge_provenance']['baseline_from_salvage']} from salvage "
          f"+ {summary['_merge_provenance']['baseline_from_resume']} from resume run")


if __name__ == "__main__":
    main()
nexent agent. + +EventQA (MemoryAgentBench) gives a full novel plus 100 six-choice +"what happens next" questions. This benchmark keeps the same evaluation method +and dimensions as the rest of ``sdk/benchmark`` — a baseline vs compressed +comparison — but adapted to a long-document memory task: + + * Baseline — the novel is truncated to the model's context window and fed + whole, with NO compression. Questions about events past the + truncation point are expected to fail. + * Compressed — the FULL novel is streamed in as a growing conversation + history; the real ContextManager incrementally compresses it. + The 100 questions are then run as memory probes against the + pre-compressed context. + +Both arms answer the SAME 100 questions, so the retention ratio is clean: + + memory_retention = compressed_accuracy / baseline_accuracy + token_reduction = 1 - last_compressed_tokens / last_uncompressed_tokens + +Continuation is not measured — EventQA questions are independent MCQs. + +Usage: + python download_data.py # one-time: fetch the 5 novels + python run_eventqa.py --limit 5 --book_limit 1 --max_ingest_chars 120000 + python run_eventqa.py # full run: 5 books x 100 questions + +Results are written to outputs// and outputs/summary.json. 
+""" +import argparse +import asyncio +import copy +import json +import os +import sys + +# ---- Path setup (mirrors acon_eval/run_acon_qa.py) ---- +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import paths # noqa: F401 - side effect: adds sdk/, backend/ to sys.path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from agent_runner import ( + build_agent_run_info, + run_agent_with_tracking, + ContextManagerConfig, +) +from nexent.core.agents.agent_model import AgentHistory +from nexent.core.agents.agent_context import ContextManager + +from dataset import load_books, EventQABook +from eval_utils import score_mcq + + +# ============ Agent duty prompts ============ + +INGEST_DUTY = ( + "You are reading a long novel one part at a time. Each message gives you " + "the next part of the novel. Read it carefully and remember the events, " + "the characters, and the order in which things happen. Do not analyze, " + "review, or summarize the text. Simply acknowledge that you have read it " + "by calling final_answer with the single word: OK" +) + +PROBE_DUTY = ( + "You are answering a six-choice question about a novel. The novel — or a " + "compressed summary of it — has been provided to you as earlier context. " + "The question states the events that have already occurred and then lists " + "six candidate events that might happen next. Exactly one of the six is " + "the true continuation from the novel; the other five are " + "plausible-sounding distractors.\n" + "Rules:\n" + "- You MUST choose exactly one of the six options. Choosing one is " + "mandatory even when none seems certain — pick the most likely.\n" + "- Never reply that none of the events occur, and never put your " + "reasoning into the answer.\n" + "- Answer in a SINGLE step. Your first and only code block must call " + "final_answer directly. 
Do NOT first write a bare string, a print, or " + "any inspection code — a bare string is NOT an answer and wastes a step.\n" + "- Emit exactly one code block of this form, with the chosen option's " + "text copied verbatim from the candidate list:\n" + '\nfinal_answer("")\n' +) + + +# ============ Summary schemas for the compressed arm ============ +# The compressed arm can use either schema; `--summary_schema both` runs each. +# +# default — the production ContextManager schema (agent-task oriented: +# active_task / completed_work / relevant_files ...). On a novel +# most fields collapse to "None" and the plot is squeezed into a +# single capped field. +# narrative — the novel-oriented schema below. Still the real ContextManager +# class and the same incremental-compression code path; only the +# summary template (prompts + JSON schema) differs. + +NARRATIVE_SUMMARY_SYSTEM_PROMPT = ( + "You are summarizing a novel that is being read in sequential parts. " + "Treat the text below as the novel's own content — it is a story, NOT a " + "task, a conversation, or a document the user is asking you to review. " + "Produce only the structured JSON summary; no greeting, preamble, or prefix. " + "Write the summary in the same language as the novel. " + "Your goal is to preserve the STORY so that someone who reads only your " + "summary could still answer 'what happens next' questions: keep the " + "sequence of events, which character did what, and the order things happen. " + "Be CONCRETE — name characters, places, and specific actions, and preserve " + "chronological order. Avoid vague phrases like 'various events occur'. " + "Output strict JSON format without markdown blocks." +) + +NARRATIVE_INCREMENTAL_SUMMARY_SYSTEM_PROMPT = ( + "You are maintaining a running summary of a novel that is being read in " + "sequential parts. The text below shows the existing summary as 'Previous " + "Summary' and the next part of the novel as 'New Content'. 
Treat the new " + "content as story text, NOT as a task or conversation. " + "Update the summary by these rules:\n" + "1. PRESERVE earlier events — do not drop plot points just because they " + "are old. When space runs short, compress older events into briefer " + "mentions rather than deleting them outright.\n" + "2. ADD the new events to 'events_so_far', continuing the chronological order.\n" + "3. UPDATE 'characters' with newly introduced characters and changes to known ones.\n" + "4. UPDATE 'recent_events' to describe the latest part in finer detail.\n" + "5. UPDATE 'unresolved_threads' and 'setting'.\n" + "Write in the novel's language. Output strict JSON format without markdown blocks." +) + +NARRATIVE_SUMMARY_SCHEMA = { + "events_so_far": ( + "THE MOST IMPORTANT FIELD. A numbered, chronological list of the plot " + "events from the start of the novel up to now. Each entry: which " + "character did what, and where. Be concrete and specific — this field " + "is what a reader uses to judge what happens next. (<=600 words)" + ), + "characters": ( + "Key characters and their roles, relationships, and current " + "situation. (<=250 words)" + ), + "recent_events": ( + "The events of the most recent part, in finer detail than the older " + "entries, for continuity with what comes next. (<=200 words)" + ), + "unresolved_threads": ( + "Open plot threads, conflicts, and questions not yet resolved. (<=150 words)" + ), + "setting": "Time period, places, and overall context of the story. (<=80 words)", +} + + +def build_compressed_config(schema_name: str, args) -> ContextManagerConfig: + """Build the compressed-arm ContextManagerConfig for a given summary schema. + + For 'narrative', only the three summary-template fields are overridden; the + rest of the ContextManager (incremental compression, caching, boundaries) + is untouched — it is still the real production compression path. 
def resolve_schemas(arg: str) -> list[str]:
    """Expand the --summary_schema value into the schemas that should be run.

    ``"both"`` expands to the default schema followed by the narrative one;
    any other value is returned as a single-element list.
    """
    if arg == "both":
        return ["default", "narrative"]
    return [arg]


def _fmt(x) -> str:
    """Render a metric with three decimals, or ``n/a`` when it is None."""
    if x is None:
        return "n/a"
    return format(x, ".3f")
def chunk_text(text: str, chunk_chars: int) -> list[str]:
    """Split text into chunks of about ``chunk_chars`` characters.

    Chunk boundaries are nudged forward to the next newline (within a small
    slack) so chunks do not cut sentences in half. Concatenating the returned
    chunks reproduces ``text`` exactly.

    Args:
        text: The text to split; an empty string yields an empty list.
        chunk_chars: Target chunk size in characters; must be positive.

    Raises:
        ValueError: If ``chunk_chars`` is not positive. A non-positive size
            never advances the cursor, so the loop would otherwise append
            empty chunks forever.
    """
    if chunk_chars <= 0:
        raise ValueError(f"chunk_chars must be positive, got {chunk_chars}")

    newline_slack = 500  # how far past the target size a boundary may be moved
    chunks: list[str] = []
    i, n = 0, len(text)
    while i < n:
        end = min(i + chunk_chars, n)
        if end < n:
            nl = text.find("\n", end)
            if nl != -1 and nl - end < newline_slack:
                end = nl + 1
        chunks.append(text[i:end])
        i = end
    return chunks
+ """ + context = book.context + if args.max_ingest_chars > 0: + context = context[:args.max_ingest_chars] + + chunks = chunk_text(context, args.chunk_chars) + shared_cm = ContextManager(config=cm_config, max_steps=args.ingest_max_steps) + + conversation_history: list[AgentHistory] = [] + token_counts = None + ingest_main_input_tokens = 0 + ingest_main_output_tokens = 0 + + for idx, chunk in enumerate(chunks): + chunk_msg = f"[Novel part {idx + 1} of {len(chunks)}]\n\n{chunk}" + # The agent only exists to drive a real ContextManager compression pass + # over the accumulated history. Showing the exact acknowledgement code + # keeps a code-agent from misfiring on a bare "OK". + query = ( + f"{chunk_msg}\n\n" + f"You have now read this part of the novel. Acknowledge it by " + f"emitting exactly this code and nothing else:\n" + f'\nfinal_answer("OK")\n' + ) + run_info = build_agent_run_info( + query, + conversation_history, + duty_prompt=INGEST_DUTY, + max_steps=args.ingest_max_steps, + context_manager_config=cm_config, + language="en", + agent_name="eventqa_reader", + agent_description="EventQA novel-reading agent", + ) + run_info.context_manager = shared_cm + + chunk_result = await run_agent_with_tracking(run_info, debug=args.debug) + ingest_main_input_tokens += chunk_result.total_input_tokens + ingest_main_output_tokens += chunk_result.total_output_tokens + token_counts = shared_cm.get_token_counts() + + # Store a clean (chunk, ack) pair. The agent's own reply carries no + # information and may be malformed, so a fixed "OK" is used instead. 
+ conversation_history.append(AgentHistory(role="user", content=chunk_msg)) + conversation_history.append(AgentHistory(role="assistant", content="OK")) + + return { + "cm_summary": shared_cm.export_summary(), + "conversation_history": conversation_history, + "token_counts": token_counts, + "cm_stats": shared_cm.get_all_compression_stats(), + "num_chunks": len(chunks), + "ingest_chars": len(context), + "ingest_main_input_tokens": ingest_main_input_tokens, + "ingest_main_output_tokens": ingest_main_output_tokens, + } + + +# ============ Probe runner (shared by both arms) ============ + +async def run_probes(items, history: list[AgentHistory], args) -> tuple[list[dict], dict]: + """Run each EventQA question against a frozen history snapshot. + + Compression is disabled — the history is already in its final form + (pre-compressed summary, or truncated novel). Each probe gets its own + deep copy and runs fully independently, so we can fan them out under + a bounded semaphore (--probe_concurrency). Result order is preserved + via asyncio.gather and matches the items order. + + Returns ``(rows, token_totals)`` where ``token_totals`` aggregates the + main-LLM input/output tokens across all probes (compression is disabled + in this arm so no compression cost is incurred here). 
+ """ + disabled_cm = ContextManagerConfig(enabled=False, token_threshold=10 ** 9) + concurrency = max(1, args.probe_concurrency) + sem = asyncio.Semaphore(concurrency) + + async def _one(it): + async with sem: + probe_history = copy.deepcopy(history) + run_info = build_agent_run_info( + it.question, + probe_history, + duty_prompt=PROBE_DUTY, + max_steps=args.probe_max_steps, + context_manager_config=disabled_cm, + language="en", + agent_name="eventqa_answerer", + agent_description="EventQA multiple-choice answering agent", + max_tokens=args.probe_max_tokens, + ) + result = await run_agent_with_tracking(run_info, debug=args.debug) + mcq = score_mcq(result.final_answer, it.options, it.gold) + return { + "qid": it.qid, + "answer": result.final_answer, + "selected_index": mcq.selected_index, + "selected": mcq.selected, + "gold": it.gold, + "gold_index": mcq.gold_index, + "correct": mcq.correct, + "score": mcq.score, + "match_type": mcq.match_type, + "_main_input_tokens": result.total_input_tokens, + "_main_output_tokens": result.total_output_tokens, + } + + rows = await asyncio.gather(*(_one(it) for it in items)) + totals = { + "main_input_tokens": sum(r.pop("_main_input_tokens", 0) for r in rows), + "main_output_tokens": sum(r.pop("_main_output_tokens", 0) for r in rows), + } + return rows, totals + + +# ============ Per-book run ============ + +async def run_book(book: EventQABook, args) -> dict: + """Run the baseline arm plus one compressed arm per summary schema.""" + # --question_start lets a salvaged / resumed run skip already-done qids. 
+ start = max(0, args.question_start) + end = start + args.limit if args.limit else None + items = book.items[start:end] if end is not None else book.items[start:] + schemas = resolve_schemas(args.summary_schema) + print(f"\n===== BOOK: {book.book_title} ({book.book_id}) =====") + if start > 0: + print(f" novel chars={len(book.context)} questions={len(items)} (qids {start}..{start+len(items)-1})") + else: + print(f" novel chars={len(book.context)} questions={len(items)}") + + # ---- Compressed arm(s): one ingest + probe pass per summary schema ---- + compressed: dict[str, dict] = {} + if not args.skip_compressed: + for schema_name in schemas: + cm_config = build_compressed_config(schema_name, args) + print(f" [compressed:{schema_name}] ingesting novel " + f"(chunk_chars={args.chunk_chars}, threshold={args.token_threshold}) ...") + compression = await ingest_and_compress(book, cm_config, args) + boundary = compression["cm_summary"].get("compression_boundary", {}) + print(f" [compressed:{schema_name}] {compression['num_chunks']} chunks " + f"ingested, compressed_pairs=" + f"{boundary.get('previous_compressed_pairs', 0)}") + + precompressed_history = build_precompressed_history( + compression["conversation_history"], compression["cm_summary"] + ) + print(f" [compressed:{schema_name}] running {len(items)} probes ...") + results, probe_tokens = await run_probes(items, precompressed_history, args) + compressed[schema_name] = { + "results": results, + "compression": compression, + "probe_tokens": probe_tokens, + } + + # ---- Baseline arm (schema-independent, runs once) ---- + baseline_results: list[dict] = [] + baseline_probe_tokens = {"main_input_tokens": 0, "main_output_tokens": 0} + if not args.skip_baseline: + truncated = book.context[:args.baseline_context_chars] + baseline_history = [ + AgentHistory( + role="user", + content=f"Here is the novel (it may be truncated):\n\n{truncated}", + ), + AgentHistory(role="assistant", content="OK, I have read the novel."), + ] + 
print(f" [baseline] novel truncated to {len(truncated)} chars, " + f"running {len(items)} probes ...") + baseline_results, baseline_probe_tokens = await run_probes( + items, baseline_history, args + ) + + # ---- Metrics ---- + def accuracy(rows: list[dict]) -> float: + return sum(r["score"] for r in rows) / len(rows) if rows else 0.0 + + baseline_acc = accuracy(baseline_results) + + compressed_report: dict[str, dict] = {} + for schema_name, data in compressed.items(): + c_acc = accuracy(data["results"]) + + memory_retention = None + if baseline_results and data["results"]: + memory_retention = c_acc / baseline_acc if baseline_acc > 0 else 0.0 + + token_reduction = None + tc = data["compression"]["token_counts"] + if tc: + unc = tc.get("last_uncompressed") or 0 + comp = tc.get("last_compressed") or 0 + if unc > 0: + token_reduction = 1 - comp / unc + + cm_summary = data["compression"]["cm_summary"] + compressed_report[schema_name] = { + "accuracy": c_acc, + "n": len(data["results"]), + "memory_retention": memory_retention, + "token_reduction": token_reduction, + "compression": { + "token_counts": data["compression"]["token_counts"], + "num_chunks": data["compression"]["num_chunks"], + "ingest_chars": data["compression"]["ingest_chars"], + "compression_boundary": cm_summary.get("compression_boundary"), + "previous_summary": cm_summary.get("previous_summary"), + }, + } + + cost = _build_cost(baseline_probe_tokens, compressed) + report = { + "book_id": book.book_id, + "book_title": book.book_title, + "novel_chars": len(book.context), + "num_questions": len(items), + "config": _build_run_config(args), + "baseline": {"accuracy": baseline_acc, "n": len(baseline_results)}, + "compressed": compressed_report, + "cost": cost, + "predictions": _merge_predictions(baseline_results, compressed), + } + + line = f" RESULT: baseline_acc={_fmt(baseline_acc)}" + for schema_name, c in compressed_report.items(): + line += (f" | {schema_name}: acc={_fmt(c['accuracy'])} " + 
f"retention={_fmt(c['memory_retention'])} " + f"token_reduction={_fmt(c['token_reduction'])}") + print(line) + base_total = cost["baseline"]["total_tokens"] + if base_total and cost.get("compressed"): + for schema_name, c in cost["compressed"].items(): + r = (cost.get("ratio") or {}).get(schema_name, {}).get("total") + print(f" COST[{schema_name}]: baseline_total={base_total:,} " + f"compressed_total={c['total_tokens']:,} " + f"(main={c['main_input_tokens'] + c['main_output_tokens']:,} " + f"+ compression={c['compression_input_tokens'] + c['compression_output_tokens']:,}) " + f"ratio={_fmt(r)}") + return report + + +def _build_run_config(args) -> dict: + """Snapshot the run's compression/ingest/probe/baseline params. + + Stored verbatim in summary.json so each output stands alone for + later analysis without grepping shell history for the command line. + """ + return { + "token_threshold": args.token_threshold, + "keep_recent_pairs": args.keep_recent_pairs, + "keep_recent_steps": args.keep_recent_steps, + "max_observation_length": args.max_observation_length, + "summary_schemas": resolve_schemas(args.summary_schema), + "chunk_chars": args.chunk_chars, + "max_ingest_chars": args.max_ingest_chars, + "ingest_max_steps": args.ingest_max_steps, + "probe_max_steps": args.probe_max_steps, + "probe_concurrency": args.probe_concurrency, + "probe_max_tokens": args.probe_max_tokens, + "baseline_context_chars": args.baseline_context_chars, + "limit": args.limit, + "question_start": args.question_start, + } + + +def _build_cost(baseline_probe_tokens: dict, compressed: dict[str, dict]) -> dict: + """Aggregate end-to-end token cost (main LLM + compression LLM) per arm. + + EventQA supports multiple schemas per book, so the compressed side is a + dict keyed by ``schema_name``. Baseline arm has zero compression cost + since compression is disabled in its probe-only runs. 
+ """ + base_main_in = baseline_probe_tokens.get("main_input_tokens", 0) + base_main_out = baseline_probe_tokens.get("main_output_tokens", 0) + baseline = { + "main_input_tokens": base_main_in, + "main_output_tokens": base_main_out, + "compression_input_tokens": 0, + "compression_output_tokens": 0, + "total_input_tokens": base_main_in, + "total_output_tokens": base_main_out, + "total_tokens": base_main_in + base_main_out, + } + + if not compressed: + return {"baseline": baseline, "compressed": None, "ratio": None} + + def _ratio(c: int, b: int): + return (c / b) if b > 0 else None + + compressed_costs: dict[str, dict] = {} + ratios: dict[str, dict] = {} + for schema_name, data in compressed.items(): + comp = data["compression"] + cm_stats = comp.get("cm_stats") or {} + probe = data.get("probe_tokens") or {} + + comp_main_in = comp.get("ingest_main_input_tokens", 0) + probe.get("main_input_tokens", 0) + comp_main_out = comp.get("ingest_main_output_tokens", 0) + probe.get("main_output_tokens", 0) + comp_cmp_in = cm_stats.get("total_input_tokens", 0) or 0 + comp_cmp_out = cm_stats.get("total_output_tokens", 0) or 0 + compressed_costs[schema_name] = { + "main_input_tokens": comp_main_in, + "main_output_tokens": comp_main_out, + "compression_input_tokens": comp_cmp_in, + "compression_output_tokens": comp_cmp_out, + "ingest_main_input_tokens": comp.get("ingest_main_input_tokens", 0), + "ingest_main_output_tokens": comp.get("ingest_main_output_tokens", 0), + "probe_main_input_tokens": probe.get("main_input_tokens", 0), + "probe_main_output_tokens": probe.get("main_output_tokens", 0), + "compression_calls": cm_stats.get("total_calls", 0), + "total_input_tokens": comp_main_in + comp_cmp_in, + "total_output_tokens": comp_main_out + comp_cmp_out, + "total_tokens": comp_main_in + comp_main_out + comp_cmp_in + comp_cmp_out, + } + ratios[schema_name] = { + "input": _ratio(compressed_costs[schema_name]["total_input_tokens"], baseline["total_input_tokens"]), + "output": 
_ratio(compressed_costs[schema_name]["total_output_tokens"], baseline["total_output_tokens"]), + "total": _ratio(compressed_costs[schema_name]["total_tokens"], baseline["total_tokens"]), + } + return {"baseline": baseline, "compressed": compressed_costs, "ratio": ratios} + + +def _aggregate_costs(costs: list[dict | None]) -> dict: + """Sum per-arm token totals across books for the top-level summary. + + Compressed side is keyed by schema, so the aggregate is also keyed by + schema; ratios are recomputed at the aggregate level from summed totals + rather than averaged from per-book ratios. + """ + base_keys = ["main_input_tokens", "main_output_tokens", + "compression_input_tokens", "compression_output_tokens", + "total_input_tokens", "total_output_tokens", "total_tokens"] + comp_keys = base_keys + ["ingest_main_input_tokens", "ingest_main_output_tokens", + "probe_main_input_tokens", "probe_main_output_tokens", + "compression_calls"] + baseline_agg = {k: 0 for k in base_keys} + compressed_agg: dict[str, dict] = {} + for c in costs: + if not c: + continue + for k in base_keys: + baseline_agg[k] += c.get("baseline", {}).get(k, 0) or 0 + for schema_name, sub in (c.get("compressed") or {}).items(): + slot = compressed_agg.setdefault(schema_name, {k: 0 for k in comp_keys}) + for k in comp_keys: + slot[k] += sub.get(k, 0) or 0 + + def _ratio(c: int, b: int): + return (c / b) if b > 0 else None + + ratios: dict[str, dict] = {} + for schema_name, sub in compressed_agg.items(): + ratios[schema_name] = { + "input": _ratio(sub["total_input_tokens"], baseline_agg["total_input_tokens"]), + "output": _ratio(sub["total_output_tokens"], baseline_agg["total_output_tokens"]), + "total": _ratio(sub["total_tokens"], baseline_agg["total_tokens"]), + } + return { + "baseline": baseline_agg, + "compressed": compressed_agg or None, + "ratio": ratios or None, + } + + +def _merge_predictions( + baseline_results: list[dict], + compressed: dict[str, dict], +) -> list[dict]: + """Join the 
baseline and per-schema compressed results by qid.""" + by_qid: dict[str, dict] = {} + + def _row(r: dict) -> dict: + return { + "answer": r["answer"], "selected": r["selected"], + "correct": r["correct"], "match_type": r["match_type"], + } + + for r in baseline_results: + entry = by_qid.setdefault(r["qid"], {"qid": r["qid"], "gold": r["gold"]}) + entry["baseline"] = _row(r) + for schema_name, data in compressed.items(): + for r in data["results"]: + entry = by_qid.setdefault(r["qid"], {"qid": r["qid"], "gold": r["gold"]}) + entry.setdefault("compressed", {})[schema_name] = _row(r) + return list(by_qid.values()) + + +# ============ Main ============ + +async def main(args): + data_path = args.data_file + if not os.path.isabs(data_path): + data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), data_path) + + if not os.path.exists(data_path): + print(f"ERROR: data file not found: {data_path}") + print(" Run 'python download_data.py' first to fetch the EventQA novels.") + return + + books = load_books(data_path) + if args.book_index is not None: + books = [books[args.book_index]] + elif args.book_limit: + books = books[:args.book_limit] + + outputs_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs") + os.makedirs(outputs_root, exist_ok=True) + + schemas = resolve_schemas(args.summary_schema) + + print(f"{'=' * 60}") + print(f"EventQA Benchmark (nexent agent)") + print(f"{'=' * 60}") + print(f" Books: {len(books)}") + print(f" Questions per book: {args.limit or 'all (100)'}") + print(f" Compressed schema(s): {', '.join(schemas)}") + print(f" Token threshold: {args.token_threshold}") + print(f" Chunk chars: {args.chunk_chars}") + print(f" Baseline ctx chars: {args.baseline_context_chars}") + print(f" Max ingest chars: {args.max_ingest_chars or 'full novel'}") + print(f"{'=' * 60}") + + reports = [] + for book in books: + report = await run_book(book, args) + reports.append(report) + + book_dir = os.path.join(outputs_root, 
book.book_id) + os.makedirs(book_dir, exist_ok=True) + with open(os.path.join(book_dir, "predictions.jsonl"), "w", encoding="utf-8") as f: + for pred in report["predictions"]: + f.write(json.dumps(pred, ensure_ascii=False) + "\n") + book_summary = {k: v for k, v in report.items() if k != "predictions"} + with open(os.path.join(book_dir, "summary.json"), "w", encoding="utf-8") as f: + json.dump(book_summary, f, ensure_ascii=False, indent=2, default=str) + + # ---- Cross-book aggregate ---- + def _avg(values): + vals = [v for v in values if v is not None] + return sum(vals) / len(vals) if vals else None + + per_schema = {} + for schema_name in schemas: + books_with = [r for r in reports if schema_name in r["compressed"]] + if not books_with: + continue + per_schema[schema_name] = { + "avg_compressed_accuracy": _avg( + [r["compressed"][schema_name]["accuracy"] for r in books_with]), + "avg_memory_retention": _avg( + [r["compressed"][schema_name]["memory_retention"] for r in books_with]), + "avg_token_reduction": _avg( + [r["compressed"][schema_name]["token_reduction"] for r in books_with]), + } + + # Cross-book cost aggregate: sum absolute tokens across books so the + # top-level number reflects the full benchmark wallet, not an average. 
+ cost_agg = _aggregate_costs([r.get("cost") for r in reports]) + + summary = { + "total_books": len(reports), + "questions_per_book": args.limit or 100, + "summary_schemas": schemas, + "config": _build_run_config(args), + "avg_baseline_accuracy": _avg([r["baseline"]["accuracy"] for r in reports]), + "per_schema": per_schema, + "cost": cost_agg, + "per_book": { + r["book_id"]: { + "book_title": r["book_title"], + "baseline_accuracy": r["baseline"]["accuracy"], + "compressed": { + s: { + "accuracy": c["accuracy"], + "memory_retention": c["memory_retention"], + "token_reduction": c["token_reduction"], + } + for s, c in r["compressed"].items() + }, + "cost": r.get("cost"), + } + for r in reports + }, + } + summary_name = ( + f"summary_{args.book_index}.json" + if args.book_index is not None + else "summary.json" + ) + summary_path = os.path.join(outputs_root, summary_name) + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(summary, f, ensure_ascii=False, indent=2, default=str) + + print(f"\n{'=' * 60}") + print(f"EventQA finished. 
{len(reports)} book(s).") + print(f" avg baseline accuracy: {_fmt(summary['avg_baseline_accuracy'])}") + for schema_name, m in per_schema.items(): + print(f" [compressed:{schema_name}] acc={_fmt(m['avg_compressed_accuracy'])} " + f"retention={_fmt(m['avg_memory_retention'])} " + f"token_reduction={_fmt(m['avg_token_reduction'])}") + if cost_agg.get("compressed") and cost_agg["baseline"]["total_tokens"]: + b = cost_agg["baseline"] + print(f" cost (sum across books):") + print(f" baseline main={b['main_input_tokens']:>12,} in / {b['main_output_tokens']:>10,} out total={b['total_tokens']:,}") + for schema_name, c in cost_agg["compressed"].items(): + r = cost_agg["ratio"][schema_name] + print(f" compressed[{schema_name}] main={c['main_input_tokens']:>12,} in / {c['main_output_tokens']:>10,} out " + f"compression={c['compression_input_tokens']:,} in / {c['compression_output_tokens']:,} out total={c['total_tokens']:,}") + print(f" ratio input={_fmt(r['input'])} output={_fmt(r['output'])} total={_fmt(r['total'])}") + print(f" Summary saved to {summary_path}") + print(f"{'=' * 60}") + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Run the EventQA benchmark with the nexent agent") + parser.add_argument("--data_file", type=str, default="data/eventqa_full.jsonl", + help="EventQA jsonl produced by download_data.py") + parser.add_argument("--book_limit", type=int, default=None, + help="Limit number of books (default: all 5)") + parser.add_argument("--book_index", type=int, default=None, + help="Evaluate only the book at this index (0-4); overrides --book_limit") + parser.add_argument("--limit", type=int, default=None, + help="Limit questions per book (default: all 100)") + parser.add_argument("--question_start", type=int, default=0, + help="Skip the first N questions (for resuming an interrupted run)") + parser.add_argument("--token_threshold", type=int, default=12000, + help="ContextManager token threshold for the 
compressed arm") + parser.add_argument("--keep_recent_pairs", type=int, default=2, + help="ContextManager keep_recent_pairs") + parser.add_argument("--keep_recent_steps", type=int, default=4, + help="ContextManager keep_recent_steps") + parser.add_argument("--max_observation_length", type=int, default=20000, + help="ContextManager max_observation_length") + parser.add_argument("--summary_schema", type=str, default="default", + choices=["default", "narrative", "both"], + help="Summary template the compressed arm uses: 'default' " + "(production agent-task schema), 'narrative' " + "(novel-oriented schema), or 'both' (run each and compare)") + parser.add_argument("--chunk_chars", type=int, default=20000, + help="Characters per novel chunk fed during ingest") + parser.add_argument("--baseline_context_chars", type=int, default=480000, + help="Characters of the novel fed to the baseline arm " + "(truncate to the model's context window)") + parser.add_argument("--max_ingest_chars", type=int, default=0, + help="Cap the novel length ingested in the compressed arm " + "(0 = full novel; use a small value for smoke tests)") + parser.add_argument("--ingest_max_steps", type=int, default=2, + help="Max agent steps per ingest (acknowledge) run") + parser.add_argument("--probe_max_steps", type=int, default=3, + help="Max agent steps for each question-answering probe") + parser.add_argument("--probe_concurrency", type=int, default=5, + help="Bounded asyncio concurrency for probe LLM calls " + "(default 5; set 1 for serial). Only affects probes — " + "ingest stays serial since compressions are ordered.") + parser.add_argument("--probe_max_tokens", type=int, default=4096, + help="Per-call completion output cap for probe LLM calls " + "(default 4096 — matches SDK production default). 
" + "Lower to 1024-2048 for tighter loop containment.") + parser.add_argument("--skip_baseline", action="store_true", + help="Skip the baseline arm (compressed-only iteration)") + parser.add_argument("--skip_compressed", action="store_true", + help="Skip the compressed arm (baseline-only iteration)") + parser.add_argument("--debug", action="store_true", help="Enable agent debug output") + return parser + + +if __name__ == "__main__": + asyncio.run(main(_build_arg_parser().parse_args())) diff --git a/sdk/benchmark/eventqa_eval/salvage_trace.py b/sdk/benchmark/eventqa_eval/salvage_trace.py new file mode 100644 index 000000000..fd09e267c --- /dev/null +++ b/sdk/benchmark/eventqa_eval/salvage_trace.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +"""Salvage probe results from an interrupted ctx_debugger trace. + +When an EventQA run dies mid-flight (network drop, OOM, …) the +``run_eventqa.py`` process never reaches the report-writing block, so +``outputs//summary.json`` is missing. The ctx_debugger trace however +has every probe's input and final_answer captured. This script walks the trace +and reconstructs per-probe results — compressed arm first, baseline arm second +— matching turns to items by their ORDER within each arm (probes run +sequentially through ``book.items`` with no retries). + +It does NOT re-run any LLM call. It only reads the trace. + +Usage: + python salvage_trace.py [--out ] [--schema default|narrative] + +Default output dir: ``outputs/_salvage/`` (sibling of the regular run +output dir). The merge script can combine this with a resumed run later. 
+""" +import argparse +import json +import os +import re +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import paths # noqa: F401 + +from dataset import load_books +from eval_utils import score_mcq + + +def _load_events(path: str): + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + yield json.loads(line) + + +def _split_turns(events): + """Split flat events into one segment per eventqa_answerer agent_init.""" + turns = [] + current = None + for e in events: + if (e.get("event") == "agent_init" + and e.get("data", {}).get("agent_name") == "eventqa_answerer"): + if current is not None: + turns.append(current) + current = {"events": []} + elif current is not None: + current["events"].append(e) + if current is not None: + turns.append(current) + return turns + + +def _classify_arm(turn) -> str: + """compressed vs baseline — detect by the 'Here is the novel' marker.""" + for ev in turn["events"]: + if ev.get("event") != "llm_call_begin": + continue + for m in ev.get("data", {}).get("input_messages", []) or []: + text = m.get("text") or m.get("preview") or "" + if "Here is the novel" in text: + return "baseline" + break + return "compressed" + + +def _extract_answer(turn): + """Return the last final_answer tool call's return_preview, or None.""" + ans = None + for ev in turn["events"]: + if (ev.get("event") == "tool_call_end" + and ev.get("data", {}).get("tool") == "final_answer"): + ans = ev.get("data", {}).get("return_preview") + return ans + + +def _extract_final_summary(events): + """Walk the trace for the LAST compress_end with a non-empty summary_after.""" + summary = None + token_counts = None + boundary = None + num_chunks = None + for e in events: + if e.get("event") == "compress_end": + d = e.get("data", {}) or {} + s = d.get("summary_after") + if s and "previous_summary" in (s or {}): + ps = 
s.get("previous_summary") + if ps: + summary = ps + boundary = s.get("compression_boundary") + tc = d.get("token_counts") + if tc: + token_counts = tc + # Count ingest rounds = eventqa_reader agent_init events + num_chunks = sum( + 1 for e in events + if e.get("event") == "agent_init" + and e.get("data", {}).get("agent_name") == "eventqa_reader" + ) + return { + "previous_summary": summary, + "compression_boundary": boundary, + "token_counts": token_counts, + "num_chunks": num_chunks, + } + + +def salvage(trace_path: str, book_index: int, schema: str) -> dict: + events = list(_load_events(trace_path)) + turns = _split_turns(events) + + # Detect arm boundary + first_baseline = next( + (i for i, t in enumerate(turns) if _classify_arm(t) == "baseline"), + len(turns), + ) + compressed_turns = turns[:first_baseline] + baseline_turns = turns[first_baseline:] + + books = load_books( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "data", + "eventqa_full.jsonl", + ) + ) + book = books[book_index] + items = book.items + + def score_turns(arm_turns): + out = [] + for k, t in enumerate(arm_turns): + if k >= len(items): + break + it = items[k] + ans = _extract_answer(t) + if ans is None: + out.append({ + "qid": it.qid, "gold": it.gold, "answer": None, + "selected": "", "selected_index": -1, + "gold_index": it.options.index(it.gold) if it.gold in it.options else -1, + "correct": False, "score": 0.0, "match_type": "no_answer", + }) + else: + mcq = score_mcq(ans, it.options, it.gold) + out.append({ + "qid": it.qid, "gold": it.gold, "answer": ans, + "selected": mcq.selected, "selected_index": mcq.selected_index, + "gold_index": mcq.gold_index, + "correct": mcq.correct, "score": mcq.score, + "match_type": mcq.match_type, + }) + return out + + compressed = score_turns(compressed_turns) + baseline = score_turns(baseline_turns) + + comp_info = _extract_final_summary(events) + + def accuracy(rs): + return sum(r["score"] for r in rs) / len(rs) if rs else 0.0 + + bacc = 
accuracy(baseline) + cacc = accuracy(compressed) + retention = None + if baseline and compressed: + retention = cacc / bacc if bacc > 0 else 0.0 + token_reduction = None + if comp_info["token_counts"]: + tc = comp_info["token_counts"] + unc = tc.get("last_uncompressed") or 0 + comp = tc.get("last_compressed") or 0 + if unc > 0: + token_reduction = 1 - comp / unc + + return { + "book_id": book.book_id, + "book_title": book.book_title, + "novel_chars": len(book.context), + "num_questions": len(items), + "schema_salvaged": schema, + "compressed_turns": len(compressed_turns), + "baseline_turns": len(baseline_turns), + "baseline": {"accuracy": bacc, "n": len(baseline), "qid_range": [0, len(baseline) - 1] if baseline else None}, + "compressed": { + schema: { + "accuracy": cacc, + "n": len(compressed), + "memory_retention": retention, + "token_reduction": token_reduction, + "compression": comp_info, + } + }, + "predictions_compressed": compressed, + "predictions_baseline": baseline, + } + + +def main(): + ap = argparse.ArgumentParser(description="Salvage probe results from a ctx_debugger trace.") + ap.add_argument("trace", help="Path to ctx_debugger JSONL trace.") + ap.add_argument("--book_index", type=int, default=0) + ap.add_argument("--schema", default="narrative", + help="Which schema this trace's compressed arm used (default/narrative).") + ap.add_argument("--out_dir", default=None, + help="Output dir; default outputs/_salvage/") + args = ap.parse_args() + + report = salvage(args.trace, args.book_index, args.schema) + + out_dir = args.out_dir or os.path.join( + os.path.dirname(os.path.abspath(__file__)), "outputs", + f"{report['book_id']}_salvage", + ) + os.makedirs(out_dir, exist_ok=True) + with open(os.path.join(out_dir, "summary.json"), "w", encoding="utf-8") as f: + json.dump({k: v for k, v in report.items() + if k not in ("predictions_compressed", "predictions_baseline")}, + f, ensure_ascii=False, indent=2, default=str) + with open(os.path.join(out_dir, 
"predictions_compressed.jsonl"), "w", encoding="utf-8") as f: + for r in report["predictions_compressed"]: + f.write(json.dumps(r, ensure_ascii=False) + "\n") + with open(os.path.join(out_dir, "predictions_baseline.jsonl"), "w", encoding="utf-8") as f: + for r in report["predictions_baseline"]: + f.write(json.dumps(r, ensure_ascii=False) + "\n") + + print(f"Salvage written to {out_dir}") + print(f" compressed: {report['compressed_turns']} turns " + f"(acc={report['compressed'][args.schema]['accuracy']:.3f})") + print(f" baseline: {report['baseline_turns']} turns " + f"(acc={report['baseline']['accuracy']:.3f}) — " + f"qids 0..{report['baseline']['n'] - 1} done, {100 - report['baseline']['n']} remaining") + + +if __name__ == "__main__": + main() diff --git a/sdk/benchmark/longmemeval_eval/.gitignore b/sdk/benchmark/longmemeval_eval/.gitignore new file mode 100644 index 000000000..502e11088 --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/.gitignore @@ -0,0 +1,9 @@ +# Source datasets (regenerate via download_data.py) +data/ + +# Generated benchmark results (regenerated by each run) +outputs/ + +# Runtime artifacts +__pycache__/ +nexent_context_metrics.log diff --git a/sdk/benchmark/longmemeval_eval/README.md b/sdk/benchmark/longmemeval_eval/README.md new file mode 100644 index 000000000..805222f7a --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/README.md @@ -0,0 +1,177 @@ +# longmemeval_eval — LongMemEval (S*) Long Memory Evaluation + +Based on **LongMemEval (S\*)** dataset (from MemoryAgentBench, arXiv 2507.05257v3's "5 long conversations sharing 60 questions" reconstruction of original LongMemEval arXiv 2410.10813), evaluate **context compression**'s impact on **multi-session conversation long memory**. + +> Evaluation methods and dimensions follow the rest of `sdk/benchmark`: **baseline (no compression) vs compressed (compression)** comparison. This file covers **dataset format**, **how to run** and **what each parameter means**. 
+ +--- + +## Dataset + +| Dimension | Value | +|---|---| +| Long conversations | 5 (shared) | +| Per-conversation tokens | ~355K | +| Per-conversation atomic sessions | ~107–116 (user/assistant multi-turn pairs) | +| Per-conversation questions | 60 | +| Total questions | **300** | +| Question types (6 categories) | `multi-session` (75) · `temporal-reasoning` (75) · `single-session-user` (45) · `knowledge-update` (45) · `single-session-assistant` (30) · `single-session-preference` (30) | +| Answers | Free text (LLM-as-judge scoring) | + +Data comes from the `Accurate_Retrieval` split of HuggingFace `ai-hyz/MemoryAgentBench`, +rows with `metadata.source == "longmemeval_s*"`. **Same parquet as `eventqa_eval`**. + +Each row contains: +- `context` — The entire conversation flattened to plain text (truncated and fed to the baseline arm) +- `haystack_sessions` — Nested structure `list[60] of list[~2] of list[turn]`, + `turn = {role, content, has_answer}`. `dataset.py` flattens it to a single-layer + `list[session]`, concatenated in chronological order. +- `questions` / `answers` / `question_types` / `question_dates` / `question_ids` + +--- + +## Prerequisites + +- Use the backend's venv: `nexent/backend/.venv/bin/python` (already contains `huggingface_hub`, + `pyarrow`, `openai`) +- Credentials for the LLM under test: `LLM_API_KEY` / `LLM_MODEL_NAME` / `LLM_API_URL` in the repo-root `nexent/.env` +- **Judge model (optional)**: `JUDGE_API_KEY` / `JUDGE_MODEL_NAME` / `JUDGE_API_URL` + - If left empty, the judge automatically falls back to `LLM_*` (the same model is both tested and judge — simple, but has + self-judging bias) + - With a separate config, the judge only runs the scoring step; the volume is small, so a stronger model is recommended to avoid bias +- The commands below assume you are in this directory (`sdk/benchmark/longmemeval_eval/`) + +--- + +## Two Steps + +### Step 1: Download Data + +```bash +python download_data.py +``` + +Writes to `data/longmemeval_s_star.jsonl` (~30MB). 
+ +### Step 2: Run Evaluation + +```bash +# Smoke test: 1 conversation, 1 question, only ingest first 6 sessions (must trigger compression) +python run_longmemeval.py \ + --dialogue_index 0 --limit 1 \ + --max_ingest_sessions 6 --sessions_per_batch 2 \ + --token_threshold 3000 --keep_recent_pairs 1 \ + --baseline_context_chars 40000 + +# Default sample: 5 conversations × 20 questions = 100 questions +python run_longmemeval.py + +# Full: 5 conversations × 60 questions = 300 questions +python run_longmemeval.py --limit 60 +``` + +--- + +## `run_longmemeval.py` Parameter Details + +### Evaluation Scope + +| Parameter | Default | Meaning | +|---|---|---| +| `--data_file` | `data/longmemeval_s_star.jsonl` | Data file produced by the download script | +| `--dialogue_limit` | All (5) | Only run the first N conversations | +| `--dialogue_index` | None | Only run the conversation at this index (0-4); overrides `--dialogue_limit` | +| `--limit` | **20** | Only run the first N questions of each conversation (**default sample**; set 60 for the full 300 questions) | + +### Compressed Arm: ContextManager Configuration + +| Parameter | Default | Meaning | +|---|---|---| +| `--token_threshold` | `12000` | Compression triggers once the cumulative context exceeds this token count; smaller = more aggressive | +| `--keep_recent_pairs` | `2` | How many (user, assistant) pairs to retain uncompressed at the tail | +| `--keep_recent_steps` | `4` | Number of steps ContextManager retains within a turn | +| `--max_observation_length` | `20000` | Character limit for a single observation | +| `--sessions_per_batch` | `4` | How many atomic sessions per ingest batch (larger = fewer compression rounds, larger per-round input) | +| `--max_ingest_sessions` | `0` (entire) | Compressed arm only takes the first N sessions, **for smoke testing** — a small value speeds runs up drastically | +| `--ingest_max_steps` | `2` | Max steps of the ingest agent (it only triggers compression, so 2 steps suffice) | + +### Scoring Arm + +| Parameter | Default | Meaning | +|---|---|---| +| 
`--probe_max_steps` | `3` | Max steps of each probe agent | + +Scoring uses LLM-as-judge: + +- Each question_type maps to one judge prompt (`eval_utils.py`); the three single-session types share the default prompt +- The judge model is resolved by env priority: `JUDGE_*` → `LLM_*` → fallback substring match +- The judge's actual behavior is recorded in the `judge_label` field of `outputs/.../predictions.jsonl` + (`yes` / `no` / `unknown` / `error` / `fallback_*`) + +### Baseline Arm + +`longmemeval_s*` conversations are ~1.6M chars (~355K tokens) each, so they **must be truncated when the model window is not large enough**. + +| Parameter | Default | Meaning | +|---|---|---| +| `--baseline_context_chars` | `480000` | Character limit of the baseline feed (estimate it from the model window) | + +### Debug / Skip + +| Parameter | Default | Meaning | +|---|---|---| +| `--skip_baseline` | No | Skip baseline (save time when iterating compression params) | +| `--skip_compressed` | No | Skip compressed arm | +| `--debug` | No | Print agent debug output | + +--- + +## Evaluation Dimensions and Output + +Both arms answer the **same questions**, so the retention ratio is clean: + +``` +memory_retention = compressed_accuracy / baseline_accuracy +token_reduction = 1 - last_compressed_tokens / last_uncompressed_tokens +``` + +`token_reduction` uses the same method as `manual_cases` / `eventqa_eval`: a single-point sample of +`ContextManager.get_token_counts()` taken at the compressed arm's last ingest turn. + +**New dimension (vs `eventqa_eval`)**: Report retention bucketed by the 6 question_types, +to locate which memory categories compression hurts. + +No Continuation evaluation — LongMemEval questions are independent. 
+ +Output is written to `outputs/`: + +``` +outputs/ +├── {dialogue_id}/ +│ ├── predictions.jsonl # Per-question baseline vs compressed answers + judge labels +│ └── summary.json # Single-conversation metrics + complete compression summary + per-category +└── summary.json # Cross-conversation aggregate + per-category grouped metrics +``` + +--- + +## Differences from eventqa_eval (Key) + +| | eventqa_eval | longmemeval_eval | +|--|--|--| +| History format | Novel continuous prose, char-chunked into `[Novel part X]` envelope | **Real multi-session conversation**, chunked by session; turns go into history as-is as `(user, assistant)` pairs | +| Scoring | Six-choice MCQ → string match | **Free text → LLM-as-judge** (different prompts per type) | +| Default schema | `default` / `narrative` / `both` | **Only SDK default schema** (tests production behavior first; schema experiments pending) | +| Probe independence | ✓ | ✓ | +| Dimensions | Single accuracy + token_reduction | accuracy + token_reduction + **per-category retention** (6 types) | + +--- + +## Notes + +- **Self-judging bias**: The default fallback uses the same `LLM_*` model as judge, so numbers are biased optimistic. + For a formal comparison, use a separate `JUDGE_*` config (an external, stronger model such as GPT-4o). +- **Sample vs full**: Default `--limit 20` (5 × 20 = 100 questions) is suitable for iteration; for formal numbers + run `--limit 60` (5 × 60 = 300 questions). +- **Ingest is a fixed cost**: It is unrelated to `--limit` — the entire conversation history must be compressed once. +- If the HF download fails (e.g. SSL jitter), the download script automatically falls back to the local cache. \ No newline at end of file diff --git a/sdk/benchmark/longmemeval_eval/dataset.py b/sdk/benchmark/longmemeval_eval/dataset.py new file mode 100644 index 000000000..fe2b0574c --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/dataset.py @@ -0,0 +1,146 @@ +"""Dataset loader for LongMemEval (S*) from MemoryAgentBench. + +Loads the ``longmemeval_s_star.jsonl`` produced by ``download_data.py``. 
Each +line is one long multi-session dialogue: 60 "session groups" (each a list of +1-3 atomic sessions, ~100-120 atomic sessions total) plus 60 free-text +questions tagged with one of six categories. + +The released ``haystack_sessions`` field has a nested shape:: + + haystack_sessions: list[60] # one entry per question slot + -> list[N] # 1-3 chronological sessions + -> list[turn] # the turns of one session + -> {role, content, has_answer} # role is "user"|"assistant" + +This module flattens that to a single ordered list of atomic sessions for +ingest, and exposes the per-question metadata (question_type, question_date) +so the runner can group retention by ability category. +""" +import ast +import json +from dataclasses import dataclass, field +from typing import List, Dict, Any + + +@dataclass +class LongMemEvalTurn: + """One conversation turn inside a haystack session.""" + role: str # "user" or "assistant" + content: str + has_answer: bool = False # True if this turn carries evidence for some Q + + +@dataclass +class LongMemEvalSession: + """One atomic chat session (list of turns).""" + turns: List[LongMemEvalTurn] + + +@dataclass +class LongMemEvalItem: + """One free-text question with its gold answer and ability category.""" + qid: str + question: str # raw question text, fed verbatim to the agent + answer: str # gold answer, unwrapped from the stringified list + question_type: str # one of: single-session-user / -assistant / + # -preference / multi-session / knowledge-update / + # temporal-reasoning (no "_abs" in S*) + question_date: str = "" # "Current Date" anchor; already in question text + + +@dataclass +class LongMemEvalDialogue: + """One LongMemEval (S*) dialogue: shared haystack + its 60 questions.""" + dialogue_index: int + dialogue_id: str + context: str # flattened-text haystack (for baseline) + sessions: List[LongMemEvalSession] = field(default_factory=list) + items: List[LongMemEvalItem] = field(default_factory=list) + + +def 
_unwrap_answer(raw) -> str: + """The dataset stores answers as a stringified list, e.g. "['50']". + + Parse it back to the bare string. Falls back to ``str(raw)`` if the field + is already a plain string or any other shape. + """ + if isinstance(raw, (list, tuple)): + return str(raw[0]) if raw else "" + if isinstance(raw, str): + s = raw.strip() + if s.startswith("[") and s.endswith("]"): + try: + parsed = ast.literal_eval(s) + if isinstance(parsed, (list, tuple)) and parsed: + return str(parsed[0]) + except (ValueError, SyntaxError): + pass + return s + return str(raw) + + +def _flatten_sessions(haystack_sessions: List[Any]) -> List[LongMemEvalSession]: + """Flatten the nested ``list[group] -> list[session] -> list[turn]`` shape + into a single chronological list of atomic sessions. + + The outer 60 groups are indexed by question slot but are also the natural + chronological order of the dialogue, so concatenating their inner sessions + preserves chronology. + """ + flat: List[LongMemEvalSession] = [] + for group in haystack_sessions or []: + if not isinstance(group, list): + continue + for session in group: + if not isinstance(session, list): + continue + turns: List[LongMemEvalTurn] = [] + for t in session: + if not isinstance(t, dict): + continue + turns.append(LongMemEvalTurn( + role=str(t.get("role", "")), + content=str(t.get("content", "")), + has_answer=bool(t.get("has_answer", False)), + )) + if turns: + flat.append(LongMemEvalSession(turns=turns)) + return flat + + +def load_dialogues(jsonl_path: str) -> List[LongMemEvalDialogue]: + """Load all LongMemEval (S*) dialogues from a downloaded ``*.jsonl`` file.""" + dialogues: List[LongMemEvalDialogue] = [] + + with open(jsonl_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + row = json.loads(line) + + questions = row.get("questions") or [] + answers = row.get("answers") or [] + qtypes = row.get("question_types") or [] + qdates = row.get("question_dates") or 
[] + qids = row.get("question_ids") or [] + + items: List[LongMemEvalItem] = [] + for i, q in enumerate(questions): + items.append(LongMemEvalItem( + qid=str(qids[i]) if i < len(qids) else f"q{i}", + question=str(q), + answer=_unwrap_answer(answers[i] if i < len(answers) else ""), + question_type=str(qtypes[i]) if i < len(qtypes) else "", + question_date=str(qdates[i]) if i < len(qdates) else "", + )) + + dialogues.append(LongMemEvalDialogue( + dialogue_index=int(row.get("dialogue_index", len(dialogues))), + dialogue_id=str(row.get("dialogue_id", f"dialogue{len(dialogues)}")), + context=str(row.get("context") or ""), + sessions=_flatten_sessions(row.get("haystack_sessions") or []), + items=items, + )) + + return dialogues diff --git a/sdk/benchmark/longmemeval_eval/download_data.py b/sdk/benchmark/longmemeval_eval/download_data.py new file mode 100644 index 000000000..3b8e9867d --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/download_data.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +"""Download LongMemEval (S*) data from MemoryAgentBench on HuggingFace. + +LongMemEval (S*) lives in the ``Accurate_Retrieval`` split of +``ai-hyz/MemoryAgentBench``. Rows whose ``metadata.source`` equals +``"longmemeval_s*"`` carry the 5 long dialogues (~355K tokens each, ~1.6M +characters of flattened conversation) plus 60 free-text questions per dialogue +(300 total). + +This script downloads the split's parquet, extracts the five ``longmemeval_s*`` +rows, and writes them to ``data/longmemeval_s_star.jsonl`` (one dialogue per +line; the literal ``*`` in the source name is sanitized to ``_star`` for the +filename). + +Usage: + python download_data.py + +Requires ``huggingface_hub`` and ``pyarrow`` in the active environment (already +present in ``backend/.venv`` via the ``benchmark`` extra). 
+""" +import argparse +import json +import os + +HF_REPO = "ai-hyz/MemoryAgentBench" +HF_FILE = "data/Accurate_Retrieval-00000-of-00001.parquet" +SOURCE_TAG = "longmemeval_s*" +OUTPUT_BASENAME = "longmemeval_s_star" + + +def main(output_dir: str): + from huggingface_hub import hf_hub_download + import pyarrow.parquet as pq + + print(f"Downloading {HF_FILE} from {HF_REPO} ...") + try: + path = hf_hub_download(HF_REPO, HF_FILE, repo_type="dataset") + except Exception as exc: + # SSL hiccups during HEAD revalidation are common; fall back to whatever + # is already in the local HF cache. + print(f" online fetch failed ({type(exc).__name__}); " + f"retrying with local_files_only=True ...") + path = hf_hub_download(HF_REPO, HF_FILE, repo_type="dataset", + local_files_only=True) + print(f" cached at: {path}") + + rows = pq.read_table(path).to_pylist() + dialogues = [r for r in rows if (r.get("metadata") or {}).get("source") == SOURCE_TAG] + if not dialogues: + sources = sorted({(r.get("metadata") or {}).get("source") for r in rows}) + raise SystemExit(f"No rows with source={SOURCE_TAG!r}. Available: {sources}") + + os.makedirs(output_dir, exist_ok=True) + out_path = os.path.join(output_dir, f"{OUTPUT_BASENAME}.jsonl") + + with open(out_path, "w", encoding="utf-8") as f: + for i, row in enumerate(dialogues): + md = row.get("metadata") or {} + record = { + "dialogue_index": i, + "dialogue_id": f"{OUTPUT_BASENAME}_d{i}", + "source": SOURCE_TAG, + # Flattened-text rendering of the haystack, useful for the + # baseline arm (truncate-to-window fallback). + "context": row.get("context") or "", + # The structured haystack: list[60] of list[2] of list[turn], + # where each turn = {role, content, has_answer}. 
+ "haystack_sessions": md.get("haystack_sessions") or [], + "questions": row.get("questions") or [], + "answers": row.get("answers") or [], + "question_types": md.get("question_types") or [], + "question_dates": md.get("question_dates") or [], + "question_ids": md.get("question_ids") or [], + } + ctx = record["context"] + n_sess_groups = len(record["haystack_sessions"]) + n_atomic = sum(len(g) for g in record["haystack_sessions"] + if isinstance(g, list)) + f.write(json.dumps(record, ensure_ascii=False) + "\n") + print(f" dialogue {i}: ctx_chars={len(ctx):>9d} " + f"session_groups={n_sess_groups} atomic_sessions={n_atomic} " + f"questions={len(record['questions'])}") + + print(f"\nWrote {len(dialogues)} dialogues to {out_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download LongMemEval (S*) data from MemoryAgentBench") + parser.add_argument( + "--output_dir", + type=str, + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"), + help="Directory to write the .jsonl file", + ) + args = parser.parse_args() + main(output_dir=args.output_dir) diff --git a/sdk/benchmark/longmemeval_eval/eval_utils.py b/sdk/benchmark/longmemeval_eval/eval_utils.py new file mode 100644 index 000000000..5f920da2b --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/eval_utils.py @@ -0,0 +1,201 @@ +"""LLM-as-judge grading for LongMemEval (S*) free-text answers. + +LongMemEval answers are free-text and cannot be scored by exact/F1 matching +reliably (e.g. "50 hours" vs "around 50 hours per week" are both correct). +The original benchmark uses GPT-4o as a judge with per-category prompts +(reported ~97% agreement with humans). 
We replicate that pattern, but allow +the judge to be either: + + * a dedicated model configured via ``JUDGE_API_KEY`` / ``JUDGE_MODEL_NAME`` + / ``JUDGE_API_URL`` env vars (recommended — avoids self-judging bias); + * the same ``LLM_*`` model used as the agent (fallback when JUDGE_* is + unset — keeps "no extra credentials" as the default). + +The judge is called via an OpenAI-compatible chat-completions endpoint, which +covers the production DeepSeek / GLM / OpenAI / OpenRouter / Anthropic-proxy +endpoints we use elsewhere in nexent. +""" +import os +import re +from dataclasses import dataclass +from typing import Optional + +# ============ Per-category judge prompts ============ +# Modeled on LongMemEval's evaluate_qa.py. Each prompt frames the task slightly +# differently to match the ability being tested: +# - single-session-*: substantive containment of the key fact +# - multi-session: aggregation / comparison must match +# - knowledge-update: must reflect the LATEST value the user stated +# - temporal-reasoning: must match the time/date implied by the gold +# +# The judge returns "yes" or "no" as the very first token of its reply, which +# we then regex-extract. Any reasoning AFTER "yes"/"no" is allowed but ignored. + +_JUDGE_HEADER = ( + "You are an evaluator judging whether a model's answer correctly responds " + "to a question about a long multi-session chat conversation. You will be " + "given the question, the gold (reference) answer, and the model's " + "hypothesis answer. Return a single word — 'yes' if the hypothesis is " + "correct, 'no' otherwise — followed (optionally) by a one-line reason." +) + +_PROMPT_DEFAULT = _JUDGE_HEADER + ( + "\n\nCriterion: the hypothesis is correct if it conveys the same factual " + "content as the gold answer. Minor wording / unit differences are fine. " + "Extra correct context is fine; extra contradictions or hallucinated facts " + "make it wrong." 
+) + +_PROMPT_KNOWLEDGE_UPDATE = _JUDGE_HEADER + ( + "\n\nThis is a KNOWLEDGE-UPDATE question. The user revised their stated " + "information at some point during the conversation. The hypothesis is " + "correct ONLY if it reflects the MOST RECENT value, matching the gold. " + "An answer that gives the older, superseded value is WRONG even if that " + "older value was once true." +) + +_PROMPT_TEMPORAL = _JUDGE_HEADER + ( + "\n\nThis is a TEMPORAL-REASONING question. The hypothesis is correct only " + "if the time / date / duration it states matches the gold. Different " + "phrasings of the same time are fine ('Friday' == '2023/05/26 (Fri)'); " + "answering with the wrong day/week/month is wrong." +) + +_PROMPT_MULTI_SESSION = _JUDGE_HEADER + ( + "\n\nThis is a MULTI-SESSION question. The gold answer combines facts " + "stated across several different sessions. The hypothesis is correct only " + "if the aggregated / compared result matches the gold; mentioning only " + "one of the underlying facts is NOT enough." +) + +# Single-session variants share the default contract. +_PROMPT_BY_TYPE = { + "knowledge-update": _PROMPT_KNOWLEDGE_UPDATE, + "temporal-reasoning": _PROMPT_TEMPORAL, + "multi-session": _PROMPT_MULTI_SESSION, + "single-session-user": _PROMPT_DEFAULT, + "single-session-assistant": _PROMPT_DEFAULT, + "single-session-preference":_PROMPT_DEFAULT, +} + + +@dataclass +class JudgeResult: + correct: bool + score: float # 1.0 if correct else 0.0 + judge_label: str # normalized "yes" / "no" / "error" + judge_raw: str # raw judge output (for debugging) + + +# ============ Judge configuration ============ +# JUDGE_* takes precedence; fall back to LLM_* so the script runs with whatever +# credentials are already in .env. + +def _judge_config() -> tuple[str, str, str]: + """Return (api_key, model_name, base_url) for the judge model. + + Self-hosted vLLM/sglang endpoints commonly accept any api_key (or none). 
+ When the env value is empty but ``model`` + ``url`` are set, fall back to + the placeholder ``"EMPTY"`` so the OpenAI client still constructs. + """ + # Use explicit precedence: JUDGE_* keys override LLM_* keys. + api_key = (os.getenv("JUDGE_API_KEY") + or os.getenv("LLM_API_KEY") or "").strip() + model = (os.getenv("JUDGE_MODEL_NAME") + or os.getenv("LLM_MODEL_NAME") or "").strip() + url = (os.getenv("JUDGE_API_URL") + or os.getenv("LLM_API_URL") or "").strip() + if not api_key and model and url: + api_key = "EMPTY" + return api_key, model, url + + +_YES_RE = re.compile(r"^\s*(yes|correct|true)\b", re.IGNORECASE) +_NO_RE = re.compile(r"^\s*(no|incorrect|false|wrong)\b", re.IGNORECASE) + + +def _parse_judge(raw: str) -> str: + """Map the judge's free-text reply to 'yes' / 'no' / 'unknown'.""" + if not raw: + return "unknown" + if _YES_RE.match(raw): + return "yes" + if _NO_RE.match(raw): + return "no" + # last-chance scan: a 'yes' / 'no' anywhere near the start + head = raw[:64].lower() + if "yes" in head and "no" not in head[:head.find("yes")]: + return "yes" + if "no" in head: + return "no" + return "unknown" + + +def judge_answer(question: str, gold: str, hypothesis: str, + question_type: str) -> JudgeResult: + """Grade a single answer with an LLM judge. + + Falls back to a string-containment check if the judge endpoint is not + configured — better a noisy signal than a crash. The fallback is logged + via the ``judge_label`` field ("fallback_match" / "fallback_miss"). + """ + api_key, model, url = _judge_config() + + if not (api_key and model and url): + # Fallback: case-insensitive substring containment. Coarse but works + # as a sanity baseline when no LLM judge is configured. 
+ gold_norm = (gold or "").strip().lower() + hyp_norm = (hypothesis or "").strip().lower() + correct = bool(gold_norm) and gold_norm in hyp_norm + return JudgeResult( + correct=correct, + score=1.0 if correct else 0.0, + judge_label="fallback_match" if correct else "fallback_miss", + judge_raw="(no judge model configured; used substring fallback)", + ) + + system_prompt = _PROMPT_BY_TYPE.get(question_type, _PROMPT_DEFAULT) + user_prompt = ( + f"Question:\n{question}\n\n" + f"Gold answer:\n{gold}\n\n" + f"Hypothesis answer:\n{hypothesis}\n\n" + f"Is the hypothesis correct? Answer 'yes' or 'no' first, then " + f"(optionally) a brief reason." + ) + + try: + from openai import OpenAI + client = OpenAI(api_key=api_key, base_url=url) + resp = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + temperature=0.0, + # Generous budget — Qwen3-style thinking models route reasoning + # into the same token budget, so 128 was too tight (content came + # back empty). The actual "yes"/"no" reply is still short. 
+ max_tokens=1024, + ) + raw = (resp.choices[0].message.content or "").strip() + except Exception as exc: + return JudgeResult( + correct=False, score=0.0, judge_label="error", + judge_raw=f"{type(exc).__name__}: {exc}", + ) + + label = _parse_judge(raw) + correct = label == "yes" + return JudgeResult( + correct=correct, + score=1.0 if correct else 0.0, + judge_label=label, + judge_raw=raw, + ) + + +def judge_configured() -> bool: + """True iff a dedicated JUDGE_* model is set (not the LLM_* fallback).""" + return bool(os.getenv("JUDGE_API_KEY") and os.getenv("JUDGE_MODEL_NAME") + and os.getenv("JUDGE_API_URL")) diff --git a/sdk/benchmark/longmemeval_eval/run_longmemeval.py b/sdk/benchmark/longmemeval_eval/run_longmemeval.py new file mode 100644 index 000000000..e1ec6625b --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/run_longmemeval.py @@ -0,0 +1,827 @@ +#!/usr/bin/env python3 +"""Run the LongMemEval (S*) benchmark with the nexent agent. + +LongMemEval (S*) from MemoryAgentBench gives 5 long multi-session dialogues +(~355K tokens each) with 60 free-text questions per dialogue (300 total), +labelled with six ability categories: + + * single-session-user / -assistant / -preference (information extraction) + * multi-session (multi-session reasoning) + * knowledge-update (keep the latest value) + * temporal-reasoning (dates / durations) + +IMPORTANT: LongMemEval contains MANY INDEPENDENT TOPICS (job search, work hours, +bereavement support, travel, shopping, etc.), not a single continuous task. +The default "active_task" schema fails here — it discards older topics. +Use --summary_schema multi_topic to preserve all topics. + +This script keeps the same evaluation method as the rest of ``sdk/benchmark`` +(baseline vs compressed, retention as the ratio of the two) but adapted to a +multi-session conversational memory task: + + * Baseline — the dialogue's flattened text is truncated to the model's + context window and fed whole, with NO compression. 
Questions + whose evidence lies past the truncation point are expected + to fail. + * Compressed — the FULL multi-session chat history is streamed in as real + (user, assistant) turn pairs; the real ContextManager + incrementally compresses it. The 60 questions are then run + as memory probes against the pre-compressed context. + +Both arms answer the SAME questions, so the retention ratio is clean: + + memory_retention = compressed_accuracy / baseline_accuracy + token_reduction = 1 - last_compressed_tokens / last_uncompressed_tokens + +Continuation is not measured — LongMemEval questions are independent. + +Default scope is the full benchmark: 5 dialogues x 60 questions = 300 Q. +Pass --limit 20 (or any smaller value) to sample for quick iteration. + +Usage: + python download_data.py # one-time: fetch the dataset + python run_longmemeval.py --dialogue_index 0 --limit 1 # smoke + python run_longmemeval.py --limit 20 # 100-Q sample + python run_longmemeval.py # full 300 Q + +Results are written to outputs// and outputs/summary.json. 
+""" +import argparse +import asyncio +import copy +import json +import os +import sys +from collections import defaultdict + +# ---- Path setup (mirrors eventqa_eval/run_eventqa.py) ---- +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import paths # noqa: F401 - side effect: adds sdk/, backend/ to sys.path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from agent_runner import ( + build_agent_run_info, + run_agent_with_tracking, + ContextManagerConfig, +) +from nexent.core.agents.agent_model import AgentHistory +from nexent.core.agents.agent_context import ContextManager + +from dataset import load_dialogues, LongMemEvalDialogue, LongMemEvalSession +from eval_utils import judge_answer, judge_configured +from summary_schemas import build_multi_topic_config + + +# ============ Agent duty prompts ============ + +INGEST_DUTY = ( + "You are reading a long multi-session chat conversation between a user and " + "an assistant. Earlier sessions are already in your conversation history " + "in their original chronological order. The next message will simply ask " + "you to acknowledge the latest batch of sessions you have just seen. " + "Do not analyze or summarize anything. Acknowledge by calling final_answer " + 'with the single word: OK' +) + +PROBE_DUTY = ( + "You are answering a question about a long multi-session chat conversation " + "between a user and an assistant. The entire conversation history (or a " + "compressed summary of it) is in your context. The user is asking you to " + "recall some fact from that history.\n" + "Rules:\n" + "- Answer the question DIRECTLY in a single short sentence — give the " + "fact, not your reasoning.\n" + "- If the user has updated some information over time, answer with the " + "MOST RECENT value, not an older superseded one.\n" + "- Answer in a SINGLE step. 
Your first and only code block must call " + "final_answer directly.\n" + '\nfinal_answer("")\n' +) + + +# ============ Pre-compressed history builder ============ +# Same shape as eventqa_eval/run_eventqa.py:build_precompressed_history. +# Kept self-contained so this directory does not depend on eventqa_eval. + +def build_precompressed_history( + frozen_history: list[AgentHistory], + cm_summary: dict, +) -> list[AgentHistory]: + """Replace the compressed prefix pairs with a single summary message, + then append the retained tail pairs verbatim. + """ + boundary = cm_summary.get("compression_boundary", {}) + compressed_pairs = boundary.get("previous_compressed_pairs", 0) + compressed_entries = compressed_pairs * 2 + + summary_text = cm_summary.get("previous_summary") or "" + if not summary_text or compressed_entries == 0: + return list(frozen_history) + + precompressed = [ + AgentHistory( + role="user", + content=f"Summary of earlier sessions in this conversation:\n{summary_text}", + ), + ] + if compressed_entries < len(frozen_history): + precompressed.extend(frozen_history[compressed_entries:]) + return precompressed + + +# ============ Session batching ============ +# The haystack is already 100-120 atomic (user,assistant,...) sessions per +# dialogue. We group N sessions per "ingest batch" — the agent runs once per +# batch to trigger compression, and the real turns are appended directly to +# the conversation history so the chat structure is preserved (unlike the +# novel-prose envelope used by eventqa_eval). + +def turns_to_pairs(session: LongMemEvalSession) -> list[tuple[str, str]]: + """Squash a session's turns into well-formed (user, assistant) pairs. + + Real sessions occasionally have consecutive turns of the same role + (rare but observed). We coalesce runs of same-role turns into one, then + pair user with the following assistant. 
A trailing unpaired user turn is + paired with an empty assistant ack; a trailing assistant turn without a + preceding user is dropped (no information attribution). + """ + coalesced: list[tuple[str, str]] = [] # (role, content) + for t in session.turns: + if coalesced and coalesced[-1][0] == t.role: + coalesced[-1] = (t.role, coalesced[-1][1] + "\n" + t.content) + else: + coalesced.append((t.role, t.content)) + + pairs: list[tuple[str, str]] = [] + i = 0 + while i < len(coalesced): + role, content = coalesced[i] + if role == "user": + if i + 1 < len(coalesced) and coalesced[i + 1][0] == "assistant": + pairs.append((content, coalesced[i + 1][1])) + i += 2 + else: + pairs.append((content, "")) + i += 1 + else: + # leading assistant turn with no user — skip + i += 1 + return pairs + + +def session_chunk_text(session_pairs: list[tuple[str, str]]) -> str: + """Render one batch of session pairs as a plain text block (for the + chunk_chars / token-budget estimate displayed in logs).""" + parts: list[str] = [] + for u, a in session_pairs: + parts.append(f"USER: {u}\nASSISTANT: {a}") + return "\n\n".join(parts) + + +# ============ Compressed arm: ingest + compress ============ + +async def ingest_and_compress(dialogue: LongMemEvalDialogue, + cm_config: ContextManagerConfig, args) -> dict: + """Stream the real chat history into the conversation_history list and + let ContextManager compress it incrementally. + + Unlike EventQA (which wraps novel prose as [Novel part X] envelopes), + LongMemEval turns are real user/assistant pairs and go into history as + such. A tiny no-op agent run per batch is the compression trigger. 
+ """ + sessions = dialogue.sessions + if args.max_ingest_sessions > 0: + sessions = sessions[:args.max_ingest_sessions] + + shared_cm = ContextManager(config=cm_config, max_steps=args.ingest_max_steps) + conversation_history: list[AgentHistory] = [] + token_counts = None + ingest_main_input_tokens = 0 + ingest_main_output_tokens = 0 + + batch_size = max(args.sessions_per_batch, 1) + batches: list[list[LongMemEvalSession]] = [ + sessions[i:i + batch_size] + for i in range(0, len(sessions), batch_size) + ] + + for batch_idx, batch in enumerate(batches): + # 1. Append the real turns of this batch to conversation_history. + new_pairs_count = 0 + for sess in batch: + for user_text, assistant_text in turns_to_pairs(sess): + conversation_history.append(AgentHistory(role="user", content=user_text)) + conversation_history.append( + AgentHistory(role="assistant", content=assistant_text or "OK") + ) + new_pairs_count += 1 + + # 2. Trigger compression with a no-op acknowledgement query. + ack_query = ( + f"You have just been shown sessions {batch_idx * batch_size + 1}" + f"-{batch_idx * batch_size + len(batch)} of {len(sessions)} in " + f"the conversation history. 
Acknowledge by emitting exactly:\n" + f'\nfinal_answer("OK")\n' + ) + run_info = build_agent_run_info( + ack_query, + conversation_history, + duty_prompt=INGEST_DUTY, + max_steps=args.ingest_max_steps, + context_manager_config=cm_config, + language="en", + agent_name="longmemeval_reader", + agent_description="LongMemEval ingest agent", + ) + run_info.context_manager = shared_cm + batch_result = await run_agent_with_tracking(run_info, debug=args.debug) + ingest_main_input_tokens += batch_result.total_input_tokens + ingest_main_output_tokens += batch_result.total_output_tokens + token_counts = shared_cm.get_token_counts() + + return { + "cm_summary": shared_cm.export_summary(), + "conversation_history": conversation_history, + "token_counts": token_counts, + "cm_stats": shared_cm.get_all_compression_stats(), + "num_batches": len(batches), + "num_sessions": len(sessions), + "num_pairs": len(conversation_history) // 2, + "ingest_main_input_tokens": ingest_main_input_tokens, + "ingest_main_output_tokens": ingest_main_output_tokens, + } + + +# ============ Probe runner ============ + +async def run_probes(items, history: list[AgentHistory], args) -> tuple[list[dict], dict]: + """Run each LongMemEval question against a frozen history snapshot. + + Compression is disabled — the history is already in its final form + (pre-compressed summary, or truncated context). Each probe gets its own + deep copy and runs fully independently, so we fan them out under a + bounded semaphore (--probe_concurrency). Result order is preserved via + asyncio.gather and matches the items order. + + Returns ``(rows, token_totals)`` where ``token_totals`` aggregates the + main-LLM input/output tokens across all probes (compression is disabled + in this arm so no compression cost is incurred here). 
+ """ + disabled_cm = ContextManagerConfig(enabled=False, token_threshold=10 ** 9) + concurrency = max(1, args.probe_concurrency) + sem = asyncio.Semaphore(concurrency) + + async def _one(it): + async with sem: + probe_history = copy.deepcopy(history) + run_info = build_agent_run_info( + it.question, + probe_history, + duty_prompt=PROBE_DUTY, + max_steps=args.probe_max_steps, + context_manager_config=disabled_cm, + language="en", + agent_name="longmemeval_answerer", + agent_description="LongMemEval question-answering agent", + max_tokens=args.probe_max_tokens, + ) + result = await run_agent_with_tracking(run_info, debug=args.debug) + verdict = judge_answer( + question=it.question, + gold=it.answer, + hypothesis=result.final_answer, + question_type=it.question_type, + ) + return { + "qid": it.qid, + "question_type": it.question_type, + "answer": result.final_answer, + "gold": it.answer, + "correct": verdict.correct, + "score": verdict.score, + "judge_label": verdict.judge_label, + "judge_raw": verdict.judge_raw, + "_main_input_tokens": result.total_input_tokens, + "_main_output_tokens": result.total_output_tokens, + } + + rows = await asyncio.gather(*(_one(it) for it in items)) + totals = { + "main_input_tokens": sum(r.pop("_main_input_tokens", 0) for r in rows), + "main_output_tokens": sum(r.pop("_main_output_tokens", 0) for r in rows), + } + return rows, totals + + +# ============ Per-dialogue run ============ + +def _fmt(x) -> str: + return "n/a" if x is None else f"{x:.3f}" + + +def _aggregate_costs(costs: list[dict | None]) -> dict: + """Sum per-arm token totals across dialogues for the top-level summary.""" + base_keys = ["main_input_tokens", "main_output_tokens", + "compression_input_tokens", "compression_output_tokens", + "total_input_tokens", "total_output_tokens", "total_tokens"] + comp_keys = base_keys + ["ingest_main_input_tokens", "ingest_main_output_tokens", + "probe_main_input_tokens", "probe_main_output_tokens", + "compression_calls"] + baseline_agg 
= {k: 0 for k in base_keys} + compressed_agg = {k: 0 for k in comp_keys} + have_compressed = False + for c in costs: + if not c: + continue + for k in base_keys: + baseline_agg[k] += c.get("baseline", {}).get(k, 0) or 0 + if c.get("compressed"): + have_compressed = True + for k in comp_keys: + compressed_agg[k] += c.get("compressed", {}).get(k, 0) or 0 + + def _ratio(c: int, b: int): + return (c / b) if b > 0 else None + + ratio = None + if have_compressed: + ratio = { + "input": _ratio(compressed_agg["total_input_tokens"], baseline_agg["total_input_tokens"]), + "output": _ratio(compressed_agg["total_output_tokens"], baseline_agg["total_output_tokens"]), + "total": _ratio(compressed_agg["total_tokens"], baseline_agg["total_tokens"]), + } + return { + "baseline": baseline_agg, + "compressed": compressed_agg if have_compressed else None, + "ratio": ratio, + } + + +def _build_run_config(args) -> dict: + """Snapshot the run's compression/ingest/probe/baseline params. + + Stored verbatim in summary.json so each output stands alone for + later analysis without grepping shell history for the command line. + """ + return { + "token_threshold": args.token_threshold, + "keep_recent_pairs": args.keep_recent_pairs, + "keep_recent_steps": args.keep_recent_steps, + "max_observation_length": args.max_observation_length, + "summary_schema": args.summary_schema, + "sessions_per_batch": args.sessions_per_batch, + "max_ingest_sessions": args.max_ingest_sessions, + "ingest_max_steps": args.ingest_max_steps, + "probe_max_steps": args.probe_max_steps, + "probe_concurrency": args.probe_concurrency, + "probe_max_tokens": args.probe_max_tokens, + "baseline_context_chars": args.baseline_context_chars, + "limit": args.limit, + } + + +def _build_cost(baseline_probe_tokens: dict, compressed_data: dict | None) -> dict: + """Aggregate end-to-end token cost (main LLM + compression LLM) per arm. + + Baseline arm has zero compression cost since compression is disabled in its + probe-only runs. 
The compressed arm sums ingest main-LLM tokens, probe + main-LLM tokens, and the compression LLM tokens reported by the shared + ContextManager. + """ + base_main_in = baseline_probe_tokens.get("main_input_tokens", 0) + base_main_out = baseline_probe_tokens.get("main_output_tokens", 0) + baseline = { + "main_input_tokens": base_main_in, + "main_output_tokens": base_main_out, + "compression_input_tokens": 0, + "compression_output_tokens": 0, + "total_input_tokens": base_main_in, + "total_output_tokens": base_main_out, + "total_tokens": base_main_in + base_main_out, + } + + if compressed_data is None: + return {"baseline": baseline, "compressed": None, "ratio": None} + + comp = compressed_data["compression"] + cm_stats = comp.get("cm_stats") or {} + probe = compressed_data.get("probe_tokens") or {} + + comp_main_in = comp.get("ingest_main_input_tokens", 0) + probe.get("main_input_tokens", 0) + comp_main_out = comp.get("ingest_main_output_tokens", 0) + probe.get("main_output_tokens", 0) + comp_cmp_in = cm_stats.get("total_input_tokens", 0) or 0 + comp_cmp_out = cm_stats.get("total_output_tokens", 0) or 0 + compressed = { + "main_input_tokens": comp_main_in, + "main_output_tokens": comp_main_out, + "compression_input_tokens": comp_cmp_in, + "compression_output_tokens": comp_cmp_out, + "ingest_main_input_tokens": comp.get("ingest_main_input_tokens", 0), + "ingest_main_output_tokens": comp.get("ingest_main_output_tokens", 0), + "probe_main_input_tokens": probe.get("main_input_tokens", 0), + "probe_main_output_tokens": probe.get("main_output_tokens", 0), + "compression_calls": cm_stats.get("total_calls", 0), + "total_input_tokens": comp_main_in + comp_cmp_in, + "total_output_tokens": comp_main_out + comp_cmp_out, + "total_tokens": comp_main_in + comp_main_out + comp_cmp_in + comp_cmp_out, + } + + def _ratio(c: int, b: int): + return (c / b) if b > 0 else None + + ratio = { + "input": _ratio(compressed["total_input_tokens"], baseline["total_input_tokens"]), + "output": 
_ratio(compressed["total_output_tokens"], baseline["total_output_tokens"]), + "total": _ratio(compressed["total_tokens"], baseline["total_tokens"]), + } + return {"baseline": baseline, "compressed": compressed, "ratio": ratio} + + +def _category_accuracy(rows: list[dict]) -> dict[str, dict]: + """Bucket scores by question_type and return per-category {n, accuracy}.""" + bucket: dict[str, list[float]] = defaultdict(list) + for r in rows: + bucket[r["question_type"]].append(r["score"]) + out: dict[str, dict] = {} + for qt, scores in bucket.items(): + out[qt] = { + "n": len(scores), + "accuracy": sum(scores) / len(scores) if scores else 0.0, + } + return out + + +async def run_dialogue(dialogue: LongMemEvalDialogue, args) -> dict: + """Run baseline + compressed arms for one LongMemEval dialogue.""" + items = dialogue.items[:args.limit] if args.limit else dialogue.items + print(f"\n===== DIALOGUE: {dialogue.dialogue_id} =====") + print(f" ctx_chars={len(dialogue.context)} sessions={len(dialogue.sessions)} " + f"questions={len(items)}") + + # ---- Compressed arm ---- + compressed_data = None + if not args.skip_compressed: + cm_config = ContextManagerConfig( + enabled=True, + token_threshold=args.token_threshold, + keep_recent_pairs=args.keep_recent_pairs, + keep_recent_steps=args.keep_recent_steps, + max_observation_length=args.max_observation_length, + ) + # Override with multi-topic schema if requested + if args.summary_schema == "multi_topic": + build_multi_topic_config(cm_config) + schema_label = "multi_topic" + else: + schema_label = "default" + print(f" [compressed:{schema_label}] ingesting " + f"(sessions_per_batch={args.sessions_per_batch}, " + f"threshold={args.token_threshold}) ...") + compression = await ingest_and_compress(dialogue, cm_config, args) + boundary = compression["cm_summary"].get("compression_boundary", {}) + print(f" [compressed:{schema_label}] {compression['num_batches']} batches, " + f"{compression['num_pairs']} pairs ingested, " + 
f"compressed_pairs={boundary.get('previous_compressed_pairs', 0)}") + + precompressed_history = build_precompressed_history( + compression["conversation_history"], compression["cm_summary"] + ) + print(f" [compressed:{schema_label}] running {len(items)} probes ...") + compressed_results, compressed_probe_tokens = await run_probes( + items, precompressed_history, args + ) + compressed_data = { + "results": compressed_results, + "compression": compression, + "schema": schema_label, + "probe_tokens": compressed_probe_tokens, + } + + # ---- Baseline arm ---- + baseline_results: list[dict] = [] + baseline_probe_tokens = {"main_input_tokens": 0, "main_output_tokens": 0} + if not args.skip_baseline: + truncated = dialogue.context[:args.baseline_context_chars] + baseline_history = [ + AgentHistory( + role="user", + content=( + "Here is the full multi-session chat history between you and " + "the user (it may be truncated):\n\n" + truncated + ), + ), + AgentHistory(role="assistant", content="OK, I have read it."), + ] + print(f" [baseline] context truncated to {len(truncated)} chars, " + f"running {len(items)} probes ...") + baseline_results, baseline_probe_tokens = await run_probes( + items, baseline_history, args + ) + + # ---- Metrics ---- + def accuracy(rows: list[dict]) -> float: + return sum(r["score"] for r in rows) / len(rows) if rows else 0.0 + + baseline_acc = accuracy(baseline_results) + compressed_acc = accuracy(compressed_data["results"]) if compressed_data else 0.0 + memory_retention = None + if baseline_results and compressed_data: + memory_retention = (compressed_acc / baseline_acc) if baseline_acc > 0 else 0.0 + + token_reduction = None + if compressed_data and compressed_data["compression"]["token_counts"]: + tc = compressed_data["compression"]["token_counts"] + unc = tc.get("last_uncompressed") or 0 + comp = tc.get("last_compressed") or 0 + if unc > 0: + token_reduction = 1 - comp / unc + + per_cat_baseline = _category_accuracy(baseline_results) + 
per_cat_compressed = ( + _category_accuracy(compressed_data["results"]) if compressed_data else {} + ) + + # Per-category retention: compressed_acc / baseline_acc within each type. + per_cat_retention: dict[str, dict] = {} + all_types = set(per_cat_baseline) | set(per_cat_compressed) + for qt in sorted(all_types): + b = per_cat_baseline.get(qt, {}).get("accuracy") + c = per_cat_compressed.get(qt, {}).get("accuracy") + per_cat_retention[qt] = { + "n": per_cat_baseline.get(qt, {}).get("n") or per_cat_compressed.get(qt, {}).get("n", 0), + "baseline_accuracy": b, + "compressed_accuracy": c, + "memory_retention": (c / b) if (b is not None and c is not None and b > 0) else None, + } + + cm_summary = compressed_data["compression"]["cm_summary"] if compressed_data else {} + cost = _build_cost(baseline_probe_tokens, compressed_data) + report = { + "dialogue_id": dialogue.dialogue_id, + "ctx_chars": len(dialogue.context), + "num_sessions": len(dialogue.sessions), + "num_questions": len(items), + "summary_schema": compressed_data.get("schema", "none") if compressed_data else "none", + "config": _build_run_config(args), + "baseline": {"accuracy": baseline_acc, "n": len(baseline_results)}, + "compressed": ( + None if compressed_data is None else { + "accuracy": compressed_acc, + "n": len(compressed_data["results"]), + "memory_retention": memory_retention, + "token_reduction": token_reduction, + "token_counts": compressed_data["compression"]["token_counts"], + "num_batches": compressed_data["compression"]["num_batches"], + "num_sessions_ingested": compressed_data["compression"]["num_sessions"], + "compression_boundary": cm_summary.get("compression_boundary"), + "previous_summary": cm_summary.get("previous_summary"), + } + ), + "cost": cost, + "per_category": per_cat_retention, + "predictions": _merge_predictions(baseline_results, compressed_data), + } + + line = (f" RESULT: baseline_acc={_fmt(baseline_acc)} " + f"compressed_acc={_fmt(compressed_acc)} " + 
f"retention={_fmt(memory_retention)} " + f"token_reduction={_fmt(token_reduction)} " + f"schema={compressed_data.get('schema', 'none') if compressed_data else 'none'}") + print(line) + if cost.get("compressed") and cost.get("baseline"): + b = cost["baseline"]["total_tokens"] + c = cost["compressed"]["total_tokens"] + r = cost["ratio"]["total"] + print(f" COST: baseline_total={b:,} compressed_total={c:,} " + f"(main={cost['compressed']['main_input_tokens'] + cost['compressed']['main_output_tokens']:,} " + f"+ compression={cost['compressed']['compression_input_tokens'] + cost['compressed']['compression_output_tokens']:,}) " + f"ratio={_fmt(r)}") + return report + + +def _merge_predictions(baseline_results: list[dict], + compressed_data: dict) -> list[dict]: + """Join baseline and compressed predictions by qid.""" + by_qid: dict[str, dict] = {} + + def _row(r: dict) -> dict: + return { + "answer": r["answer"], + "correct": r["correct"], + "judge_label": r["judge_label"], + } + + for r in baseline_results: + entry = by_qid.setdefault(r["qid"], { + "qid": r["qid"], "question_type": r["question_type"], "gold": r["gold"], + }) + entry["baseline"] = _row(r) + if compressed_data: + for r in compressed_data["results"]: + entry = by_qid.setdefault(r["qid"], { + "qid": r["qid"], "question_type": r["question_type"], "gold": r["gold"], + }) + entry["compressed"] = _row(r) + return list(by_qid.values()) + + +# ============ Main ============ + +async def main(args): + data_path = args.data_file + if not os.path.isabs(data_path): + data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), data_path) + if not os.path.exists(data_path): + print(f"ERROR: data file not found: {data_path}") + print(" Run 'python download_data.py' first.") + return + + dialogues = load_dialogues(data_path) + if args.dialogue_index is not None: + dialogues = [dialogues[args.dialogue_index]] + elif args.dialogue_limit: + dialogues = dialogues[:args.dialogue_limit] + + outputs_root = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs") + os.makedirs(outputs_root, exist_ok=True) + + print(f"{'=' * 60}") + print(f"LongMemEval (S*) Benchmark (nexent agent)") + print(f"{'=' * 60}") + print(f" Dialogues: {len(dialogues)}") + print(f" Questions per dialogue: {args.limit if args.limit else 'all (60)'}") + print(f" Token threshold: {args.token_threshold}") + print(f" Sessions per batch: {args.sessions_per_batch}") + print(f" Keep recent pairs: {args.keep_recent_pairs}") + print(f" Summary schema: {args.summary_schema}") + print(f" Baseline ctx chars: {args.baseline_context_chars}") + print(f" Max ingest sessions: {args.max_ingest_sessions or 'full'}") + print(f" Judge: {'dedicated JUDGE_*' if judge_configured() else 'main LLM_*'}") + print(f"{'=' * 60}") + + reports = [] + for dialogue in dialogues: + report = await run_dialogue(dialogue, args) + reports.append(report) + + d_dir = os.path.join(outputs_root, dialogue.dialogue_id) + os.makedirs(d_dir, exist_ok=True) + with open(os.path.join(d_dir, "predictions.jsonl"), "w", encoding="utf-8") as f: + for pred in report["predictions"]: + f.write(json.dumps(pred, ensure_ascii=False) + "\n") + d_summary = {k: v for k, v in report.items() if k != "predictions"} + with open(os.path.join(d_dir, "summary.json"), "w", encoding="utf-8") as f: + json.dump(d_summary, f, ensure_ascii=False, indent=2, default=str) + + # ---- Cross-dialogue aggregate ---- + def _avg(values): + vals = [v for v in values if v is not None] + return sum(vals) / len(vals) if vals else None + + overall_baseline = _avg([r["baseline"]["accuracy"] for r in reports]) + overall_compressed = _avg([ + r["compressed"]["accuracy"] for r in reports if r["compressed"] + ]) + overall_retention = _avg([ + r["compressed"]["memory_retention"] for r in reports if r["compressed"] + ]) + overall_token_red = _avg([ + r["compressed"]["token_reduction"] for r in reports if r["compressed"] + ]) + + # Cross-dialogue per-category aggregate. 
+ per_cat_agg: dict[str, dict] = {} + all_types: set[str] = set() + for r in reports: + all_types.update(r["per_category"].keys()) + for qt in sorted(all_types): + baseline_vals = [r["per_category"][qt]["baseline_accuracy"] + for r in reports if qt in r["per_category"] + and r["per_category"][qt]["baseline_accuracy"] is not None] + compressed_vals = [r["per_category"][qt]["compressed_accuracy"] + for r in reports if qt in r["per_category"] + and r["per_category"][qt]["compressed_accuracy"] is not None] + retention_vals = [r["per_category"][qt]["memory_retention"] + for r in reports if qt in r["per_category"] + and r["per_category"][qt]["memory_retention"] is not None] + per_cat_agg[qt] = { + "avg_baseline_accuracy": _avg(baseline_vals), + "avg_compressed_accuracy": _avg(compressed_vals), + "avg_memory_retention": _avg(retention_vals), + } + + # Cross-dialogue cost aggregate: sum absolute tokens across dialogues so + # the top-level number reflects the full benchmark wallet, not an average. 
+ cost_agg = _aggregate_costs([r.get("cost") for r in reports]) + + summary = { + "total_dialogues": len(reports), + "questions_per_dialogue": args.limit if args.limit else 60, + "summary_schema": args.summary_schema, + "config": _build_run_config(args), + "judge": "JUDGE_*" if judge_configured() else "LLM_*", + "avg_baseline_accuracy": overall_baseline, + "avg_compressed_accuracy": overall_compressed, + "avg_memory_retention": overall_retention, + "avg_token_reduction": overall_token_red, + "cost": cost_agg, + "per_category": per_cat_agg, + "per_dialogue": { + r["dialogue_id"]: { + "baseline_accuracy": r["baseline"]["accuracy"], + "compressed": ( + None if r["compressed"] is None else { + "accuracy": r["compressed"]["accuracy"], + "memory_retention": r["compressed"]["memory_retention"], + "token_reduction": r["compressed"]["token_reduction"], + } + ), + "cost": r.get("cost"), + } + for r in reports + }, + } + summary_name = ( + f"summary_{args.dialogue_index}.json" + if args.dialogue_index is not None + else "summary.json" + ) + summary_path = os.path.join(outputs_root, summary_name) + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(summary, f, ensure_ascii=False, indent=2, default=str) + + print(f"\n{'=' * 60}") + print(f"LongMemEval finished. 
{len(reports)} dialogue(s).") + print(f" avg baseline accuracy: {_fmt(overall_baseline)}") + print(f" avg compressed accuracy: {_fmt(overall_compressed)}") + print(f" avg memory_retention: {_fmt(overall_retention)}") + print(f" avg token_reduction: {_fmt(overall_token_red)}") + print(f" per-category:") + for qt, m in per_cat_agg.items(): + print(f" {qt:<28} baseline={_fmt(m['avg_baseline_accuracy'])} " + f"compressed={_fmt(m['avg_compressed_accuracy'])} " + f"retention={_fmt(m['avg_memory_retention'])}") + if cost_agg.get("compressed") and cost_agg["baseline"]["total_tokens"]: + b = cost_agg["baseline"] + c = cost_agg["compressed"] + print(f" cost (sum across dialogues):") + print(f" baseline main={b['main_input_tokens']:>12,} in / {b['main_output_tokens']:>10,} out total={b['total_tokens']:,}") + print(f" compressed main={c['main_input_tokens']:>12,} in / {c['main_output_tokens']:>10,} out " + f"compression={c['compression_input_tokens']:,} in / {c['compression_output_tokens']:,} out total={c['total_tokens']:,}") + print(f" ratio input={_fmt(cost_agg['ratio']['input'])} " + f"output={_fmt(cost_agg['ratio']['output'])} total={_fmt(cost_agg['ratio']['total'])}") + print(f" Summary saved to {summary_path}") + print(f"{'=' * 60}") + + +def _build_arg_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description="Run the LongMemEval (S*) benchmark") + p.add_argument("--data_file", type=str, default="data/longmemeval_s_star.jsonl") + p.add_argument("--dialogue_limit", type=int, default=None, + help="Run only first N dialogues (default: all 5)") + p.add_argument("--dialogue_index", type=int, default=None, + help="Run only the dialogue at this index (0-4); overrides --dialogue_limit") + p.add_argument("--limit", type=int, default=60, + help="Questions per dialogue (default 60 — full; set lower for sampling)") + p.add_argument("--summary_schema", type=str, default="default", + choices=["default", "multi_topic"], + help="Summary schema: 'default' 
(active_task) or 'multi_topic' (preserve all topics)") + # ContextManager + p.add_argument("--token_threshold", type=int, default=12000) + p.add_argument("--keep_recent_pairs", type=int, default=2, + help="Recent (user, assistant) pairs preserved uncompressed " + "(default 2 — matches SDK ContextManagerConfig). " + "Larger values keep more raw turns out of compression — " + "e.g. 4 is a safer choice when probes ask about the " + "MOST RECENT session, but inflates last_compressed tokens.") + p.add_argument("--keep_recent_steps", type=int, default=4) + p.add_argument("--max_observation_length", type=int, default=20000) + # Ingest shaping + p.add_argument("--sessions_per_batch", type=int, default=4, + help="How many haystack sessions to ingest per agent run " + "(higher = fewer compression rounds, larger inputs)") + p.add_argument("--max_ingest_sessions", type=int, default=0, + help="Cap ingested sessions (0 = full ~111 sessions; " + "small value for smoke tests)") + p.add_argument("--ingest_max_steps", type=int, default=2) + p.add_argument("--probe_max_steps", type=int, default=3) + p.add_argument("--probe_concurrency", type=int, default=5, + help="Bounded asyncio concurrency for probe LLM calls " + "(default 5; set 1 for serial). Only affects probes — " + "ingest stays serial since compressions are ordered.") + p.add_argument("--probe_max_tokens", type=int, default=4096, + help="Per-call completion output cap for probe LLM calls " + "(default 4096 — matches SDK production default). 
" + "Lower to 1024-2048 for tighter loop containment.") + # Baseline + p.add_argument("--baseline_context_chars", type=int, default=480000, + help="Characters of the dialogue fed to the baseline arm") + # Arm selection + p.add_argument("--skip_baseline", action="store_true") + p.add_argument("--skip_compressed", action="store_true") + p.add_argument("--debug", action="store_true") + return p + + +if __name__ == "__main__": + asyncio.run(main(_build_arg_parser().parse_args())) diff --git a/sdk/benchmark/longmemeval_eval/run_with_debugger.py b/sdk/benchmark/longmemeval_eval/run_with_debugger.py new file mode 100644 index 000000000..b2f406bac --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/run_with_debugger.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Run LongMemEval benchmark with ContextDebugger attached for Langfuse export. + +Usage: + # Option 1: Smoke test (default schema) + NEXENT_CONTEXT_DEBUG=/tmp/longmemeval_smoke.jsonl \ + python run_with_debugger.py \ + --dialogue_index 0 --limit 1 --max_ingest_sessions 20 \ + --token_threshold 200000 --baseline_context_chars 800000 \ + --sessions_per_batch 12 --keep_recent_pairs 10 --summary_schema default + + # Option 2: Single dialogue with 10 questions (multi_topic schema - recommended) + NEXENT_CONTEXT_DEBUG=/tmp/longmemeval_q10_multi.jsonl \ + python run_with_debugger.py \ + --dialogue_index 0 --limit 10 \ + --token_threshold 200000 --baseline_context_chars 800000 \ + --sessions_per_batch 12 --keep_recent_pairs 10 --summary_schema multi_topic + + # Option 3: Full 60 questions (multi_topic schema) + NEXENT_CONTEXT_DEBUG=/tmp/longmemeval_q60_multi.jsonl \ + python run_with_debugger.py \ + --dialogue_index 0 --limit 60 \ + --token_threshold 200000 --baseline_context_chars 800000 \ + --sessions_per_batch 12 --keep_recent_pairs 10 --summary_schema multi_topic + +Export to Langfuse: + python -m ctx_debugger.langfuse_export \ + --session-id longmemeval-ctx0-question10-multi \ + --host http://localhost:3100 +""" 
import asyncio
import os
import sys

# Directory layout, resolved relative to this file:
# SDK_DIR / BENCHMARK_DIR / HERE (this script's directory).
HERE = os.path.dirname(os.path.abspath(__file__))
BENCHMARK_DIR = os.path.dirname(HERE)
SDK_DIR = os.path.dirname(BENCHMARK_DIR)
CTX_DEBUGGER_DIR = os.path.join(SDK_DIR, "ctx_debugger")

# Make the SDK, the benchmark helpers, this directory and the debugger
# package importable. Each path is inserted at the front, so the last one
# listed ends up first on sys.path.
for p in (SDK_DIR, BENCHMARK_DIR, HERE, CTX_DEBUGGER_DIR):
    if p not in sys.path:
        sys.path.insert(0, p)

# Honour a caller-supplied trace path, otherwise fall back to a default, and
# write the resolved value back to the environment.
TRACE_PATH = os.environ.get(
    "NEXENT_CONTEXT_DEBUG", "/tmp/nexent_longmemeval_trace.jsonl"
)
os.environ["NEXENT_CONTEXT_DEBUG"] = TRACE_PATH


def _install_auto_attach():
    """Monkey-patch CoreAgent so every agent gets a debugger attached.

    Two hooks are installed on the class:

    * ``__init__`` — after the original constructor runs, ``attach_debugger``
      is called on the new agent.
    * ``__setattr__`` — when an enabled ``context_manager`` is assigned to an
      agent that already has a ``_debugger``, the manager is wrapped via
      ``_wrap_compress_if_needed`` (skipped if it already carries that same
      debugger).

    Failures in either hook are logged as warnings and never propagate.
    """
    import functools
    import logging

    from nexent.core.agents.core_agent import CoreAgent
    from ctx_debugger import attach_debugger
    from ctx_debugger.debugger import _wrap_compress_if_needed

    log = logging.getLogger(__name__)

    original_agent_init = CoreAgent.__init__
    # Delegate to whatever __setattr__ CoreAgent already resolves to (its own,
    # an inherited override, or object's) instead of hard-coding
    # object.__setattr__, so an existing override is not bypassed.
    original_setattr = CoreAgent.__setattr__

    @functools.wraps(original_agent_init)
    def patched_agent_init(self, *args, **kwargs):
        original_agent_init(self, *args, **kwargs)
        try:
            attach_debugger(self, append=True)
        except Exception as exc:
            # Best-effort: the benchmark must keep running without a trace.
            log.warning("Agent auto-attach failed: %s", exc, exc_info=True)

    def patched_setattr(self, name, value):
        original_setattr(self, name, value)
        if name != "context_manager" or value is None:
            return
        # Tolerate a manager without a `config` attribute rather than raising
        # from inside attribute assignment.
        if not getattr(getattr(value, "config", None), "enabled", False):
            return
        existing_dbg = getattr(self, "_debugger", None)
        if existing_dbg is None:
            return
        if getattr(value, "_debugger", None) is existing_dbg:
            # Already wired to this agent's debugger.
            return
        try:
            _wrap_compress_if_needed(value, existing_dbg)
        except Exception as exc:
            log.warning("Compression layer attach failed: %s", exc, exc_info=True)

    CoreAgent.__init__ = patched_agent_init
    CoreAgent.__setattr__ = patched_setattr


def main():
    """Install the debugger hooks, then run the LongMemEval benchmark CLI."""
    _install_auto_attach()

    # Change into this directory before importing and running the benchmark.
    os.chdir(HERE)
    from run_longmemeval import main as longmemeval_main, _build_arg_parser

    args = _build_arg_parser().parse_args()
    asyncio.run(longmemeval_main(args))
    print(f"\n[ctx_debugger] Trace written to: {TRACE_PATH}")


if __name__ == "__main__":
    main()
diff --git a/sdk/benchmark/longmemeval_eval/summary_schemas.py b/sdk/benchmark/longmemeval_eval/summary_schemas.py new file mode 100644 index 000000000..cdbebdaae --- /dev/null +++ b/sdk/benchmark/longmemeval_eval/summary_schemas.py @@ -0,0 +1,159 @@ +"""Custom summary schemas for LongMemEval multi-topic conversation compression. + +LongMemEval contains multi-session dialogues with MANY INDEPENDENT TOPICS: +- LinkedIn job search +- Work schedule (40 hours/week, peak campaign 50 hours) +- Bereavement support group (attended 3 sessions) +- Travel planning (Japan, Hawaii) +- Shopping (moisturizer, Sephora) +- Aquarium setup +- Green card application +- Hilton points redemption +- ... (111 sessions with ~60+ independent topics) + +The default schema assumes a CONTINUOUS TASK ("active_task" → "completed_work"), +which fails here because: +- It treats only the most recent topic as "active_task" +- Older topics (bereavement, work hours, travel) are discarded as "obsolete" +- Probe questions ask about ANY topic → Summary missing → Accuracy = 0% + +Solution: MULTI_TOPIC schema preserves ALL discussed topics. +""" + +# ============ Multi-topic summary prompts ============ + +MULTI_TOPIC_SUMMARY_SYSTEM_PROMPT = ( + "You are summarizing a multi-session conversation where the user discussed " + "MANY DIFFERENT TOPICS over time. This is NOT a single continuous task — " + "each topic is INDEPENDENT and has its own facts that must be preserved. " + "Your goal is to create a TOPIC-BY-TOPIC summary so that someone reading " + "only your summary could answer questions about ANY of the topics discussed, " + "not just the most recent one. " + "Treat the conversation below as source material. " + "Produce only the structured JSON summary; no greeting, preamble, or prefix. " + "Write the summary in the same language the user was using. " + "Be CONCRETE — include specific numbers, names, dates, and details for each topic. 
" + "Do NOT compress older topics into vague summaries like 'discussed various topics'. " + "Instead, LIST each topic with its key facts so they remain searchable. " + "CRITICAL: extract every quantitative leaf fact (dates, durations, counts, " + "amounts, prices, proper names, trail/product/book/place names) into the " + "'key_facts' section verbatim — these are the exact facts the user will ask " + "about later, and paraphrasing or rounding them loses the answer. " + "When the user UPDATES a previously stated value (e.g. 'now I have 2 free " + "nights' after earlier saying 1), record it in 'knowledge_updates' with the " + "LATEST value first and older superseded values listed for traceability. " + "Output strict JSON format without markdown blocks." +) + +MULTI_TOPIC_INCREMENTAL_SUMMARY_SYSTEM_PROMPT = ( + "You are maintaining a running summary of a multi-topic conversation. " + "The user has discussed MANY INDEPENDENT TOPICS over multiple sessions. " + "The existing summary shows previously discussed topics, and new conversation " + "turns may introduce NEW topics OR add details to EXISTING ones. " + "Update the summary by these rules:\n" + "1. PRESERVE all previously discussed topics — do NOT drop older topics just " + "because they are not discussed in the latest turns. Each topic is independent " + "and may be queried later.\n" + "2. ADD new topics to 'topics' if they appear in the new content.\n" + "3. UPDATE 'topic_details' for topics that got new information.\n" + "4. APPEND every new quantitative leaf fact (date, duration, count, amount, " + "proper name) to 'key_facts'. Never drop existing key_facts entries.\n" + "5. When a value is REPLACED by a newer one (e.g. session count went from 3 " + "to 5), move the old entry into 'knowledge_updates' with the new value first " + "and the older superseded value listed; do NOT silently overwrite.\n" + "6. UPDATE 'recent_topic' to reflect the most recently discussed topic.\n" + "7. 
Keep the 'user_profile' updated with user background info.\n" + "Be concrete — specific numbers, names, dates. " + "Output strict JSON format without markdown blocks." +) + +# ============ Multi-topic JSON schema ============ + +MULTI_TOPIC_SUMMARY_SCHEMA = { + "topics": ( + "THE MOST IMPORTANT FIELD. A numbered list of ALL topics discussed in " + "this conversation, from earliest to latest. Each entry: topic name + " + "brief description. Format: N. TOPIC_NAME — brief description. " + "Example: '1. Job Search — updating LinkedIn profile for senior roles'. " + "Include ALL topics, not just recent ones. (<=400 words)" + ), + "topic_details": ( + "Key facts for EACH topic mentioned above. This is a dictionary-like " + "structure where each topic gets its key details preserved. " + "Format each topic's details with concrete numbers, names, dates. " + "Example:\n" + "- Job Search: applied for Content Marketing Strategist, work 40 hrs/week, " + "peak campaign 50 hrs/week, has Google Analytics certification\n" + "- Bereavement Support: attended 3 sessions, started 2023/05, helpful for coping\n" + "- Travel: interested in Japan (food, culture), visited Hawaii with family\n" + "Include ALL topics that have specific facts. (<=800 words)" + ), + "key_facts": ( + "FACT-LEVEL INDEX for precise recall. Catalog every quantitative or " + "named leaf fact verbatim so questions asking 'when / how much / what " + "name / how long' can be answered exactly. Group entries under four " + "subcategories. Use the EXACT wording the user/assistant used — do not " + "round, paraphrase, or convert units.\n\n" + "Format (bullet under each subcategory):\n" + "- dates_and_durations:\n" + " - : " + "(e.g. 'Aquarium: bought neon tetras — 2023/04/12'; " + "'Cat Luna: acquired 9 months ago as of 2023/05'; " + "'BBQ event: attended June 3rd, 2023')\n" + "- quantities_and_amounts:\n" + " - : = " + "(e.g. 
def build_multi_topic_config(base_config) -> None:
    """Point an existing config object at the multi-topic summary templates.

    The object is mutated in place and nothing is returned. Exactly three
    attributes are reassigned -- ``summary_system_prompt``,
    ``incremental_summary_system_prompt`` and ``summary_json_schema`` --
    and no other attribute of ``base_config`` is touched.

    Args:
        base_config: Config object to update (a ContextManagerConfig per the
            original documentation); it only needs to accept attribute
            assignment for the three fields above.
    """
    # Attribute name -> multi-topic template; applied in the listed order.
    overrides = {
        "summary_system_prompt": MULTI_TOPIC_SUMMARY_SYSTEM_PROMPT,
        "incremental_summary_system_prompt": MULTI_TOPIC_INCREMENTAL_SUMMARY_SYSTEM_PROMPT,
        "summary_json_schema": MULTI_TOPIC_SUMMARY_SCHEMA,
    }
    for attr_name, template in overrides.items():
        setattr(base_config, attr_name, template)
`test_benchmark.py` — End-to-end Case Evaluation (Main Entry) + +```bash +nexent/backend/.venv/bin/python test_benchmark.py +``` + +Automatically discovers all cases under `cases/*/case.json`; each case runs two comparison experiments: + +| Group | Compression | Purpose | +|---|---|---| +| Baseline | `enabled=False` | Capability ceiling | +| Compressed | `enabled=True` + case custom params | Actual performance after compression | + +Evaluates three dimensions: **Continuation** (multi-turn task continuation), **Probe** (early history memory retention), **Token Reduction** (token reduction rate). It takes no CLI arguments; per-case reports are written to `reports/<case_id>.json` and the cross-case summary to `reports/summary.json`. + +### 2. `summary_inspector.py` — Compressor Static Quality Check + +Runs without the Agent and directly checks whether the summary text retains key information. Use it to tell apart two root causes of a failure: "compressor missed it" vs "Agent didn't use it". + +```bash +# Run all inspections under inspections/ +nexent/backend/.venv/bin/python summary_inspector.py +# Run only one +nexent/backend/.venv/bin/python summary_inspector.py -n example_infra +# Custom compression params + also save raw summary text +nexent/backend/.venv/bin/python summary_inspector.py --config cfg.json --save-summary +``` + +--- + +## Directory Structure + +``` +manual_cases/ +├── test_benchmark.py # End-to-end case evaluation entry +├── summary_inspector.py # Static summary quality check entry +├── agent_runner.py # Agent run wrapper (build run info, run agent with tracking) +├── eval_utils.py # LLM scoring tools (eval_text / average_score) +├── cases/<case_id>/ # End-to-end evaluation cases +│ ├── case.json # Config: id / history_file / queries / probes / +│ │ # summary_checks / task_checks / compressed_config +│ └── history.json # Initial multi-turn conversation history (user/assistant pairs) +├── inspections/<name>/ # Static quality check cases +│ ├── history.json # Conversation history to compress +│ └── checks.json #
Summary key information check items +├── reports/ # test_benchmark.py output (<case_id>.json + summary.json) +└── note_benchmark.md # Complete evaluation mechanism design documentation +``` + +--- + +## Adding a New Case + +1. Create the directory `cases/<case_id>/` and place `history.json` (initial history) and `case.json` in it. +2. `case.json` fields: `id`, `history_file`, `queries` (multi-turn continuation questions), `probes` (memory probes that target only the compressed region), `summary_checks`, `task_checks`, `compressed_config` (compression param overrides). +3. Run `test_benchmark.py`; results appear in `reports/<case_id>.json`. + +> To see the full trace of context construction and compression during a benchmark run, use [`../../ctx_debugger/`](../../ctx_debugger/) (`example_with_benchmark.py` attaches the debugger to a batch benchmark run). \ No newline at end of file diff --git a/sdk/benchmark/manual_cases/calibrate_thresholds.py b/sdk/benchmark/manual_cases/calibrate_thresholds.py new file mode 100644 index 000000000..701372c6e --- /dev/null +++ b/sdk/benchmark/manual_cases/calibrate_thresholds.py @@ -0,0 +1,148 @@ +"""Calibrate token_threshold in case.json files based on actual history token counts. + +For each case under ./cases/, computes the token count of history.json PLUS +the system prompt tokens (using the same estimate_tokens_text() used at runtime), +then writes that value into case.json's compressed_config.token_threshold so +compression triggers precisely when the full context reaches this size. + +The threshold must account for system_prompt + history, because the ContextManager +checks token count against the full message list (which includes system prompt). + +Uses the same BENCHMARK_SYSTEM_PROMPT as test_benchmark.py for consistency.
def calibrate_thresholds(
    cases_root: str = "./cases",
    system_prompt: str = None,
    dry_run: bool = False,
) -> list[dict]:
    """Calibrate ``compressed_config.token_threshold`` for every case.

    For each ``<cases_root>/*/case.json`` the new threshold is::

        token_threshold = system_prompt_tokens + history_tokens

    where both counts come from ``estimate_tokens_text`` and the history
    text is the concatenation of every message's ``content`` in the case's
    history file (``history_file`` in case.json, default ``history.json``,
    resolved relative to the case directory).

    Args:
        cases_root: Directory containing case subdirectories.
        system_prompt: System prompt string. ``None`` selects
            BENCHMARK_SYSTEM_PROMPT.
        dry_run: If True, compute and report but do not write any file.

    Returns:
        One dict per processed case with the keys ``case``,
        ``old_threshold``, ``new_threshold``, ``history_tokens``,
        ``system_prompt_tokens`` and ``changed``. Cases whose history file
        is missing are reported on stdout and omitted from the list.
    """
    results = []
    case_paths = sorted(glob.glob(os.path.join(cases_root, "*/case.json")))

    # Nothing to calibrate: bail out before tokenising the system prompt.
    if not case_paths:
        print(f"No cases found under {cases_root}")
        return results

    sp = system_prompt if system_prompt is not None else BENCHMARK_SYSTEM_PROMPT
    sp_tokens = estimate_tokens_text(sp)
    print(f"System prompt tokens: {sp_tokens}")

    for case_path in case_paths:
        case_dir = os.path.dirname(case_path)
        case_name = os.path.basename(case_dir)

        with open(case_path, "r", encoding="utf-8") as f:
            case = json.load(f)

        history_relpath = case.get("history_file", "history.json")
        history_abspath = os.path.join(case_dir, history_relpath)

        if not os.path.exists(history_abspath):
            print(f" SKIP {case_name}: {history_relpath} not found")
            continue

        with open(history_abspath, "r", encoding="utf-8") as f:
            history = json.load(f)

        # A message with a missing or null "content" contributes no tokens
        # instead of aborting the whole calibration run.
        history_text = "".join(msg.get("content") or "" for msg in history)
        history_tokens = estimate_tokens_text(history_text)
        total_tokens = sp_tokens + history_tokens

        # Tolerate `"compressed_config": null` (or any non-object value).
        compressed_config = case.get("compressed_config")
        if not isinstance(compressed_config, dict):
            compressed_config = {}

        old_threshold = compressed_config.get("token_threshold")
        changed = old_threshold != total_tokens

        results.append({
            "case": case_name,
            "old_threshold": old_threshold,
            "new_threshold": total_tokens,
            "history_tokens": history_tokens,
            "system_prompt_tokens": sp_tokens,
            "changed": changed,
        })

        if not changed:
            print(f" {case_name}: token_threshold already {total_tokens}, no change")
            continue

        compressed_config["token_threshold"] = total_tokens
        case["compressed_config"] = compressed_config

        if not dry_run:
            with open(case_path, "w", encoding="utf-8") as f:
                json.dump(case, f, ensure_ascii=False, indent=2)

        suffix = " [dry-run]" if dry_run else ""
        print(
            f" {case_name}: token_threshold {old_threshold} -> {total_tokens} "
            f"(sp={sp_tokens} + history={history_tokens}){suffix}"
        )

    changed_count = sum(1 for r in results if r["changed"])
    if dry_run:
        print(f"\nDry-run: {changed_count} case(s) would be calibrated (no files written).")
    else:
        print(f"\nCalibrated {changed_count} case(s).")

    return results


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Calibrate token_threshold in case.json files")
    parser.add_argument(
        "--cases-root", default="./cases",
        help="Root directory containing case subdirectories (default: ./cases)",
    )
    parser.add_argument(
        "--system-prompt", default=None,
        # The fallback is the module constant, not a template built elsewhere.
        help="Custom system prompt string (default: BENCHMARK_SYSTEM_PROMPT)",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Compute new thresholds but do not write to case.json files",
    )
    args = parser.parse_args()
    calibrate_thresholds(
        cases_root=args.cases_root,
        system_prompt=args.system_prompt,
        dry_run=args.dry_run,
    )
+* +!.gitignore +!example_infra/ diff --git a/sdk/benchmark/manual_cases/cases/example_infra/case.json b/sdk/benchmark/manual_cases/cases/example_infra/case.json new file mode 100644 index 000000000..2de16f9ef --- /dev/null +++ b/sdk/benchmark/manual_cases/cases/example_infra/case.json @@ -0,0 +1,144 @@ +{ + "id": "example_infra", + "history_file": "./history.json", + "queries": [ + "请总结一下我们之前主要讨论了哪些部署和基础设施问题。", + "如果 Elasticsearch 因磁盘使用率过高变成只读,我应该按什么步骤恢复?", + "结合之前的讨论,请说明把容器内服务地址改写为 localhost:映射端口是否可行,以及需要满足什么条件。" + ], + "probes": [ + { + "question": "docker ps 输出中,Elasticsearch 容器的 STATUS 显示的是什么状态?", + "must_contain": [ + "unhealthy" + ] + }, + { + "question": "PostgreSQL 容器映射到宿主机的端口号是多少?", + "must_contain": [ + "5434" + ] + }, + { + "question": "Elasticsearch 默认低水位线是多少?触发后会发生什么?", + "must_contain": [ + "85%", + "停止分配", + "不再分配" + ] + }, + { + "question": "deploy.sh 中默认的部署版本选项是什么?", + "must_contain": [ + "speed" + ] + }, + { + "question": "Supabase 启动时拉取失败的具体是哪个 Docker 镜像?", + "must_contain": [ + "gotrue" + ] + }, + { + "question": "Elasticsearch 容器映射到宿主机的端口号是多少?", + "must_contain": [ + "9210" + ] + } + ], + "summary_checks": [ + { + "description": "ES low watermark value", + "must_contain": [ + "85%" + ] + }, + { + "description": "PostgreSQL mapped port", + "must_contain": [ + "5434" + ] + }, + { + "description": "ES mapped port", + "must_contain": [ + "9210" + ] + }, + { + "description": "deploy.sh default version", + "must_contain": [ + "speed" + ] + }, + { + "description": "sed command for env modification", + "must_contain": [ + "sed" + ] + }, + { + "description": "ES unhealthy status", + "must_contain": [ + "unhealthy" + ] + }, + { + "description": "read_only_allow_delete setting", + "must_contain": [ + "read_only_allow_delete" + ] + }, + { + "description": "container localhost limitation", + "must_contain": [ + "容器内", + "localhost" + ] + }, + { + "description": "gotrue image pull failure", + "must_contain": [ + "gotrue" + ] + }, + { + "description": 
"context classification", + "must_contain": [ + "Model", + "Tool" + ] + } + ], + "task_checks": [ + { + "turn": 1, + "must_contain_any": [ + "Docker Compose", + "Elasticsearch", + "Terminal Tool", + "localhost" + ] + }, + { + "turn": 2, + "must_contain_any": [ + "清理", + "解除只读", + "read_only_allow_delete" + ] + }, + { + "turn": 3, + "must_contain": [ + "localhost", + "映射端口" + ] + } + ], + "compressed_config": { + "token_threshold": 9576, + "keep_recent_pairs": 1, + "keep_recent_steps": 4 + } +} \ No newline at end of file diff --git a/sdk/benchmark/manual_cases/cases/example_infra/history.json b/sdk/benchmark/manual_cases/cases/example_infra/history.json new file mode 100644 index 000000000..e846504eb --- /dev/null +++ b/sdk/benchmark/manual_cases/cases/example_infra/history.json @@ -0,0 +1,98 @@ +[ + { + "role": "user", + "content": "docker compose -p nexent down\n解释这个命令" + }, + { + "role": "assistant", + "content": "这个命令用于停止并清理由 Docker Compose 管理的应用栈,具体解释如下:\n\n- **`docker compose`**:Docker 的原生 Compose 子命令(而非旧版的 `docker-compose`),用于定义和运行多容器应用。\n- **`-p nexent`**:指定项目名称为 `nexent`。默认情况下,Compose 会以当前目录名作为项目名,而 `-p`(或 `--project-name`)允许你显式指定。项目名称会影响容器、网络、卷等资源的命名前缀。\n- **`down`**:子命令,用于停止并移除由该 Compose 项目创建的资源,包括:\n - 所有运行中的容器(会先停止再删除)\n - 默认创建的专用网络(除非被其他项目使用)\n - 默认创建的匿名卷(但显式命名的外部卷不会被自动删除,以保留数据)\n\n**执行效果**: \n查找当前或指定路径下(通常是你所在的目录)名为 `nexent` 的 Compose 项目,将其中的所有容器停止并删除,同时清理相关的网络和临时卷。这相当于“关闭并彻底移除”整个应用栈,但不会删除镜像、构建缓存或命名的外部卷。\n\n**常见使用场景**: \n当你完成调试或测试,或者需要彻底重置 `nexent` 项目的运行环境时,使用此命令可以快速清理资源,避免残留容器占用端口或磁盘空间。\n\n----" + }, + { + "role": "user", + "content": "$ docker ps\nCONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES\n2d4f98f6a2ba quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z \"/bin/sh -c '\\n mini…\" About an hour ago Up About an hour 0.0.0.0:9010->9000/tcp, [::]:9010->9000/tcp, 0.0.0.0:9011->9001/tcp, [::]:9011->9001/tcp nexent-minio\n1d99caa961e7 redis:alpine \"docker-entrypoint.s…\" About an hour ago Up About an hour (healthy) 0.0.0.0:6379->6379/tcp, 
[::]:6379->6379/tcp nexent-redis\n9c8b27bc4dba postgres:15-alpine \"docker-entrypoint.s…\" About an hour ago Up About an hour 0.0.0.0:5434->5432/tcp, [::]:5434->5432/tcp nexent-postgresql\nae1ca998a532 docker.elastic.co/elasticsearch/elasticsearch:8.17.4 \"/bin/tini -- /usr/l…\" About an hour ago Up About an hour (unhealthy) 0.0.0.0:9210->9200/tcp, [::]:9210->9200/t\n\n这里如果 docker compose -p nexent down能够识别到该关闭哪些服务吗" + }, + { + "role": "assistant", + "content": "从 `docker ps` 输出看,所有容器名称都以 `nexent-` 开头,这说明它们很可能属于名为 `nexent` 的 Docker Compose 项目(因为 Compose 默认命名规则是 `<项目名>-<服务名>-<序号>`)。\n\n执行 `docker compose -p nexent down` **能够识别并关闭这些容器**,但前提是满足以下条件之一:\n\n1. **当前目录下有创建这些容器时所用的 `docker-compose.yml`(或 `.yaml`)文件** \n Compose 会读取该文件,通过项目名称 `nexent` 和文件中定义的服务名,与运行中容器上的标签(`com.docker.compose.project=nexent`)进行匹配,然后停止并删除它们。\n\n2. **即使当前目录没有 compose 文件**,Docker Compose 也会尝试通过容器标签来查找属于项目 `nexent` 的资源(容器、网络等)。但是:\n - 会输出警告,提示缺少配置文件(`no configuration file provided`)。\n - 可能不会正确移除网络(因为网络配置需要从 compose 文件读取)。\n - 对于卷的处理也可能不完整(但 `down` 默认不删除命名卷)。\n\n**结论**:只要这些容器当初是由 `docker compose -p nexent up` 创建的,那么执行相同的 `-p nexent down` **通常可以关闭并删除它们**,无论当前目录是否有配置文件。但最安全、最完整的做法是**切换到当初使用的 `docker-compose.yml` 所在目录**,再执行该命令,以确保网络、卷等资源也能被正确清理。\n\n另外,你的 `docker ps` 中 Elasticsearch 容器名显示不全(`docker.elastic.co/...` 后应为 `nexent-elasticsearch` 之类),它也带有同样的项目标签,因此也会被关闭。\n\n----" + }, + { + "role": "user", + "content": "elasticsearch似乎需要 不高于90%的容量" + }, + { + "role": "assistant", + "content": "是的,你说得没错,当磁盘使用率达到90%时,Elasticsearch的自动保护机制就会被触发。\n\n不过,Elasticsearch对磁盘容量的管理其实有三个关键的阈值,每个都对应着不同的保护动作。\n\n### 💡 理解 Elasticsearch 的磁盘水位线机制\n\nElasticsearch 使用“磁盘水位线”来监控磁盘空间,从低到高,逐步介入保护。\n\n| 水位线 | 默认阈值 | 触发行为 |\n| :--- | :--- | :--- |\n| **低水位线** | 85% | **停止向该节点分配新的分片**,作为预警,防止情况恶化。 |\n| **高水位线** | 90% | **尝试将分片迁移到其他磁盘使用率较低的节点**,以平衡负载。 |\n| **决堤水位线** | 95% | **强制将所有索引设置为只读 (`read_only_allow_delete`)**。此时无法写入,只能读取和删除,这是最严格的保护措施。 |\n\n---\n\n### 🔍 如何查看磁盘使用率\n\n结合你正在使用 Docker Compose 的场景,可以通过几种方式查看磁盘使用率。\n\n#### 
方法一:进入容器使用 API (推荐)\n\n这是最直接的方法。你需要先进入 Elasticsearch 容器的内部,然后通过它自带的 API 来查询状态。\n\n1. **进入容器**:在你的终端执行以下命令,这会为你提供一个容器内的 Bash 环境。\n ```bash\n docker exec -it /bin/bash\n ```\n (请将 `` 替换为实际的容器名或ID)。\n\n2. **查询节点信息**:在容器内,使用 `curl` 命令调用 Elasticsearch 的 API。\n ```bash\n curl -s \"http://localhost:9200/_cat/allocation?v\"\n ```\n 这个命令会返回每个节点的分片数量和磁盘使用情况,`disk.used_percent` 列就是磁盘使用百分比。\n ```bash\n curl -s \"http://localhost:9200/_cat/nodes?v&h=name,disk.used_percent,disk.avail\"\n ```\n 这个命令可以更简洁地查看节点名称、磁盘使用百分比和剩余可用空间。\n\n#### 方法二:通过 Docker 查看主机磁盘空间\n\n这虽然不是 Elasticsearch 的内部数据,但却是导致问题的根源。\n\n```bash\ndocker exec df -h /\n```\n或者,直接在宿主机上查看 Docker 的存储目录占用情况:\n```bash\ndocker system df\n```\n\n---\n\n### 🧹 如何应对磁盘空间不足\n\n如果发现磁盘使用率过高,可以按以下步骤处理。\n\n#### 第一步:清理旧数据并解除只读 (紧急处理)\n\n这是最直接的解决方式。如果你能确定哪些索引可以安全删除,这是最快的方法。\n\n1. **删除旧索引**:进入容器后,使用 `curl` 命令删除不需要的索引。\n ```bash\n # 警告:数据删除后不可恢复,请务必谨慎操作!\n curl -XDELETE \"http://localhost:9200/\"\n ```\n 例如,要删除所有以 `nginx-log-` 开头的旧索引,可以执行 `curl -XDELETE \"http://localhost:9200/nginx-log-*\"`。\n\n2. **解除只读锁定**:清理完空间后,需要手动解除索引的只读状态。\n ```bash\n # 解除所有索引的只读锁定\n curl -XPUT \"http://localhost:9200/_all/_settings\" -H 'Content-Type: application/json' -d'\n {\n \"index.blocks.read_only_allow_delete\": null\n }'\n ```\n 这个命令会告诉 Elasticsearch 移除所有索引的只读限制,让它们恢复正常写入。\n\n3. 
**重新分配分片**:如果集群状态仍然异常(例如显示为 `red`),可以尝试重试分配分片。\n ```bash\n curl -XPOST \"http://localhost:9200/_cluster/reroute?retry_failed=true\"\n ```\n\n#### 第二步:执行强制段合并 (Force Merge)\n\n有时,即使删除了数据,磁盘空间也不会立即释放。这是因为 Elasticsearch 只是做了“标记删除”,真正的物理删除需要等后台段合并。可以手动触发合并来立即回收空间。\n```bash\n# 对目标索引执行强制段合并,max_num_segments=1 表示合并成单个段\ncurl -XPOST \"http://localhost:9200//_forcemerge?max_num_segments=1\"\n```\n请注意,`_forcemerge` 是一个**非常消耗 I/O 资源的操作**,建议在业务低峰期进行。\n\n#### 第三步:临时调整水位线 (治标不治本)\n\n作为极短期的应急方案,可以临时调高水位线阈值以争取处理时间,但这不是长久之计。\n```bash\ncurl -XPUT \"http://localhost:9200/_cluster/settings\" -H 'Content-Type: application/json' -d'\n{\n \"persistent\": {\n \"cluster.routing.allocation.disk.watermark.low\": \"90%\",\n \"cluster.routing.allocation.disk.watermark.high\": \"95%\",\n \"cluster.routing.allocation.disk.watermark.flood_stage\": \"97%\"\n }\n}'\n```\n问题解决后,建议将设置重置回 Elasticsearch 的默认值。\n\n---\n\n### 🛡️ 如何从根本上预防\n\n预防问题总是比解决问题更重要。建议采取以下措施:\n\n- **设置监控告警**:配置系统监控(例如在 Kibana 中设置),当磁盘使用率超过 80% 时就发出告警,防患于未然。\n- **建立数据生命周期管理 (ILM)**:使用 Elasticsearch 自带的 **Index Lifecycle Management (ILM)** 功能,自动将旧索引转移到成本更低的节点,或定期自动删除过期的索引,实现自动化管理。\n\n### 💎 总结\n\nElasticsearch 的高磁盘使用率问题是可以通过理解其机制、积极监控和主动清理来有效管理的。\n\n* **根本解决**:**清理数据** 或 **增加磁盘容量**。\n* **核心命令**:**`DELETE /索引名`** 和 **`_forcemerge`**。\n* **紧急操作**:解除只读锁定(**`index.blocks.read_only_allow_delete`**)。\n \n\n----" + }, + { + "role": "user", + "content": "🔧 Terminal Tool Container Setup:\n Terminal tool allows AI agents to execute shell commands via SSH.\n This will create an openssh-server container for secure command execution.\n\n解释这里的含义" + }, + { + "role": "assistant", + "content": "这段提示是部署脚本中 **Terminal Tool Container(终端工具容器)** 的配置说明,其含义如下:\n\n- **用途**:允许 AI 代理(例如基于大模型的自动化任务)通过 **SSH 协议**在宿主机或指定目录中执行 shell 命令。\n- **实现方式**:脚本会额外创建一个 **openssh-server** 容器,该容器内部运行 SSH 服务,AI 代理可以 SSH 登录到这个容器,并执行命令。\n- **安全考虑**:通过容器隔离执行环境,限制 AI 代理的访问范围(如仅能操作挂载到容器内的目录),避免直接暴露宿主机的完整系统,从而提升安全性。\n- **用户交互**:此提示会询问是否启用该容器,如果选择启用,脚本会进一步要求设置 SSH 
用户名/密码以及宿主机与容器的目录映射。\n\n简单来说,这是一个可选的安全沙箱,让 AI 能在受控环境中执行命令,而不直接接触宿主机核心系统。\n\n\n----" + }, + { + "role": "user", + "content": "🔧 Starting Supabase services...\n[+] up 3/3\n ! Image supabase/postgres:15.8.1.060 Interrupted 0.8s\n ✘ Image supabase/gotrue:v2.170.0 Error failed to resolve reference \"docker.io/supabase/gotrue:v2.170.0\": failed to do request: Head \"https://registry-1.docker.io/v2/supabase/gotrue/manifests/v2.170.0\": EOF 0.8s\n ! Image kong:2.8.1 Interrupted 0.8s\nError response from daemon: failed to resolve reference \"docker.io/supabase/gotrue:v2.170.0\": failed to do request: Head \"https://registry-1.docker.io/v2/supabase/gotrue/manifests/v2.170.0\": EOF\n ❌ ERROR Failed to start supabase services\n❌ Infrastructure deployment failed\n(backend) \n\n解释错误" + }, + { + "role": "assistant", + "content": "这个错误是在启动 Supabase 服务时,**拉取 Docker 镜像失败**导致的。具体分析如下:\n\n---\n\n### 错误核心\n```\nError response from daemon: failed to resolve reference \"docker.io/supabase/gotrue:v2.170.0\": \nfailed to do request: Head \"https://registry-1.docker.io/v2/supabase/gotrue/manifests/v2.170.0\": EOF\n```\n\n- **EOF** (End Of File) 表示 Docker 在与 Docker Hub 的 registry 通信时,**连接被对方意外关闭**,通常是网络层面的问题(丢包、超时、防火墙重置等),而不是镜像不存在或权限问题。\n\n---\n\n### 可能的原因\n\n1. **网络不稳定** \n 临时性的网络波动,导致 HTTPS 请求中断。\n\n2. **中国大陆访问 Docker Hub 缓慢或被限制** \n Docker Hub (registry-1.docker.io) 在国内访问经常出现超时或连接重置,尤其是拉取大镜像或多个镜像时。用户所在环境很可能是大陆网络,且未配置镜像加速器。\n\n3. **代理或防火墙干扰** \n 如果宿主机配置了 HTTP 代理,但 Docker daemon 未正确继承代理设置;或者防火墙拦截了部分请求。\n\n4. **Docker Hub 限流** \n 匿名用户拉取镜像有频率限制(每小时约 100 次),短时间内拉取过多镜像可能被临时拒绝,表现为 EOF 或 429 错误。\n\n---\n\n### 解决方案\n\n#### 1. 配置 Docker 镜像加速器(推荐,尤其在大陆)\n编辑 `/etc/docker/daemon.json`(Linux)或 Docker Desktop 设置,添加国内镜像加速地址,例如:\n```json\n{\n \"registry-mirrors\": [\n \"https://docker.mirrors.ustc.edu.cn\",\n \"https://hub-mirror.c.163.com\",\n \"https://mirror.ccs.tencentyun.com\"\n ]\n}\n```\n然后重启 Docker 服务:`sudo systemctl restart docker`。\n\n#### 2. 
检查并设置代理(如果宿主机需要代理访问外网)\n为 Docker daemon 设置代理:创建目录 `/etc/systemd/system/docker.service.d/`,添加 `http-proxy.conf`:\n```\n[Service]\nEnvironment=\"HTTP_PROXY=http://your-proxy:port\"\nEnvironment=\"HTTPS_PROXY=http://your-proxy:port\"\nEnvironment=\"NO_PROXY=localhost,127.0.0.1\"\n```\n重启 Docker:`sudo systemctl daemon-reload && sudo systemctl restart docker`。\n\n#### 3. 重试部署(可能只是临时问题)\n直接重新运行脚本:\n```bash\n./deploy.sh --mode infrastructure --is-mainland Y\n```\n(`--is-mainland Y` 会让脚本尝试使用国内镜像源,如果脚本已配置 `.env.mainland` 中的镜像替换)\n\n#### 4. 手动拉取受影响的镜像\n先手动拉取失败的那个镜像,再重新运行脚本:\n```bash\ndocker pull supabase/gotrue:v2.170.0\ndocker pull supabase/postgres:15.8.1.060\ndocker pull kong:2.8.1\n```\n如果手动拉取也失败,则确认是网络问题,必须解决网络连通性。\n\n#### 5. 跳过 Supabase 启动(如果当前不需要)\n部署脚本支持 `DEPLOYMENT_VERSION=speed` 模式(不启动 Supabase)。可以尝试用 `--version speed` 参数运行:\n```bash\n./deploy.sh --mode infrastructure --version speed\n```\n这样就不会尝试拉取 Supabase 相关的镜像,只启动基础的 Elasticsearch、PostgreSQL、MinIO、Redis。\n\n---\n\n### 总结\n该错误本质是 **网络导致 Docker 拉取镜像失败**,不是脚本逻辑或配置错误。优先配置镜像加速器或使用代理,或者改用 speed 版本绕过 Supabase 的依赖。\n\n\n----" + }, + { + "role": "user", + "content": "- Image docker.m.daocloud.io/kong:2.8.1 [⣿⡀⣿⣿] 13.63MB / 49.33MB Pulling 80.0s\n - Image docker.m.daocloud.io/supabase/gotrue:v2.170.0 [⣿⣄⣿⣿⣿⣿⣿] 13.08MB / 22.85MB Pulling 80.0s\n - Image docker.m.daocloud.io/supabase/postgres:15.8.1.060 [⠀⠀\n\n解释这里的含义" + }, + { + "role": "assistant", + "content": "这段输出表明 **Docker 正在使用 DaoCloud 提供的镜像加速器(`docker.m.daocloud.io`)拉取 Supabase 相关镜像**,并且下载过程正在进行中。这实际上是一个 **好消息**——之前的 `EOF` 网络错误已经被成功规避了。\n\n下面逐部分解释含义:\n\n---\n\n### 1. 镜像地址前缀:`docker.m.daocloud.io/...`\n- 这是 **DaoCloud 提供的 Docker Hub 镜像加速服务**(中国大陆常用)。\n- 原本的镜像地址是 `docker.io/supabase/kong:2.8.1`,经过加速器后变为 `docker.m.daocloud.io/supabase/kong:2.8.1`。\n- 作用:利用国内 CDN 加速拉取,避免因访问 Docker Hub 慢或中断导致的 `EOF`、超时等错误。\n\n---\n\n### 2. 
进度条 `[⣿⡀⣿⣿]` 和数字 `13.63MB / 49.33MB`\n- 这是 Docker CLI 显示的 **下载进度**。\n- `⣿` 和 `⡀` 等字符是 **Unicode 块状动画**,表示已下载的数据块分布(类似进度条)。\n- `13.63MB / 49.33MB`:已下载 13.63 MB,总大小 49.33 MB。\n- 括号内的方块图形还会动态变化,反映多线程分块下载的每个块的状态。\n\n---\n\n### 3. `Pulling` 状态\n- 表示镜像正在从 registry 拉取到本地。\n- 如果完成,会变成 `Pull complete` 或 `Downloaded`。\n\n---\n\n### 4. `80.0s` (80秒)\n- 这是该镜像拉取已消耗的时间。\n- 由于镜像较大且可能同时拉取多个镜像,80秒仍在正常范围内。\n\n---\n\n### 5. 第三行 `[⠀⠀` 不完整\n- 可能由于终端输出截断或正在初始化,但可以判断 `supabase/postgres` 镜像也在拉取中。\n- 通常每个镜像会显示独立的进度条。\n\n---\n\n### 整体含义总结\n| 现象 | 解释 |\n|------|------|\n| 使用了 `docker.m.daocloud.io` | 成功切换到了国内镜像加速器,解决了之前拉取 `supabase/gotrue` 时的 `EOF` 错误。 |\n| 进度条在增长 | 镜像正在正常下载,没有中断。 |\n| 多个镜像同时拉取 | Docker Compose 默认并行拉取多个服务所需的镜像,这是正常行为。 |\n| 耗时 80 秒且还在继续 | 镜像体积较大(kong 约 50MB,gotrue 约 23MB,postgres 可能更大),需要耐心等待。 |\n\n---\n\n### 你接下来应该做什么?\n- **耐心等待**:让拉取完成,不要按 `Ctrl+C` 中断。\n- 如果拉取过程中某个镜像再次失败(概率较低),可以重试或手动 `docker pull` 对应的镜像。\n- 拉取完成后,脚本会自动继续启动 Supabase 容器。\n\n---\n\n### 补充说明:为什么之前报错,现在好了?\n之前直接访问 `registry-1.docker.io` 遇到 `EOF`(连接被服务器关闭),通常是网络不稳定或被限制。现在脚本很可能因为:\n- 你在执行 `./deploy.sh` 时选择了 **`--is-mainland Y`**,触发脚本加载 `.env.mainland`,其中定义了 `DOCKER_REGISTRY_PREFIX=docker.m.daocloud.io/`。\n- 或者你手动配置了 Docker daemon 的 `registry-mirrors`。\n\n无论哪种方式,现在镜像正在顺利下载,说明网络问题已绕过。\n\n----" + }, + { + "role": "user", + "content": "解释这里对context的分类" + }, + { + "role": "assistant", + "content": "这个表格对 **Agent 执行循环中的上下文(Context)** 做了三类划分,用于帮助开发者理解在构建可靠 Agent 时,哪些环节可以主动控制,以及这些控制信息的生命周期是临时的还是持久的。\n\n---\n\n### 1. Model Context(模型上下文)\n- **控制什么**: \n 每次调用模型时,输入给模型的所有内容,包括:\n - 系统指令(instructions)\n - 对话历史(message history)\n - 可用工具列表(tools)\n - 期望的输出格式(response format,如 JSON、结构化数据等)\n- **生命周期**:**暂态(Transient)** \n 每次模型调用独立构建,调用结束后该上下文不再保留。下一次调用会重新构建(可能包含上一轮的结果,但作为新的输入)。\n\n> 💡 类比:每次给 AI 发消息时,你决定说哪些话、附带哪些信息。\n\n---\n\n### 2. 
Tool Context(工具上下文)\n- **控制什么**: \n 工具在执行时可以访问和产出的内容,包括:\n - 读取/写入 **状态(state)**(例如内存变量、会话状态)\n - 访问 **存储(store)**(如数据库、文件、外部 API)\n - 运行时环境信息(如当前用户、权限、请求 ID)\n- **生命周期**:**持久(Persistent)** \n 工具执行过程中产生的状态或存储变化会跨越多次 Agent 循环保留下来,影响后续调用。\n\n> 💡 类比:工具像是一台机器的按钮,按下后会改变机器的内部状态(比如计数器+1),这个状态一直存在直到被重置。\n\n---\n\n### 3. Life‑cycle Context(生命周期上下文)\n- **控制什么**: \n 发生在 **模型调用与工具调用之间** 以及 **整个 Agent 循环各阶段** 的逻辑,例如:\n - 对话历史总结(summarization)——当历史太长时压缩\n - 护栏(guardrails)——检查模型输出是否合规、安全\n - 日志记录(logging)——记录每一步的输入输出、耗时\n - 重试逻辑、超时处理、错误恢复\n- **生命周期**:**持久(Persistent)** \n 这些控制逻辑在 Agent 运行期间一直生效,跨越多次模型‑工具循环,且通常由框架或开发者预先定义,而不是每次调用时临时指定。\n\n> 💡 类比:像一个流水线上的质检员和记录员,每个零件经过时都会执行相同的检查和记录流程。\n\n---\n\n### 总结对比\n\n| 上下文类型 | 控制的核心内容 | 生命周期 | 举例 |\n|-----------|--------------|----------|------|\n| **Model Context** | 模型看到的输入 | 暂态 | 当前提示词 + 最近 5 轮对话 + 可用工具列表 |\n| **Tool Context** | 工具能读写的状态/存储 | 持久 | 数据库连接、用户会话变量、文件系统 |\n| **Life‑cycle Context** | 模型‑工具之间的流程逻辑 | 持久 | 自动总结、安全检查、日志、重试策略 |\n\n理解这三种上下文有助于设计更可控、可观测的 Agent 系统: \n- **Model Context** 负责每次推理的“素材” \n- **Tool Context** 负责 Agent 的“记忆”与“外部能力” \n- **Life‑cycle Context** 负责整个过程的“规矩”与“韧性”\n----" + }, + { + "role": "user", + "content": "docker compose up nexent-postgresql redis nexent-elasticsearch -d\n这里的-d代表什么" + }, + { + "role": "assistant", + "content": "`-d` 是 `--detach` 的缩写,表示 **后台运行模式**(detached mode)。\n\n具体含义:\n- 执行该命令后,Docker Compose 会启动指定的服务容器(`nexent-postgresql`、`redis`、`nexent-elasticsearch`),但 **不会将容器的日志输出附着到当前终端**。\n- 容器会在后台运行,终端立即返回提示符,你可以继续执行其他命令。\n- 若不使用 `-d`,则容器会以前台模式运行,当前终端会一直显示这些容器的日志输出,直到你按 `Ctrl+C` 停止它们。\n\n简单类比:\n- 不加 `-d`:像在前台运行一个程序,窗口一直被占用。\n- 加 `-d`:像启动一个后台服务,启动后终端恢复可用。\n\n其他常用配合:\n- `docker compose up -d`:后台启动 Compose 文件中定义的所有服务。\n- `docker compose logs`:查看后台运行容器的日志。\n----" + }, + { + "role": "user", + "content": "1. 上述的full模式是不是默认的\n2. 在full模式下,generate_env_for_infrastructure是不是不被调用" + }, + { + "role": "assistant", + "content": "针对你的两个问题,基于你提供的脚本内容及之前 `deploy.sh` 的逻辑,回答如下:\n\n---\n\n### 1. 
“full模式”是不是默认的?\n\n**不是默认的。** \n在 `deploy.sh` 的 `select_deployment_version` 函数中,默认选项是 **`1`(speed 版本)**,而不是 `2`(full 版本)。 \n相关代码片段:\n\n```bash\nread -p \"👉 Enter your choice [1/2] (default: 1): \" version_choice\ncase $version_choice in\n 2|\"full\") ... ;;\n 1|\"speed\"|*) # 默认走这里\n export DEPLOYMENT_VERSION=\"speed\"\n echo \"✅ Selected speed version ⚡️\"\n ;;\nesac\n```\n\n因此,除非用户明确输入 `2` 或 `full`,否则会使用 **speed 版本**。\n\n---\n\n### 2. 在 full 模式下,`generate_env_for_infrastructure` 是不是不被调用?\n\n**与 full 模式无关,取决于部署模式(deployment mode)**。 \n在 `deploy.sh` 的 `main_deploy` 函数中,`generate_env_for_infrastructure` 只在 **`DEPLOYMENT_MODE = infrastructure`** 时才会被调用:\n\n```bash\n# Special handling for infrastructure mode\nif [ \"$DEPLOYMENT_MODE\" = \"infrastructure\" ]; then\n generate_env_for_infrastructure || { echo \"❌ Environment generation failed\"; exit 1; }\n ...\nfi\n```\n\n- `DEPLOYMENT_MODE` 由 `--mode` 参数或交互选择决定,可取 `development`(默认)、`infrastructure`、`production`。\n- **只要不是 `infrastructure` 模式,无论 `DEPLOYMENT_VERSION` 是 `speed` 还是 `full`,都不会调用 `generate_env_for_infrastructure`**。\n- 换句话说,`full` 模式下如果选择了 `development` 或 `production` 模式,该函数不会被调用;只有在 `infrastructure` 模式下才会调用。\n\n---\n\n### 总结\n\n| 问题 | 答案 |\n|------|------|\n| full 模式是默认的吗? | 否,默认是 speed 版本 |\n| full 模式下 `generate_env_for_infrastructure` 是否被调用? | 与 full 无关,只取决于是否处于 `infrastructure` 部署模式 |\n\n如果你希望 full 版本也能自动生成根目录的 `.env` 文件(用于开发),可以手动将部署模式设置为 `infrastructure`,或者修改脚本逻辑。\n\n---" + }, + { + "role": "user", + "content": "假设我运行.deploy.sh,最开始输出的命令是什么呢?" + }, + { + "role": "assistant", + "content": "当你运行 `./deploy.sh`(或 `bash deploy.sh`)时,脚本**最先输出的是 Docker Compose 版本的检测信息**。\n\n具体顺序如下:\n\n1. **Bash 版本检查**(如果不符合要求会报错,但正常情况下不输出任何提示)\n2. **Docker Compose 版本检测**: \n - 如果检测到 V2,输出类似: \n `Detected Docker Compose V2, version: v2.24.0`\n - 如果检测到 V1,输出类似: \n `Detected Docker Compose V1, version: 1.29.2`\n - 如果未找到或版本过旧,输出错误信息并退出。\n3. 
**然后才进入主部署函数 `main_deploy`**,其第一行输出: \n `🚀 Nexent Deployment Script 🚀`\n\n所以,**你看到的第一条输出是 Docker Compose 的版本信息**(除非 Docker Compose 未安装或版本检测失败)。\n\n---" + }, + { + "role": "user", + "content": "generate_env_for_infrastructure() { # Function to generate complete environment file for infrastructure mode using generate_env.sh\n这里的 infrastructure 是什么含义呢" + }, + { + "role": "assistant", + "content": "在 `deploy.sh` 脚本的上下文中,**`infrastructure` 指的是一种部署模式(deployment mode)**,其具体含义如下:\n\n---\n\n### `infrastructure` 模式的核心特征\n\n| 维度 | 说明 |\n|------|------|\n| **启动的服务** | 仅启动 **基础支撑服务**(如 `nexent-elasticsearch`、`nexent-postgresql`、`nexent-minio`、`redis`,以及可选的 `nexent-openssh-server`)。 |\n| **不启动的服务** | **不启动核心业务服务**(如 `nexent-config`、`nexent-runtime`、`nexent-web`、`nexent-mcp`、`nexent-northbound` 等)。 |\n| **目标用户** | 开发者或需要 **单独管理/调试基础设施** 的场景,例如在宿主机上运行核心服务(通过 IDE 或命令行),但希望使用容器化的数据库、缓存、对象存储等。 |\n| **环境文件生成** | 此模式下会调用 `generate_env_for_infrastructure` 函数,该函数执行 `generate_env.sh` 脚本,**将容器内服务的访问地址改写为 `localhost:映射端口`**,使得宿主机上的代码可以直接通过 `localhost` 连接这些基础设施。 |\n\n---\n\n### 为什么叫 `infrastructure`?\n\n这里的 “infrastructure” 借用了软件架构中的概念:\n- **基础设施** 指的是支撑应用运行的下层组件:数据库、缓存、消息队列、对象存储、搜索引擎等。\n- 相对于 “核心业务服务”(即实现业务逻辑的应用代码),基础设施通常是通用的、可替换的、独立于业务逻辑的。\n\n因此,`infrastructure` 模式即 **只启动这些底层支撑组件,而让用户自行控制业务服务的启动方式**(例如在宿主机上以开发模式运行)。\n\n---\n\n### 与其他模式的对比\n\n| 模式 | 启动内容 | 适用场景 |\n|------|----------|----------|\n| **development** | 基础设施 + 核心业务服务(所有容器) | 全容器化开发,一键启动所有服务 |\n| **infrastructure** | 仅基础设施服务 | 宿主机开发,仅依赖容器化的数据库等 |\n| **production** | 基础设施 + 核心业务服务,但端口暴露最少、禁用调试仪表盘 | 生产环境部署 |\n\n---\n\n### 在 `generate_env_for_infrastructure` 中的具体作用\n\n该函数专为 `infrastructure` 模式设计,它的任务是:\n1. 确保根目录存在 `.env` 文件。\n2. 将 MinIO、Elasticsearch、PostgreSQL、Redis 等服务的地址 **强制设置为 `localhost:映射端口`**(如 `http://localhost:9010`、`localhost:5434` 等)。\n3. 
写入已生成的密钥(MinIO AK/SK、Elasticsearch API Key 等)。\n\n这样,当开发者在宿主机上运行 `config`、`runtime` 等服务时,它们可以直接通过 `localhost` 连接基础设施容器,而无需通过 Docker 内部网络。\n\n---\n\n### 总结\n\n**`infrastructure` 模式下的 `generate_env_for_infrastructure` 函数,本质是为“宿主机开发 + 容器化基础设施”这种混合架构生成正确的连接配置**。\n\n---" + }, + { + "role": "user", + "content": "将容器内服务的访问地址改写为 localhost:映射端口,使得宿主机上的代码可以直接通过 localhost 连接这些基础设施。这里的改写访问地址如何实现呢?可行吗" + }, + { + "role": "assistant", + "content": "在 `generate_env_for_infrastructure` 函数中,“将容器内服务的访问地址改写为 `localhost:映射端口`”是通过 **`generate_env.sh` 脚本中的 `update_env_file` 函数** 实现的。下面详细解释其实现方式及可行性。\n\n---\n\n## 一、实现方式\n\n`generate_env.sh` 中的 `update_env_file` 函数使用 `sed` 命令强制修改根目录下 `.env` 文件中的相关变量,将原本可能指向容器内部服务名(如 `nexent-postgresql`)或内部端口的地址,统一改为 `localhost` + 宿主机映射端口。\n\n**具体代码片段示例**(来自你之前提供的 `generate_env.sh`):\n\n```bash\n# ELASTICSEARCH_HOST\nif grep -q \"^ELASTICSEARCH_HOST=\" ../.env; then\n sed -i.bak \"s~^ELASTICSEARCH_HOST=.*~ELASTICSEARCH_HOST=http://localhost:9210~\" ../.env\nelse\n echo \"ELASTICSEARCH_HOST=http://localhost:9210\" >> ../.env\nfi\n\n# MINIO_ENDPOINT\nif grep -q \"^MINIO_ENDPOINT=\" ../.env; then\n sed -i.bak \"s~^MINIO_ENDPOINT=.*~MINIO_ENDPOINT=http://localhost:9010~\" ../.env\nelse\n echo \"MINIO_ENDPOINT=http://localhost:9010\" >> ../.env\nfi\n\n# POSTGRES_HOST 和 POSTGRES_PORT\nif grep -q \"^POSTGRES_HOST=\" ../.env; then\n sed -i.bak \"s~^POSTGRES_HOST=.*~POSTGRES_HOST=localhost~\" ../.env\nelse\n echo \"POSTGRES_HOST=localhost\" >> ../.env\nfi\nif grep -q \"^POSTGRES_PORT=\" ../.env; then\n sed -i.bak \"s~^POSTGRES_PORT=.*~POSTGRES_PORT=5434~\" ../.env\nelse\n echo \"POSTGRES_PORT=5434\" >> ../.env\nfi\n\n# REDIS_URL\nif grep -q \"^REDIS_URL=\" ../.env; then\n sed -i.bak \"s~^REDIS_URL=.*~REDIS_URL=redis://localhost:6379/0~\" ../.env\nelse\n echo \"REDIS_URL=redis://localhost:6379/0\" >> ../.env\nfi\n```\n\n**关键点**:\n- 所有地址都写死为 `localhost` 加上具体的端口号(如 `9210`、`9010`、`5434`、`6379`)。\n- 这些端口正是 `docker-compose.yml` 中为各个服务定义的 **宿主机映射端口**(从 `docker ps` 
输出可以看到:`0.0.0.0:9210->9200/tcp`、`0.0.0.0:9010->9000/tcp`、`0.0.0.0:5434->5432/tcp`、`0.0.0.0:6379->6379/tcp`)。\n\n---\n\n## 二、为什么可行?\n\n### 1. 前提条件:容器已正确映射端口到宿主机\n在 `infrastructure` 模式下,Docker Compose 启动的服务(如 `nexent-elasticsearch`、`nexent-minio`、`nexent-postgresql`、`redis`)都通过 `ports` 指令将容器端口暴露到宿主机。例如:\n\n```yaml\nservices:\n nexent-elasticsearch:\n ports:\n - \"9210:9200\"\n nexent-minio:\n ports:\n - \"9010:9000\"\n - \"9011:9001\"\n nexent-postgresql:\n ports:\n - \"5434:5432\"\n redis:\n ports:\n - \"6379:6379\"\n```\n\n这使得宿主机上的任何进程都可以通过 `localhost:映射端口` 访问这些服务。\n\n### 2. 使用场景:宿主机上直接运行核心服务\n`infrastructure` 模式的设计目标就是 **让开发者可以在宿主机上运行核心业务服务**(如 `nexent-config`、`nexent-runtime` 等),而只将数据库、缓存、对象存储等基础设施容器化。此时,宿主机上的代码(例如通过 IDE 或命令行启动的 Python 服务)需要连接这些基础设施。使用 `localhost:映射端口` 是最直接、最标准的方式,完全可行。\n\n### 3. 验证可行性\n你可以在宿主机上执行以下命令测试连通性:\n```bash\n# 测试 Elasticsearch\ncurl http://localhost:9210\n\n# 测试 MinIO\ncurl http://localhost:9010\n\n# 测试 Redis\nredis-cli -h localhost -p 6379 ping\n\n# 测试 PostgreSQL\npsql -h localhost -p 5434 -U postgres -d postgres\n```\n只要容器正在运行且端口映射正确,这些命令都会成功。\n\n---\n\n## 三、需要注意的边界情况\n\n| 情况 | 是否可行 | 说明 |\n|------|----------|------|\n| 宿主机上的进程(非容器)访问 `localhost` | ✅ 可行 | 标准 Docker 端口映射 |\n| 其他**容器内**的进程访问 `localhost` | ❌ 不可行 | 容器内的 `localhost` 指向容器自身,而非宿主机。如果需要从另一个容器访问,应使用容器名称或宿主机 IP(如 `host.docker.internal` on Mac/Windows,或 `172.17.0.1` on Linux)。但 `infrastructure` 模式不涉及其他容器访问,所以不影响设计目标。 |\n| 宿主机上的进程访问时端口被占用 | ❌ 会导致失败 | 如果宿主机上已有其他进程占用了 `9210`、`9010` 等端口,Docker 容器无法绑定,服务启动失败。`deploy.sh` 中的 `check_ports_in_env_files` 函数会预先检查端口占用。 |\n| Windows (Git Bash) 环境 | ✅ 可行 | `localhost` 在 Windows 上同样有效,Docker Desktop 会自动处理端口转发。 |\n\n---\n\n## 四、总结\n\n- **实现方式**:通过 `generate_env.sh` 中的 `sed` 命令,将 `.env` 文件中的服务地址硬编码为 `localhost:映射端口`。\n- **可行性**:**完全可行**,这是 Docker 端口映射的标准用法。前提是基础设施容器必须将端口正确发布到宿主机,并且宿主机上没有端口冲突。\n- **适用范围**:专为 `infrastructure` 模式设计,目的是让宿主机上运行的代码能够无缝连接容器化的基础设施服务。\n\n因此,该设计是合理且有效的。" + } +] \ No newline at end of file diff --git 
from dataclasses import dataclass


@dataclass
class EvalResult:
    """Outcome of evaluating one text against one keyword check."""

    passed: bool  # True only when every configured check fully passed
    score: float  # partial-credit score; average of the sub-scores
    details: dict  # per-check breakdown: {"matched", "total", "ok"}


def _normalize(text) -> str:
    """Return *text* lower-cased, treating ``None`` as the empty string.

    Agent answers and summaries can be missing; a missing text simply
    matches no keyword instead of raising ``AttributeError``.
    """
    return "" if text is None else text.lower()


def contains_all(text: str, keywords: list[str]) -> bool:
    """Return True when every keyword occurs in *text* (case-insensitive)."""
    haystack = _normalize(text)
    return all(k.lower() in haystack for k in keywords)


def contains_any(text: str, keywords: list[str]) -> bool:
    """Return True when at least one keyword occurs in *text* (case-insensitive)."""
    haystack = _normalize(text)
    return any(k.lower() in haystack for k in keywords)


def count_matches(text: str, keywords: list[str]) -> int:
    """Count how many keywords are present in the text (case-insensitive)."""
    haystack = _normalize(text)
    return sum(1 for k in keywords if k.lower() in haystack)


def eval_text(text: str, check: dict) -> EvalResult:
    """Evaluate text against keyword checks with partial scoring.

    Scoring rules:
    - must_contain: score = matched_count / total_keywords
      (1.0 if all present, 0.6 if 3/5 present, etc.; 1.0 for an empty list)
    - must_contain_any: score = 1.0 if any present, 0.0 otherwise
    - When both are present, score is the average of both sub-scores.
    - When neither key is present, the check passes with score 1.0.
    - passed is True only when all checks fully pass (backward compatible).

    Args:
        text: Text to inspect; ``None`` is treated as empty text.
        check: Mapping that may contain ``must_contain`` and/or
            ``must_contain_any`` keyword lists. Other keys are ignored.

    Returns:
        EvalResult with the overall verdict, score and per-check details.
    """
    passed = True
    details = {}
    scores = []

    if "must_contain" in check:
        keywords = check["must_contain"]
        matched = count_matches(text, keywords)
        ok = matched == len(keywords)
        details["must_contain"] = {
            "matched": matched,
            "total": len(keywords),
            "ok": ok,
        }
        # An empty keyword list is vacuously satisfied.
        scores.append(matched / len(keywords) if keywords else 1.0)
        passed = passed and ok

    if "must_contain_any" in check:
        keywords = check["must_contain_any"]
        matched = count_matches(text, keywords)
        # "any present" is exactly "at least one match"; no second scan needed.
        ok = matched > 0
        details["must_contain_any"] = {
            "matched": matched,
            "total": len(keywords),
            "ok": ok,
        }
        scores.append(1.0 if ok else 0.0)
        passed = passed and ok

    score = sum(scores) / len(scores) if scores else (1.0 if passed else 0.0)

    return EvalResult(
        passed=passed,
        score=score,
        details=details,
    )


def average_score(results: list[EvalResult]) -> float:
    """Return the mean ``score`` of *results*, or 0.0 for an empty list."""
    if not results:
        return 0.0
    return sum(r.score for r in results) / len(results)
+ +Three key dimensions: +- **Continuation**: Can the task continue after compression +- **Memory Retention**: Can key states be remembered after compression +- **Token Reduction**: Does token count effectively decrease + +--- + +## 2. Test Structure: Two Experiments Per Case + +Each `cases/<case_id>/` directory contains: +- `history.json`: Initial multi-turn conversation history (user/assistant pairs) +- `case.json`: Test configuration and inspection criteria + +Each case runs two comparison experiments: + +| Group | Compression Status | Purpose | +|---|---|---| +| **Baseline** | `enabled=False` | No compression, measure capability ceiling | +| **Compressed** | `enabled=True` + custom params | Enable compression, measure actual performance | + +--- + +## 3. Case Configuration Key Fields + +```json +{ + "queries": [], // Multi-turn continuation questions + "probes": [], // Memory probe questions (test early history) + "task_checks": [], // Task output checks + "summary_checks": [], // Static summary checks + "compressed_config": {} // Compression parameter overrides +} +``` + +--- + +## 4. Three Evaluation Dimensions + +### 4.1 Continuation Evaluation (Task Continuation Capability) + +Simulate real multi-turn Agent interaction: +- Execute `queries` in sequence, append `(query, answer)` to history each turn +- Compressed group **shares the same ContextManager**, compression **continuously triggers** during execution +- Score `final_answer` at specified turns with `task_checks` + +**Metric**: `task_success_retention = compressed_task_score / baseline_task_score` + +--- + +### 4.2 Probe Evaluation (Memory Retention Capability) + +Test whether the compressed Agent can **utilize** residual information in the summary to answer questions about early history. + +**Key Design** (avoid redundant LLM calls): +1. Get summary and compression boundary from compressed run's `export_summary()` +2. 
`build_precompressed_history()` constructs precompressed history: + - Compressed prefix pairs → replaced with a single user summary message + - Retained tail pairs → preserved verbatim +3. All probes **reuse the same** precompressed history +4. Each probe takes a `deep copy` of that history and then **runs independently** with compression disabled + +The baseline probe also runs on the full history after the compressed run ends, establishing the ceiling. + +**Metric**: `probe_retention = compressed_probe_score / baseline_probe_score` + +**Probe Construction Principle**: Only ask about information in the compressed region (early history). A probe that asks about the retained tail region cannot measure memory retention, because that text is still present verbatim. + +--- + +### 4.3 Static Summary Inspection (Compressor Static Quality) + +Runs without the Agent and directly checks whether the summary text contains key information. + +- Apply `summary_checks` to `previous_summary + current_summary` +- Distinguish failure root causes from Probe Eval: + +| | Probe Eval | Static Inspection | +|---|---|---| +| Input | Complete compressed context (summary + retained tail steps) | Summary text only | +| Execution | Run Agent (LLM) | Direct text inspection | +| What it tests | Agent **can utilize** residual information | Compressor **did retain** key information | +| Failure meaning | Summary has it but Agent didn't use it | Summary doesn't have it at all | + +--- + +## 5. Token Reduction Calculation + +Two-level fallback: +1. **Prefer ContextManager actual token statistics**: Take `last_uncompressed` vs `last_compressed` from the last turn of compressed run +2. **Fallback text estimation**: `1 - compressed.final_tokens / baseline.final_tokens` + +--- + +## 6. 
Final Report Structure + +```json +{ + "case_id": "...", + "baseline": { "task_score", "probe_score", "final_tokens" }, + "compressed": { "task_score", "probe_score", "final_tokens", "cm_stats", "cm_summary" }, + "metrics": { + "task_success_retention": ..., // Task continuation retention rate + "probe_retention": ..., // Memory probe retention rate + "token_reduction": ..., // Token reduction rate + "summary_score": ... // Static summary score + }, + "task_eval": [...], + "probe_eval": { "baseline": [...], "compressed": [...] }, + "summary_inspection": [...] +} +``` + +All cases aggregated to `reports/summary.json`. + +--- + +## 7. Key Design Principles Summary + +1. **Stateful Continuation**: Compressed group shares `ContextManager`, simulates real execution +2. **Probe Isolation**: Each probe `deep copy` + independent run, no cross-contamination +3. **Probe Reuses Compression Result**: Precompressed history built once, avoid redundant LLM calls +4. **Inspection vs Probe Separation**: Distinguish "compressor missed it" vs "Agent didn't use it" failures +5. **Functional Testing Only**: No text similarity measurement, test Agent's actual working capability in compressed context \ No newline at end of file diff --git a/sdk/benchmark/manual_cases/summary_inspector.py b/sdk/benchmark/manual_cases/summary_inspector.py new file mode 100644 index 000000000..4dc459af9 --- /dev/null +++ b/sdk/benchmark/manual_cases/summary_inspector.py @@ -0,0 +1,330 @@ +# -*- coding: utf-8 -*- +""" +Standalone Summary Inspector — quick evaluation of compression prompt/schema quality. + +Completely independent from test_benchmark.py and the cases/ directory. +Uses compress_history_offline to compress history and checks whether the +resulting summary retains key information. No agent runs needed — just +one LLM call per inspection + text-based checks. 
def create_model(temperature: float = 0.1):
    """Create an LLM model for offline compression.

    Args:
        temperature: Sampling temperature forwarded to the model.

    Returns:
        An ``OpenAIModel`` configured from the module-level ``LLM_API_KEY``,
        ``LLM_MODEL_NAME`` and ``LLM_API_URL`` settings.
    """
    from nexent.core.utils.observer import MessageObserver

    model_config = ModelConfig(
        cite_name="inspector_model",
        api_key=LLM_API_KEY,
        model_name=LLM_MODEL_NAME,
        url=LLM_API_URL,
        temperature=temperature,
        # NOTE(review): TLS certificate verification is switched off here.
        # Acceptable for a local benchmark endpoint; confirm before pointing
        # this script at a remote service.
        ssl_verify=False,
    )
    return OpenAIModel(
        observer=MessageObserver(),
        model_id=model_config.model_name,
        api_key=model_config.api_key,
        api_base=model_config.url,
        temperature=model_config.temperature,
        top_p=model_config.top_p,
        ssl_verify=model_config.ssl_verify,
    )


def history_to_pairs(history: list) -> list[tuple[str, str]]:
    """Convert [{role, content}] to [(user_text, assistant_text)] pairs.

    Consecutive user messages are merged; same for assistant messages,
    so the output is a clean alternating sequence of pairs.

    Notes:
        - Messages whose role is neither "user" nor "assistant" are ignored.
        - A ``None`` content is treated as an empty string.
        - Assistant messages that precede the first user message are emitted
          as a pair with an empty user text once a user message arrives.
        - Trailing user messages with no assistant reply are dropped.
    """
    pairs = []
    user_parts = []
    assistant_parts = []

    for entry in history:
        role = entry["role"]
        # JSON ``null`` content would otherwise break "\n".join below.
        content = entry.get("content") or ""
        if role == "user":
            if assistant_parts:
                # A user turn after assistant text closes the previous pair.
                pairs.append((
                    "\n".join(user_parts).strip(),
                    "\n".join(assistant_parts).strip(),
                ))
                user_parts = []
                assistant_parts = []
            user_parts.append(content)
        elif role == "assistant":
            assistant_parts.append(content)

    if user_parts and assistant_parts:
        pairs.append((
            "\n".join(user_parts).strip(),
            "\n".join(assistant_parts).strip(),
        ))

    return pairs


def build_config(overrides: "dict | None" = None) -> "ContextManagerConfig":
    """Build ContextManagerConfig with optional field overrides.

    Args:
        overrides: Mapping of config attribute name to value. Keys that are
            not attributes of the default config are reported and skipped.

    Returns:
        A ``ContextManagerConfig`` with the known overrides applied.
    """
    config = ContextManagerConfig()
    if not overrides:
        return config

    for key, value in overrides.items():
        if hasattr(config, key):
            setattr(config, key, value)
        else:
            print(f"WARNING: unknown config field '{key}', ignoring")

    return config


def _missing_keywords(summary: str, check: dict) -> list:
    """Return the keywords of *check* that are absent from *summary*.

    For ``must_contain`` only the absent keywords are listed; for a failed
    ``must_contain_any`` the whole list is reported, since none matched.
    """
    lowered = summary.lower()
    missing = [
        k for k in check.get("must_contain", [])
        if k.lower() not in lowered
    ]
    any_of = check.get("must_contain_any", [])
    if any_of and not any(k.lower() in lowered for k in any_of):
        missing.extend(any_of)
    return missing


def run_inspection(
    inspection_dir: str,
    model,
    config: "ContextManagerConfig",
) -> dict:
    """Run summary inspection for a single inspection set.

    Reads:
        - <inspection_dir>/history.json
        - <inspection_dir>/checks.json

    Writes:
        - <inspection_dir>/_result.json (via ``_write_result``; not written
          when the inspection is skipped)

    Returns:
        dict with name, summary, checks, score, and compression metadata,
        or a ``{"name", "skipped", "reason"}`` dict when inputs are missing.
    """
    name = os.path.basename(inspection_dir)

    # Load history
    history_path = os.path.join(inspection_dir, "history.json")
    if not os.path.exists(history_path):
        print(f" SKIP: history.json not found in {inspection_dir}")
        return {"name": name, "skipped": True, "reason": "no history.json"}

    with open(history_path, "r", encoding="utf-8") as f:
        history = json.load(f)

    # Load checks
    checks_path = os.path.join(inspection_dir, "checks.json")
    if not os.path.exists(checks_path):
        print(f" SKIP: checks.json not found in {inspection_dir}")
        return {"name": name, "skipped": True, "reason": "no checks.json"}

    with open(checks_path, "r", encoding="utf-8") as f:
        checks = json.load(f)

    if not checks:
        print(f" SKIP: checks.json is empty for {name}")
        return {"name": name, "skipped": True, "reason": "empty checks"}

    # Convert history to pairs
    pairs = history_to_pairs(history)
    print(f" History: {len(history)} messages -> {len(pairs)} pairs")

    # Compress
    result = compress_history_offline(pairs=pairs, model=model, config=config)
    summary = result.get("summary") or ""
    is_fallback = result.get("is_fallback", False)
    is_incremental = result.get("is_incremental", False)
    input_chars = result.get("input_chars", 0)

    if not summary:
        print(f" FAILED: compression returned no summary (fallback={is_fallback})")
        report = {
            "name": name,
            "summary": None,
            "is_fallback": is_fallback,
            "input_chars": input_chars,
            "checks": [],
            "score": 0.0,
        }
        _write_result(inspection_dir, report)
        return report

    print(f" Summary: {len(summary)} chars, fallback={is_fallback}, incremental={is_incremental}")

    # Evaluate checks against summary
    check_results = []
    for check in checks:
        eval_result = eval_text(summary, check)
        check_results.append({
            "check": check,
            "passed": eval_result.passed,
            "score": eval_result.score,
            "details": eval_result.details,
        })

    total_score = sum(r["score"] for r in check_results) / max(len(check_results), 1)
    passed_count = sum(1 for r in check_results if r["passed"])

    print(f" Result: {passed_count}/{len(check_results)} checks passed, score={total_score:.2f}")

    for r in check_results:
        if not r["passed"]:
            desc = r["check"].get("description", "")
            # Report only the keywords that are really absent, so a partial
            # match is not misreported as entirely missing.
            keywords = _missing_keywords(summary, r["check"])
            print(f" FAIL: {desc} -- missing {keywords}")

    report = {
        "name": name,
        "summary": summary,
        "is_fallback": is_fallback,
        "is_incremental": is_incremental,
        "input_chars": input_chars,
        "summary_chars": len(summary),
        "checks": check_results,
        "score": total_score,
        "passed": passed_count,
        "total": len(check_results),
    }

    _write_result(inspection_dir, report)
    return report


def _write_result(inspection_dir: str, report: dict):
    """Write ``_result.json`` for one inspection.

    The ``summary`` field is left out to keep the file small. The raw
    summary text is saved separately by ``main`` when ``--save-summary``
    is given.
    """
    result_path = os.path.join(inspection_dir, "_result.json")
    result_out = {k: v for k, v in report.items() if k != "summary"}
    with open(result_path, "w", encoding="utf-8") as f:
        json.dump(result_out, f, ensure_ascii=False, indent=2, default=str)
    print(f" Result saved to {result_path}")


def main():
    """Command-line entry point: run one or all inspections and print scores."""
    parser = argparse.ArgumentParser(
        description="Standalone Summary Inspector -- quick compression quality check"
    )
    parser.add_argument(
        "-n", "--name",
        type=str,
        default=None,
        help="Run a specific inspection by name (directory under inspections/)",
    )
    parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="Path to a JSON file with ContextManagerConfig field overrides",
    )
    parser.add_argument(
        "--save-summary",
        action="store_true",
        default=False,
        help="Also save the raw summary text to _summary.txt alongside the result",
    )
    args = parser.parse_args()

    # Discover inspections
    if args.name:
        inspection_dirs = [os.path.join(INSPECTIONS_DIR, args.name)]
        if not os.path.isdir(inspection_dirs[0]):
            print(f"ERROR: inspection directory not found: {inspection_dirs[0]}")
            sys.exit(1)
    else:
        # Only directories that contain a history.json count as inspections.
        inspection_dirs = sorted(glob.glob(os.path.join(INSPECTIONS_DIR, "*/history.json")))
        inspection_dirs = [os.path.dirname(p) for p in inspection_dirs]

    if not inspection_dirs:
        print(f"No inspections found under {INSPECTIONS_DIR}/*/\n"
              f"Create one with: mkdir -p {INSPECTIONS_DIR}/my_test\n"
              f"Then add history.json and checks.json")
        sys.exit(1)

    # Build config
    config_overrides = {}
    if args.config:
        with open(args.config, "r", encoding="utf-8") as f:
            config_overrides = json.load(f)

    config = build_config(config_overrides)
    # Compression is always enabled here, whatever the overrides say.
    config.enabled = True

    # Create model
    model = create_model()

    # Run inspection for each
    all_results = []
    for inspection_dir in inspection_dirs:
        name = os.path.basename(inspection_dir)
        print(f"\n===== Inspecting: {name} =====")

        report = run_inspection(inspection_dir, model, config)
        all_results.append(report)

        # Optionally save raw summary text
        if args.save_summary and report.get("summary"):
            summary_path = os.path.join(inspection_dir, "_summary.txt")
            with open(summary_path, "w", encoding="utf-8") as f:
                f.write(report["summary"])
            print(f" Summary saved to {summary_path}")

    # Print overall summary
    print("\n===== Overall =====")
    for r in all_results:
        if r.get("skipped"):
            print(f" {r['name']}: SKIPPED ({r['reason']})")
        else:
            print(f" {r['name']}: {r.get('passed', 0)}/{r.get('total', 0)} passed, score={r.get('score', 0):.2f}")

    active = [r for r in all_results if not r.get("skipped")]
    if active:
        avg_score = sum(r.get("score", 0) for r in active) / max(len(active), 1)
        print(f"\n Average score: {avg_score:.2f}")


if __name__ == "__main__":
    main()
b/sdk/benchmark/manual_cases/test_benchmark.py @@ -0,0 +1,726 @@ +import asyncio +import copy +import glob +import json +import os +import sys +import argparse + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import paths # noqa: F401 — side-effect: adds sdk/, backend/ to sys.path + +from agent_runner import ( + build_agent_run_info_with_custom_prompt, + run_agent_with_tracking, + parse_conversation_to_history, + AgentHistory, + ContextManagerConfig, +) + +from nexent.core.agents.agent_context import ContextManager +from nexent.core.utils.token_estimation import estimate_tokens_text + +from eval_utils import eval_text, average_score + +# Lean benchmark system prompt — generic, not task-specific. +# Strips the verbose platform scaffolding (File URL Guide, Reference Marks, +# safety principles, etc.) to minimize token overhead while retaining the +# core execution loop instructions the agent needs to function. +BENCHMARK_SYSTEM_PROMPT = """You are a helpful assistant. Answer the user's questions based on the conversation history and your knowledge. + +- Be precise and concise. +- When the answer depends on information from earlier conversation, refer to it accurately. +- Do not fabricate information you do not know. +- Use final_answer to submit your response. + +Now start!""" + + +# --- Custom summary schema and prompts for knowledge-discussion benchmarks --- +# These override the default 10-field Hermes schema from summary_config.py +# with a deduplicated 6-field schema (~620 word budget) that merges +# completed_work + resolved_questions into "progress" and restricts +# key_facts to values NOT already stated in progress, eliminating +# the 3-field redundancy that caused output bloat in incremental updates. +# +# KEY DESIGN PRINCIPLE for incremental compression: the output must be +# approximately the SAME size as the initial summary (~620 words). 
The +# incremental prompt treats old+new as a unified corpus and REWRITES the +# entire summary from scratch, rather than appending to the old one. +# This prevents output-token linear growth that would itself exceed +# token_threshold and defeat the purpose of compression. + +BENCHMARK_SUMMARY_SYSTEM_PROMPT = ( + "You are a summarization agent creating a compact working-memory checkpoint. " + "Treat the conversation turns below as source material, not as a transcript to preserve. " + "Your job is to produce a fixed-size JSON summary that preserves only the information " + "needed to continue the conversation correctly later.\n\n" + + "Output rules:\n" + "1. Produce only strict JSON. Do not add greeting, preamble, markdown, or explanation.\n" + "2. Write in the same language as the user's most recent message. Do not translate unless needed.\n" + "3. Never include API keys, tokens, passwords, secrets, credentials, or connection strings. " + "Replace any such values with [REDACTED].\n\n" + + "Compression goal:\n" + "The summary is working memory, not a historical log. " + "Do not list every question, every answer, or every conversation turn. " + "Group information by theme and keep only facts that are likely to matter for future continuation.\n\n" + + "Field constraints:\n" + "1. 'active_task' must describe only the current unfulfilled user request; if none, write 'None'.\n" + "2. 'goal' must describe the current overall objective in <=25 words.\n" + "3. 'state' must contain at most 6 numbered items. Never create item 7 or higher. " + "Each item must be <=45 words. Merge related topics into one item. " + "Do not organize by conversation order; organize by semantic importance.\n" + "4. 'decisions' must contain at most 5 short confirmed conclusions or choices. " + "Do not repeat facts already fully stated in 'state'.\n" + "5. 'open_items' must contain only unresolved questions or pending user requests. " + "If none, write 'None'.\n" + "6. 
'verbatim_facts' may contain at most 12 raw values, formulas, thresholds, exact model names, " + "or identifiers that must be copied exactly later. " + "Before output, remove any item whose exact value already appears in 'state' or 'decisions'. " + "If no extra raw facts remain, write 'None'.\n\n" + + "Information priority:\n" + "Critical current task and constraints > final conclusions > decisions > exact values needed later > " + "background context. Drop vague descriptions, repeated facts, superseded intermediate reasoning, " + "and completed Q&A that no longer affects future work.\n\n" + + "Budget:\n" + "The total output must not exceed 620 words. Prefer shorter output. " + "If the content is too large, compress in this order: " + "(1) merge related state items; " + "(2) remove completed historical details; " + "(3) keep only the most diagnostic numbers; " + "(4) move only non-duplicated raw values to 'verbatim_facts'; " + "(5) write 'None' for fields with no current utility.\n\n" + + "Return strict JSON only." +) + + +BENCHMARK_INCREMENTAL_SUMMARY_SYSTEM_PROMPT = ( + "You are a summarization agent rewriting a compact working-memory checkpoint. " + "You receive a Previous Summary and New Conversations. Produce one fresh JSON summary " + "that preserves only the information needed to continue the conversation correctly. " + "Do not preserve discussion history for its own sake. The previous summary is source material, " + "not text to copy.\n\n" + + "Hard constraints:\n" + "1. The output must be no longer than the previous summary and must not exceed 620 words.\n" + "2. The 'state' field must contain at most 6 numbered items. Never create item 7 or higher.\n" + "3. When new information is added, older lower-utility information MUST be merged, generalized, or deleted.\n" + "4. Do not append to the previous summary. Rewrite by theme, not by conversation order.\n" + "5. Completed Q&A should become conclusions, not separate historical entries.\n" + "6. 
Preserve exact numbers only when they are needed for future correctness. If multiple numbers support the same conclusion, keep only the most diagnostic ones.\n" + "7. 'verbatim_facts' may contain at most 12 raw values/formulas/names. Remove any item already present in 'state' or 'decisions'. If none remain, write 'None'.\n" + "8. Update active_task, state, and open_items to reflect the current state.\n" + "9. Write in the same language as the user's most recent message.\n" + "10. Never include API keys, tokens, passwords, credentials, or connection strings; replace them with [REDACTED].\n\n" + + "Output strict JSON only. No markdown." +) + +BENCHMARK_SUMMARY_SCHEMA = { + "active_task": ( + "用户当前尚未完成的最新请求;如果没有,写 'None'。" + "必须是当前任务,不是历史任务。<=25 words" + ), + + "goal": ( + "对话的总体目标或当前工作方向。" + "只保留后续继续对话所需的目标。<=25 words" + ), + + "state": ( + "当前压缩后的工作记忆,不是历史日志。" + "最多 6 条编号条目;每条 <=45 words。" + "按主题合并信息,不按对话顺序罗列。" + "包括已经确定的结论、关键设计、关键结果和必要上下文。" + ), + + "decisions": ( + "已经确认、后续可能需要引用的结论或选择。" + "最多 5 条;每条 <=25 words。" + "不得重复 state 中已经完整表达的信息。" + ), + + "open_items": ( + "尚未解决的问题、待办事项或用户明确要求继续处理的内容。" + "如果没有,写 'None'。<=30 words" + ), + + "verbatim_facts": ( + "必须逐字保留的数字、公式、模型名、阈值或专有名词。" + "最多 12 项,用分号分隔。" + "不得包含已经出现在 state 或 decisions 中的事实。" + "如果没有额外需要保留的事实,写 'None'。" + ), +} +def history_to_text(history: list[AgentHistory]) -> str: + return "\n".join([f"{h.role}: {h.content}" for h in history]) + + +async def run_multi_turn_for_benchmark( + queries: list[str], + base_history: list[AgentHistory], + cm_config: ContextManagerConfig, + max_steps: int = 20, + system_prompt: str = BENCHMARK_SYSTEM_PROMPT, +): + conversation_history = list(base_history) + results = [] + + shared_cm = None + if cm_config and cm_config.enabled: + shared_cm = ContextManager(config=cm_config, max_steps=max_steps) + + initial_tokens = estimate_tokens_text(history_to_text(conversation_history)) + + # Track per-step actual input tokens for accurate token reduction + step_input_tokens = [] + + for 
query in queries: + agent_run_info = build_agent_run_info_with_custom_prompt( + query, + system_prompt, + conversation_history, + max_steps=max_steps, + context_manager_config=cm_config, + ) + + if shared_cm is not None: + agent_run_info.context_manager = shared_cm + + result = await run_agent_with_tracking(agent_run_info, debug=False) + results.append(result) + + # Collect actual input token count from the last step metrics + if shared_cm is not None: + tc = shared_cm.get_token_counts() + step_input_tokens.append(tc) + + conversation_history.append(AgentHistory(role="user", content=query)) + conversation_history.append( + AgentHistory(role="assistant", content=result.final_answer) + ) + + final_tokens = estimate_tokens_text(history_to_text(conversation_history)) + + cm_stats = None + cm_token_counts = None + cm_summary = None + if shared_cm is not None: + cm_stats = shared_cm.get_all_compression_stats() + cm_token_counts = shared_cm.get_token_counts() + cm_summary = shared_cm.export_summary() + + return { + "results": results, + "conversation_history": conversation_history, + "shared_cm": shared_cm, + "initial_tokens": initial_tokens, + "final_tokens": final_tokens, + "cm_stats": cm_stats, + "cm_token_counts": cm_token_counts, + "cm_summary": cm_summary, + "step_input_tokens": step_input_tokens, + } + + +def build_precompressed_history( + frozen_history: list[AgentHistory], + cm_summary: dict, +) -> list[AgentHistory]: + """Build a pre-compressed history from the compression snapshot. + + Replaces the compressed prefix pairs with a single user message containing + the summary text, then appends the retained tail pairs verbatim. This + mirrors the actual message structure produced by compress_if_needed: + + SummaryTaskStep.to_messages() → [ChatMessage(role=USER, summary)] + followed by retained tail steps → [TaskStep, ActionStep, ...] 
+ + There is NO assistant message after the summary — the model sees the + summary as a user message, followed directly by the next retained step. + + Args: + frozen_history: The original uncompressed conversation history. + cm_summary: The export_summary() dict from the compressed run's + ContextManager, containing summary text and boundary info. + + Returns: + A new AgentHistory list that mirrors the compressed context structure. + """ + boundary = cm_summary.get("compression_boundary", {}) + compressed_pairs = boundary.get("previous_compressed_pairs", 0) + + # Each pair = 2 AgentHistory entries (user + assistant) + compressed_entries = compressed_pairs * 2 + + summary_text = cm_summary.get("previous_summary") or "" + + # If no compression happened, return original history unchanged + if not summary_text or compressed_entries == 0: + return list(frozen_history) + + # Build pre-compressed history: + # 1. Summary as a single USER message (matching SummaryTaskStep.to_messages) + # No paired assistant message — the model sees summary then next retained step + precompressed = [ + AgentHistory( + role="user", + content=f"Summary of earlier steps in this task:\n{summary_text}", + ), + ] + + # 2. Retained tail pairs (everything after the compressed prefix) + if compressed_entries < len(frozen_history): + precompressed.extend(frozen_history[compressed_entries:]) + + return precompressed + + +async def run_probe_questions( + probes: list[dict], + precompressed_history: list[AgentHistory], + max_steps: int = 20, + system_prompt: str = BENCHMARK_SYSTEM_PROMPT, +): + """Run probe questions against a pre-compressed history snapshot. + + Each probe runs independently with compression DISABLED, because the + history has already been pre-compressed (compressed prefix replaced with + summary text, retained tail kept verbatim). This avoids redundant LLM + compression calls — the compression was done once in the compressed run, + and all probes reuse that result. 
async def run_baseline_probes(
    probes: list[dict],
    frozen_history: list[AgentHistory],
    max_steps: int = 20,
    system_prompt: str = BENCHMARK_SYSTEM_PROMPT,
):
    """Ask every probe question against the full, uncompressed history.

    This is the baseline ("ceiling") side of the probe comparison:
    probe_retention = compressed_score / baseline_score.

    Args:
        probes: Probe definitions; each must carry a ``question`` key and is
            passed as-is to ``eval_text`` for scoring.
        frozen_history: History snapshot; each probe gets its own deep copy.
        max_steps: Step budget forwarded to the agent run builder.
        system_prompt: System prompt forwarded to the agent run builder.

    Returns:
        One dict per probe with ``question``, ``answer``, ``passed``,
        ``score`` and ``details``, in probe order.
    """
    # Compression is switched off so the agent sees the history unmodified.
    uncompressed_cfg = ContextManagerConfig(enabled=False, token_threshold=10**9)
    outcomes = []

    for spec in probes:
        prompt = spec["question"]
        run_info = build_agent_run_info_with_custom_prompt(
            prompt,
            system_prompt,
            copy.deepcopy(frozen_history),  # keep probes independent of each other
            max_steps=max_steps,
            context_manager_config=uncompressed_cfg,
        )

        run = await run_agent_with_tracking(run_info, debug=False)
        verdict = eval_text(run.final_answer, spec)

        outcomes.append(
            {
                "question": prompt,
                "answer": run.final_answer,
                "passed": verdict.passed,
                "score": verdict.score,
                "details": verdict.details,
            }
        )

    return outcomes
def eval_task_outputs(case: dict, run_outputs: list):
    """Score each turn-level task check against the answer from that turn.

    Args:
        case: Case definition. Its optional ``task_checks`` list holds check
            dicts, each with a 1-based ``turn`` number plus whatever fields
            ``eval_text`` reads.
        run_outputs: Per-turn run results in turn order; each must expose a
            ``final_answer`` attribute.

    Returns:
        A list of dicts (``turn``, ``answer``, ``passed``, ``score``,
        ``details``), one per check whose turn maps to an existing output.
        Checks that point outside ``run_outputs`` are skipped.
    """
    eval_results = []

    for check in case.get("task_checks", []):
        turn_idx = check["turn"] - 1
        # Reject turns below 1 as well as turns past the end: a negative
        # index would silently wrap around and score the wrong turn.
        if not 0 <= turn_idx < len(run_outputs):
            continue

        answer = run_outputs[turn_idx].final_answer
        r = eval_text(answer, check)

        eval_results.append(
            {
                "turn": check["turn"],
                "answer": answer,
                "passed": r.passed,
                "score": r.score,
                "details": r.details,
            }
        )

    return eval_results
+ + Each case directory contains: + - case.json: queries, probes, summary_checks, task_checks, compressed_config + - history.json: conversation history + + Args: + case_dir: Absolute or relative path to the case directory. + + Returns: + Report dict for this case. + """ + case_path = os.path.join(case_dir, "case.json") + with open(case_path, "r", encoding="utf-8") as f: + case = json.load(f) + + # Resolve history_file relative to the case directory; + # defaults to "history.json" in the same directory if not specified. + history_relpath = case.get("history_file", "history.json") + history_abspath = os.path.join(case_dir, history_relpath) + + base_history = parse_conversation_to_history(history_abspath) + + baseline_config = ContextManagerConfig( + enabled=False, + token_threshold=10**9, + keep_recent_pairs=1, + ) + + # P5: Allow per-case config override + compressed_config = _resolve_compressed_config(case, use_default_prompts=use_default_prompts) + + print(f"\n===== CASE: {case['id']} =====") + + baseline = await run_multi_turn_for_benchmark( + queries=case["queries"], + base_history=base_history, + cm_config=baseline_config, + ) + + compressed = await run_multi_turn_for_benchmark( + queries=case["queries"], + base_history=base_history, + cm_config=compressed_config, + ) + + baseline_task_eval = eval_task_outputs(case, baseline["results"]) + compressed_task_eval = eval_task_outputs(case, compressed["results"]) + # P1: Baseline probe — agent sees full uncompressed history + # Same frozen_history, but with compression disabled, so the agent sees + # the complete unmodified context. This establishes the ceiling for + # probe_retention = compressed_probe_score / baseline_probe_score. 
+ baseline_probe_eval = await run_baseline_probes( + probes=case["probes"], + frozen_history=compressed["conversation_history"], + max_steps=20, + ) + + # P0: Compressed probe — agent sees pre-compressed context + # Build the pre-compressed history ONCE using the summary from the + # compressed run's ContextManager, then run each probe independently + # against it with compression disabled. This avoids redundant LLM calls + # (compression was already done in the compressed multi-turn run). + precompressed_history = build_precompressed_history( + frozen_history=compressed["conversation_history"], + cm_summary=compressed["cm_summary"] or {}, + ) + compressed_probe_eval = await run_probe_questions( + probes=case["probes"], + precompressed_history=precompressed_history, + ) + + # P3: Summary inspection uses dedicated summary_checks, not probe must_contain + summary_inspection = [] + if compressed.get("cm_summary"): + summary_checks = case.get("summary_checks", []) + if summary_checks: + summary_inspection = eval_summary_inspection( + compressed["cm_summary"], summary_checks + ) + + baseline_task_score = sum(x["score"] for x in baseline_task_eval) / max( + len(baseline_task_eval), 1 + ) + + compressed_task_score = sum(x["score"] for x in compressed_task_eval) / max( + len(compressed_task_eval), 1 + ) + + baseline_probe_score = sum(x["score"] for x in baseline_probe_eval) / max( + len(baseline_probe_eval), 1 + ) + + compressed_probe_score = sum(x["score"] for x in compressed_probe_eval) / max( + len(compressed_probe_eval), 1 + ) + + summary_score = ( + sum(x["score"] for x in summary_inspection) / max(len(summary_inspection), 1) + if summary_inspection + else None + ) + + task_success_retention = ( + compressed_task_score / baseline_task_score + if baseline_task_score > 0 + else 0.0 + ) + + probe_retention = ( + compressed_probe_score / baseline_probe_score + if baseline_probe_score > 0 + else 0.0 + ) + + # P2: Token reduction from actual input token counts + # Use the 
last step's token counts (final compressed vs uncompressed state) + token_reduction = 0.0 + if compressed.get("step_input_tokens") and compressed["step_input_tokens"]: + last_tc = compressed["step_input_tokens"][-1] + if last_tc and last_tc.get("last_uncompressed") is not None: + unc = last_tc["last_uncompressed"] or 1 + comp = last_tc["last_compressed"] or 0 + if unc > 0: + token_reduction = 1 - comp / unc + # Fallback to text-based estimation + if token_reduction == 0.0: + token_reduction = 1 - ( + compressed["final_tokens"] / max(baseline["final_tokens"], 1) + ) + baseline_failed = baseline_task_score == 0 + + # Compute real main-LLM input token totals + baseline_real_input = sum(r.total_input_tokens for r in baseline["results"]) + compressed_real_input = sum(r.total_input_tokens for r in compressed["results"]) + + # Compression cost: tokens spent on compression LLM calls + compression_cost = 0 + if compressed.get("cm_stats"): + compression_cost = ( + compressed["cm_stats"].get("total_input_tokens", 0) + + compressed["cm_stats"].get("total_output_tokens", 0) + ) + + # Net token reduction = gross savings - compression cost + gross_input_savings = baseline_real_input - compressed_real_input + net_input_savings = gross_input_savings - compression_cost + net_token_reduction = ( + net_input_savings / max(baseline_real_input, 1) + if baseline_real_input > 0 + else 0.0 + ) + + report = { + "case_id": case["id"], + "baseline_failed": baseline_failed, + "baseline": { + "task_score": baseline_task_score, + "probe_score": baseline_probe_score, + "final_tokens": baseline["final_tokens"], + "real_input_tokens": baseline_real_input, + }, + "compressed": { + "task_score": compressed_task_score, + "probe_score": compressed_probe_score, + "final_tokens": compressed["final_tokens"], + "cm_stats": compressed["cm_stats"], + "cm_token_counts": compressed["cm_token_counts"], + "cm_summary": compressed["cm_summary"], + "real_input_tokens": compressed_real_input, + }, + "metrics": { + 
async def main(case_names: list[str] = None, use_default_prompts: bool = False):
    """Run benchmark cases, save one report per case, then an aggregate summary.

    Args:
        case_names: Names of case directories under ``./cases`` to run. When
            falsy, every ``./cases/*/case.json`` is discovered and the cases
            run in sorted path order.
        use_default_prompts: Forwarded unchanged to ``run_one_case``.

    Side effects:
        Creates ``./reports`` if needed, writes ``./reports/<case_id>.json``
        per case and ``./reports/summary.json`` at the end, and prints
        progress to stdout. Returns early, writing nothing, when no case
        directory is found.
    """
    if case_names:
        case_dirs = [os.path.join("./cases", name) for name in case_names]
    else:
        case_dirs = [
            os.path.dirname(p) for p in sorted(glob.glob("./cases/*/case.json"))
        ]

    if not case_dirs:
        print("No benchmark cases found under ./cases/*/case.json")
        return

    print(f"Found {len(case_dirs)} case(s): {[os.path.basename(d) for d in case_dirs]}")

    # Output directory for reports
    os.makedirs("./reports", exist_ok=True)

    reports = []
    for case_dir in case_dirs:
        report = await run_one_case(case_dir, use_default_prompts=use_default_prompts)
        reports.append(report)

        # Write the per-case report as soon as the case finishes.
        case_id = report["case_id"]
        per_case_path = os.path.join("./reports", f"{case_id}.json")
        with open(per_case_path, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=2, default=str)
        print(f"  Report saved to {per_case_path}")

    # Cases whose baseline itself failed would distort the retention
    # averages, so they are excluded from them (but still listed per case).
    valid_reports = [r for r in reports if not r.get("baseline_failed")]
    excluded_ids = [r["case_id"] for r in reports if r.get("baseline_failed")]
    if excluded_ids:
        print(f"\n  Excluded from average (baseline failed): {excluded_ids}")

    def _avg(metric: str) -> float:
        """Mean of one metric over the valid reports (0 when there are none)."""
        return sum(r["metrics"][metric] for r in valid_reports) / max(
            len(valid_reports), 1
        )

    summary = {
        "total_cases": len(reports),
        "excluded_cases": len(reports) - len(valid_reports),
        "metrics": {
            "avg_task_success_retention": _avg("task_success_retention"),
            "avg_probe_retention": _avg("probe_retention"),
            "avg_token_reduction": _avg("token_reduction"),
            "avg_net_token_reduction": _avg("net_token_reduction"),
            "avg_compression_cost_tokens": _avg("compression_cost_tokens"),
            "per_case": {r["case_id"]: r["metrics"] for r in reports},
        },
    }
    summary_path = "./reports/summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2, default=str)

    print(f"\nBenchmark finished. Summary saved to {summary_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run Agent Context Compression Benchmark")
    parser.add_argument(
        "--cases",
        nargs="+",
        default=None,
        # Adjacent literals are concatenated verbatim, so the separating
        # space has to be part of the first literal.
        help=(
            "Specific case names to run (e.g. --cases example_infra algotithm_data). "
            "If omitted, run all cases under ./cases/."
        ),
    )
    parser.add_argument(
        "--default-summary",
        action="store_true",
        default=False,
        help="Use the original ContextManager summary defaults instead of the benchmark-optimized "
             "custom schema (leaner 7-field, 800-word cap, merge-condense incremental updates).",
    )
    args = parser.parse_args()
    asyncio.run(main(case_names=args.cases, use_default_prompts=args.default_summary))
+ +Finds the project root by searching upward for a .git entry (directory +or file), then derives SDK_DIR and BACKEND_PATH from it. This makes +path setup resilient to file relocation within the project tree and to +git worktrees (which store a .git file rather than directory at root). +""" +import os +import sys + + +def _find_project_root(start: str = None) -> str: + """Walk upward from *start* until a .git entry is found. + + Accepts ``.git`` as either a directory (normal checkout) or a file + (git worktree, where ``.git`` is a pointer file to the gitdir). + """ + current = os.path.abspath(start or os.path.dirname(__file__)) + while True: + if os.path.exists(os.path.join(current, ".git")): + return current + parent = os.path.dirname(current) + if parent == current: + raise RuntimeError( + f"Could not find project root (.git) starting from {start or __file__}" + ) + current = parent + + +def setup_paths() -> dict: + """Resolve and inject project paths into sys.path. + + Returns a dict with resolved paths: + project_root, sdk_dir, backend_dir + + Adds the following to sys.path (idempotent): + - sdk_dir (for ``from nexent import ...``) + - project_root (for ``from backend.utils import ...``) + - backend_dir (for ``from utils.prompt_template_utils import ...``) + """ + project_root = _find_project_root() + sdk_dir = os.path.join(project_root, "sdk") + backend_dir = os.path.join(project_root, "backend") + + for p in (sdk_dir, project_root, backend_dir): + if p not in sys.path: + sys.path.insert(0, p) + + return { + "project_root": project_root, + "sdk_dir": sdk_dir, + "backend_dir": backend_dir, + } + + +# Convenience: resolve on import so callers can do `from paths import PROJECT_ROOT` +_resolved = setup_paths() +PROJECT_ROOT = _resolved["project_root"] +SDK_DIR = _resolved["sdk_dir"] +BACKEND_DIR = _resolved["backend_dir"] \ No newline at end of file diff --git a/sdk/benchmark/smoke.py b/sdk/benchmark/smoke.py new file mode 100644 index 000000000..7fcec8bcc --- 
async def main():
    """Run one short agent query end to end and fail loudly if it misbehaves.

    Builds a minimal run (no history, at most 3 steps, temperature 0), runs
    it through ``run_agent_with_tracking``, prints the answer and run
    counters, then checks the two pass criteria.

    Raises:
        AssertionError: If the final answer is empty or the run recorded
            errors.
    """
    query = "中华人民共和国的首都是哪里?请用一句话回答。"

    agent_run_info = build_agent_run_info(
        query=query,
        history=[],
        duty_prompt="你是一个简明扼要的助手。",
        constraint_prompt="只回答用户问题,不要展开。",
        max_steps=3,
        temperature=0.0,
        agent_name="smoke_agent",
        agent_description="Smoke test agent",
        language="zh",
        is_manager=False,
    )

    logger.info("Running agent on query: %s", query)
    result = await run_agent_with_tracking(agent_run_info, debug=False)

    # A missing answer must reach the explicit check below rather than
    # crash in len() with an unrelated TypeError.
    answer = result.final_answer or ""

    print("\n" + "=" * 60)
    print(f"final_answer ({len(answer)} chars):")
    print(answer)
    print("=" * 60)
    print(f"steps={result.step_count} msg_counts={result.message_type_count}")
    if result.errors:
        print(f"errors={result.errors}")
    print("=" * 60)

    # Raised explicitly (not via `assert`) so the checks still run under
    # `python -O`, which strips assert statements.
    if not answer:
        raise AssertionError("final_answer empty - smoke FAILED")
    if result.errors:
        raise AssertionError(f"errors during run: {result.errors}")
    print("\nSMOKE PASS")
Directory Structure + +``` +ctx_debugger/ +├── __init__.py # Package entry, re-export ContextDebugger / attach_debugger +├── __main__.py # Entry point for python -m ctx_debugger.inspector +├── debugger.py # Core: ContextDebugger, attach_debugger, layer proxies +├── interactive.py # Interactive REPL (main debugging mode) +├── inspector.py # Post-analysis CLI for trace files +├── langfuse_export.py # Import trace into Langfuse for visual analysis +├── example_with_benchmark.py# Attach debugger to benchmark batch run +└── README.md +``` + +Dependency direction: **ctx_debugger → only import nexent SDK**, nexent doesn't reverse-depend on this package. + +--- + +## 3. Prerequisites + +> Commands below assume you're in this directory (README's location `ctx_debugger/`). Relative path conventions: +> `.` = `ctx_debugger/`, `..` = `sdk/`, `../..` = nexent repo root directory +> (where `sdk/`, `backend/`, `.env` reside). + +- Use backend's venv Python (nexent SDK and dependencies installed): + ``` + ../../backend/.venv/bin/python + ``` +- LLM credentials in repo root `.env`, i.e., `../../.env` (`agent_runner` will `load_dotenv`): + ``` + LLM_API_KEY=... + LLM_MODEL_NAME=... + LLM_API_URL=... + ``` +- Trace output path controlled by environment variable `NEXENT_CONTEXT_DEBUG`, or explicitly pass `trace_path` in `attach_debugger`. + +--- + +## 4. Three Usage Modes + +### 4.1 Interactive REPL — Main Mode + +You type user messages line by line, each line triggers one real agent execution; history accumulates, `ContextManager` shared across turns, compression triggers naturally when threshold reached. + +```bash +# In ctx_debugger/ directory +../../backend/.venv/bin/python interactive.py +``` + +Each turn auto-displays agent answer + context construction panel (agent steps, main/compression LLM calls, compression triggered or not, token reduction, summary updated or not). 
+ +Panel token counts split into two types, labeled separately: `main LLM` / `compression LLM` rows with `(API)` are LLM-reported `token_usage`; `compression` row with `(est.)` is `ContextManager` heuristic estimation (`estimate_tokens_text`, CJK-aware, no real tokenizer). **Compression threshold judgment uses estimated value**, may differ from API measured (Chinese text heuristic usually overestimates). + +Slash commands: + +| Command | Purpose | +|---|---| +| `/help` | Command list | +| `/context [N]` | Last turn main LLM actually received context (compressed: system + summary + recent turns); `N` selects N-th main call | +| `/history` | Accumulated session raw ledger (each turn verbatim, pre-compression; REPL's own accounting, not what model sees) | +| `/summary` | Current compression summary full text | +| `/compress` | Last turn's compression LLM input prompt (fed in) and output summary (produced), separate from main answer | +| `/tokens` | Per-turn token timeline | +| `/stats` | Entire session compression statistics—key is "LLM-invoking semantic compression" cumulative count, plus cache hits, token cost | +| `/trace` | Last turn raw event table | +| `/step N` | Last turn step N all events JSON | +| `/config` | Current `ContextManagerConfig` | +| `/reset [threshold]` | Clear and restart, optional new threshold | +| `/quit` `/q` | Exit | + +Default `token_threshold=3000`, few turns trigger compression. + +Input line supports up/down arrow history recall (shell habit), history persisted in `~/.nexent_ctx_debugger_history`, retained across sessions. + +### 4.2 Batch Attach to Benchmark + +Without modifying benchmark code, monkey-patch `CoreAgent.__init__` so each agent auto-attaches debugger, entire benchmark run produces one trace. + +```bash +# In ctx_debugger/ directory +NEXENT_CONTEXT_DEBUG=/tmp/trace.jsonl \ + ../../backend/.venv/bin/python example_with_benchmark.py +``` + +### 4.3 Post-analysis of Trace Files + +```bash +# In parent sdk/ directory +cd .. 
+python -m ctx_debugger.inspector [options] +``` + +| Subcommand | Purpose | +|---|---| +| `summary` | Overview: event count, run count, token totals, event histogram | +| `runs` | List all runs | +| `timeline [--run X]` | Chronological event list | +| `compress` | All compression cycles' decisions and token reductions | +| `llm [--tag main|compression]` | LLM call list (duration, tokens) | +| `step --step N [--run X]` | One step's all events JSON | + +`--run` supports 8-char short suffix matching. + +### 4.4 Import to Langfuse for Visual Analysis + +Map trace into self-hosted [Langfuse](https://langfuse.com), get nested traces, per-call drill-down, token/duration views, session grouping—no need to build custom web UI. + +```bash +# In parent sdk/ directory +cd .. +# First dry run, see mapping structure (offline) +python -m ctx_debugger.langfuse_export --dry-run +# After configuring credentials, real import +LANGFUSE_HOST=http://localhost:3000 \ +LANGFUSE_PUBLIC_KEY=pk-... LANGFUSE_SECRET_KEY=sk-... \ + python -m ctx_debugger.langfuse_export +``` + +Mapping rules: + +| ctx_debugger | Langfuse | +|---|---| +| Each agent turn (`agent_init`) | One trace | +| `llm_call_*` | generation (input/output, tokens, duration) | +| `compress_*` | span, nested compression generations inside | +| `tool_call_*` / `code_execute_*` | tool / span observation | +| Entire trace file | One Langfuse session (turn grouping) | + +Depends on `langfuse` SDK (`uv pip install langfuse`). Self-hosted Langfuse can be started with official docker compose. **Known limitation**: Observations created at export time, single duration faithful, but absolute position on Langfuse timeline is export time, not original wall-clock time. + +--- + +## 5. Core API + +### `attach_debugger(target, ...)` + +Attach debugger to an agent or `ContextManager`. + +```python +from ctx_debugger import attach_debugger +from nexent.core.agents.agent_context import ContextManager + +cm = ContextManager(config=...) 
+attach_debugger(cm, trace_path="/tmp/run.jsonl")  # Attach the compression layer only
+# Or attach to the entire agent, which covers all five layers automatically
+attach_debugger(agent, trace_path="/tmp/run.jsonl")
+```
+
+Parameters:
+
+| Parameter | Description |
+|---|---|
+| `target` | A Nexent agent (CoreAgent/NexentAgent) or a `ContextManager` |
+| `trace_path` | Output JSONL path; falls back to the `NEXENT_CONTEXT_DEBUG` environment variable when empty |
+| `layers` | Subset of `{"compression","model","observer","tools","executor"}`; all layers are enabled by default |
+| `run_id` | Explicit run identifier; auto-generated when omitted |
+| `capture_full_summary` | Include the full summary text in compression events (default `True`) |
+| `capture_full_messages` | Also store the full message text of main LLM calls (default `False`); compression LLM calls always store the full text |
+| `append` | Append to an existing trace instead of overwriting it |
+| `existing` | Reuse an existing `ContextDebugger`, so that an interactive session spanning multiple turns shares the same trace and run_id |
+
+When no trace path can be resolved, `attach_debugger` returns `None` and wraps nothing (zero overhead).
+
+### Five Observation Layers
+
+| Layer | Attach Point | Capture |
+|---|---|---|
+| `compression` | `ContextManager.compress_if_needed` wrapper | Compression decision, compression call records, summary state before and after |
+| `model` | `agent.model` replaced with `_ModelProxy` | Input, output, tokens, and duration of each LLM call, tagged `main` or `compression` via a contextvar |
+| `observer` | `agent.observer.add_message` mirror | All of Nexent's own observer events |
+| `tools` | Instance-level wrapper around each `agent.tools[name].forward` | Per-tool arguments, return value, and duration |
+| `executor` | `agent.python_executor` replaced with `_PyExecutorProxy` | Full text of the executed Python code, its output, and duration |
+
+---
+
+## 6.
Trace Event Schema + +Each line is JSON, unified outer fields: + +```json +{ + "seq": 42, // Global monotonically increasing sequence number + "ts": 1778813372.87, // Unix timestamp + "run_id": "run_a70c9017", // One attach = one run + "agent_step": 1, // Current agent step number (from observer's step_count) + "event": "compress_end", + "data": { ... } // Event-specific fields +} +``` + +Event types: + +| Event | When emitted | Key data fields | +|---|---|---| +| `run_begin` | Debugger created | pid | +| `agent_init` | Attached to agent | system_prompt full text, tools list, cm config | +| `compress_begin` | `compress_if_needed` entry | `predicted_decision` (decision branch + compress_prev/curr), `estimated_tokens` | +| `compression_call` | Each compression call within step | call_type, cache_hit, in/out tokens | +| `compress_end` | `compress_if_needed` exit | `token_counts` (before/after), `summary_after`, `summary_changed` | +| `llm_call_begin` / `llm_call_end` | Each LLM call | `tag` (main/compression), input messages (compression calls each with full `text`), output (compression calls with `output_full`), tokens, duration | +| `code_execute_begin` / `code_execute_end` | Python executor execution | code full text, output, logs, duration | +| `tool_call_begin` / `tool_call_end` | Each tool call | tool name, args, return, duration | +| `observer_event` | Each Nexent observer message | process_type, content preview | +| `debug_error` | Debugger internal exception | phase, error (won't crash agent) | + +Text fields all bounded truncation (head N chars + `...[N chars elided]...` + tail M chars), +avoid trace file infinite growth. + +--- + +## 7. Design Principles + +1. **Zero SDK source modification**: Via monkey-patch wrapping + proxy objects, no changes to `nexent/`. +2. **Read-only public interface + few stable internal interfaces**: Underscore interfaces like `_step_local_log`, `_effective_*_tokens` are also used by benchmark, treated as de-facto stable. +3. 
**Five optional layers**: `layers` parameter narrows as needed, trace size controllable. +4. **Failure isolation**: Each attach point try/except兜底, single layer failure only emits `debug_error` event, won't crash agent. +5. **Reuse Nexent's own events**: `observer` layer directly mirrors `MessageObserver`, no reinventing wheel. +6. **No frontend pollution**: Observer tap modifies instance's `add_message`, original method still called, frontend stream unaffected. + +### Coupling Points with Nexent + +Debugger "simulates/eavesdrops" on Nexent behavior, thus soft coupling exists—if Nexent changes following interfaces, debugger must adapt (other changes auto-compatible): + +- `agent.model` / `agent.observer` / `agent.python_executor` / `agent.tools` renamed +- `tool.forward` method name changed +- `compress_if_needed` signature changed +- `observer.add_message` parameter order major change + +--- + +## 8. Known Limitations + +- **Main LLM calls default only store digest**: Compression LLM calls' input messages and output already stored verbatim in full (each message with `text`, output with `output_full`); Main LLM calls default still truncated digest, need full text pass `capture_full_messages=True` to `attach_debugger`. Interactive REPL already defaults this option on, so `/context` can see full text. +- **Trace file size unlimited**: Long session could be tens of MB; `inspector` currently one-time loads into memory. +- **Multi-agent nesting**: Each attach one run_id; interactive session uses `existing=` to reuse same debugger to unify run_id. +- **Interactive REPL requires real TTY**: Pipe feeding input works, but experience designed for interactive. \ No newline at end of file diff --git a/sdk/ctx_debugger/__init__.py b/sdk/ctx_debugger/__init__.py new file mode 100644 index 000000000..7b92ed8e3 --- /dev/null +++ b/sdk/ctx_debugger/__init__.py @@ -0,0 +1,21 @@ +"""External trace emitter for Nexent ContextManager. 
+ +Independent from Nexent runtime; only imports from nexent SDK. Zero changes +to SDK source code required. + +Quick start: + from ctx_debugger import attach_debugger + from nexent.core.agents.agent_context import ContextManager + + cm = ContextManager(config=...) + attach_debugger(cm, trace_path="/tmp/run.jsonl") + # run the agent normally; events land in /tmp/run.jsonl + +Or rely on the environment variable: + export NEXENT_CONTEXT_DEBUG=/tmp/run.jsonl + attach_debugger(cm) # path auto-resolved from env +""" + +from .debugger import ContextDebugger, attach_debugger + +__all__ = ["ContextDebugger", "attach_debugger"] diff --git a/sdk/ctx_debugger/__main__.py b/sdk/ctx_debugger/__main__.py new file mode 100644 index 000000000..dfd1cec71 --- /dev/null +++ b/sdk/ctx_debugger/__main__.py @@ -0,0 +1,5 @@ +"""Entry point so `python -m ctx_debugger.inspector ...` works.""" +from .inspector import main + +if __name__ == "__main__": + main() diff --git a/sdk/ctx_debugger/debugger.py b/sdk/ctx_debugger/debugger.py new file mode 100644 index 000000000..956e4ec80 --- /dev/null +++ b/sdk/ctx_debugger/debugger.py @@ -0,0 +1,722 @@ +"""External trace emitter for Nexent ContextManager and agent runtime. + +Wraps a Nexent agent (or a bare ContextManager) without modifying SDK source. +Layers can be selectively enabled: + + compression - wrap ContextManager.compress_if_needed (Phase 1) + model - wrap agent.model __call__ (tagged: main vs compression) + observer - tap agent.observer.add_message + tools - wrap each agent.tools[name].forward + executor - wrap agent.python_executor __call__ + +Events are written as JSONL to a trace file. SDK source is untouched; the +debugger only reads public APIs and a handful of de-facto-stable internals +(_step_local_log, _effective_*_tokens) that the benchmark already uses. 
+""" + +import contextvars +import json +import logging +import os +import threading +import time +import uuid +from typing import Any, Iterable, List, Optional, Set + +logger = logging.getLogger(__name__) + +# Set inside the compression wrapper so the model wrapper can tag calls. +_compression_active: contextvars.ContextVar[bool] = contextvars.ContextVar( + "compression_active", default=False +) + +DEFAULT_LAYERS: Set[str] = {"compression", "model", "observer", "tools", "executor"} + + +# ============================================================ +# Bounded serialization helpers +# ============================================================ + +def _truncate_text(s: Optional[str], head: int = 500, tail: int = 500) -> Optional[str]: + if s is None: + return None + if not isinstance(s, str): + s = str(s) + if len(s) <= head + tail + 50: + return s + return s[:head] + f"\n...[{len(s) - head - tail} chars elided]...\n" + s[-tail:] + + +def _messages_digest(messages: Any, full: bool = False) -> List[dict]: + out = [] + for m in messages or []: + role = getattr(m, "role", None) + if hasattr(role, "value"): + role = role.value + content = getattr(m, "content", None) + if isinstance(content, list): + text = "".join( + seg.get("text", "") if isinstance(seg, dict) else str(seg) + for seg in content + ) + else: + text = str(content) if content is not None else "" + entry = { + "role": str(role), + "chars": len(text), + "preview": _truncate_text(text, head=200, tail=200), + } + # full=True keeps the verbatim message text (no truncation), so the + # exact prompt is recoverable. Used for compression LLM calls. 
+ if full: + entry["text"] = text + out.append(entry) + return out + + +def _safe_repr(value: Any, head: int = 300, tail: int = 200) -> str: + try: + return _truncate_text(repr(value), head=head, tail=tail) + except Exception as exc: + return f"<unrepresentable {type(value).__name__}: {type(exc).__name__}>" + + +def _digest_call_args(args: tuple, kwargs: dict) -> dict: + return { + "args": [_safe_repr(a, head=200, tail=100) for a in args], + "kwargs": {k: _safe_repr(v, head=200, tail=100) for k, v in kwargs.items()}, + } + + +# ============================================================ +# Core debugger +# ============================================================ + +class ContextDebugger: + """Trace emitter. Writes events to a JSONL file.""" + + def __init__( + self, + trace_path: str, + run_id: Optional[str] = None, + capture_full_summary: bool = True, + capture_full_messages: bool = False, + append: bool = False, + ): + self.trace_path = os.path.abspath(trace_path) + self.run_id = run_id or f"run_{uuid.uuid4().hex[:8]}" + self.capture_full_summary = capture_full_summary + self.capture_full_messages = capture_full_messages + + self._lock = threading.Lock() + self._seq = 0 + self._current_step: Optional[int] = None # tracked via observer STEP_COUNT + self._compression_step_counter = 0 + self._prev_summary_cache: Optional[str] = None + self._curr_summary_cache: Optional[str] = None + + parent = os.path.dirname(self.trace_path) + if parent: + os.makedirs(parent, exist_ok=True) + if not append: + open(self.trace_path, "w", encoding="utf-8").close() + + self._emit( + "run_begin", + { + "capture_full_summary": capture_full_summary, + "capture_full_messages": capture_full_messages, + "pid": os.getpid(), + }, + ) + + def _emit(self, event: str, data: dict) -> None: + # The debugger must never crash the agent it observes: a failed trace + # write is swallowed rather than propagated.
+ try: + with self._lock: + self._seq += 1 + record = { + "seq": self._seq, + "ts": time.time(), + "run_id": self.run_id, + "agent_step": self._current_step, + "event": event, + "data": data, + } + line = json.dumps(record, ensure_ascii=False, default=str) + # errors="replace": lone surrogates (e.g. from text decoded + # with surrogateescape, such as some terminal stdin) cannot be + # UTF-8 encoded; replacing them keeps the write from raising. + with open(self.trace_path, "a", encoding="utf-8", + errors="replace") as f: + f.write(line + "\n") + except Exception: + pass + + # ------------------------------------------------------------ + # Compression-layer hooks (Phase 1) + # ------------------------------------------------------------ + + def on_compress_begin( + self, cm, memory, original_messages, current_run_start_idx + ) -> None: + self._compression_step_counter += 1 + + config_snapshot = { + "enabled": cm.config.enabled, + "token_threshold": cm.config.token_threshold, + "keep_recent_pairs": cm.config.keep_recent_pairs, + "keep_recent_steps": cm.config.keep_recent_steps, + } + + effective = prev_tokens = curr_tokens = None + try: + effective = cm._effective_tokens(memory, current_run_start_idx) + prev_steps = memory.steps[:current_run_start_idx] + curr_steps = memory.steps[current_run_start_idx:] + prev_tokens = cm._effective_prev_tokens(prev_steps) + curr_tokens = cm._effective_curr_tokens(curr_steps) + except Exception as exc: + self._emit("debug_error", {"phase": "compress_begin_est", "error": str(exc)}) + + predicted = self._predict_branch(cm.config, effective, prev_tokens, curr_tokens) + + summary_before = None + try: + summary_before = cm.export_summary() + self._prev_summary_cache = summary_before.get("previous_summary") + self._curr_summary_cache = summary_before.get("current_summary") + except Exception as exc: + self._emit("debug_error", {"phase": "compress_begin_summary", "error": str(exc)}) + + self._emit( + "compress_begin", + { + 
"compression_step": self._compression_step_counter, + "current_run_start_idx": current_run_start_idx, + "memory_step_count": len(memory.steps), + "original_messages": _messages_digest(original_messages), + "estimated_tokens": { + "effective": effective, + "prev": prev_tokens, + "curr": curr_tokens, + "threshold": cm.config.token_threshold, + }, + "config": config_snapshot, + "predicted_decision": predicted, + "summary_before": summary_before if self.capture_full_summary else None, + }, + ) + + def on_compress_end( + self, cm, result_messages, success: bool, error: Optional[str] = None + ) -> None: + if not success: + self._emit("compress_end", {"success": False, "error": error}) + return + + try: + records = list(getattr(cm, "_step_local_log", []) or []) + for i, rec in enumerate(records): + self._emit( + "compression_call", + { + "call_index": i, + "call_type": rec.call_type, + "cache_hit": rec.cache_hit, + "input_tokens": rec.input_tokens, + "output_tokens": rec.output_tokens, + "input_chars": rec.input_chars, + "output_chars": rec.output_chars, + "details": rec.details, + }, + ) + except Exception as exc: + self._emit("debug_error", {"phase": "compression_calls", "error": str(exc)}) + + step_stats = token_counts = summary_after = None + try: + step_stats = cm.get_step_compression_stats() + except Exception as exc: + self._emit("debug_error", {"phase": "step_stats", "error": str(exc)}) + try: + token_counts = cm.get_token_counts() + except Exception as exc: + self._emit("debug_error", {"phase": "token_counts", "error": str(exc)}) + try: + summary_after = cm.export_summary() + except Exception as exc: + self._emit("debug_error", {"phase": "end_summary", "error": str(exc)}) + + prev_after = (summary_after or {}).get("previous_summary") + curr_after = (summary_after or {}).get("current_summary") + summary_changed = { + "previous_changed": prev_after != self._prev_summary_cache, + "current_changed": curr_after != self._curr_summary_cache, + } + + self._emit( + 
"compress_end", + { + "success": True, + "result_messages": _messages_digest(result_messages), + "step_stats": step_stats, + "token_counts": token_counts, + "summary_after": summary_after if self.capture_full_summary else None, + "summary_changed": summary_changed, + }, + ) + + # ------------------------------------------------------------ + # Observer tap helper — also updates current_step + # ------------------------------------------------------------ + + def update_step_from_observer(self, process_type_value: str, content: Any) -> None: + """Track agent.step_number from observer STEP_COUNT events.""" + if process_type_value == "step_count": + try: + self._current_step = int(content) + except (ValueError, TypeError): + pass + + @staticmethod + def _predict_branch(config, effective, prev_tokens, curr_tokens) -> dict: + if not config.enabled: + return {"branch": "disabled"} + if effective is None: + return {"branch": "unknown_estimation_failed"} + threshold = config.token_threshold + if effective <= threshold: + return { + "branch": "stable_or_noop", + "effective": effective, + "threshold": threshold, + } + return { + "branch": "full_compression_path", + "compress_prev": (prev_tokens or 0) > threshold * 0.6, + "compress_curr": (curr_tokens or 0) > threshold * 0.4, + "prev_token_share": (prev_tokens or 0) / threshold if threshold else None, + "curr_token_share": (curr_tokens or 0) / threshold if threshold else None, + } + + +# ============================================================ +# Proxy objects (model, tool, python_executor) +# ============================================================ + +class _ModelProxy: + """Wraps a smolagents-compatible model object; logs every __call__. + + Forwards every other attribute to the underlying model so the agent + still sees the same interface. 
+ """ + + def __init__(self, real_model, debugger: ContextDebugger): + object.__setattr__(self, "_real", real_model) + object.__setattr__(self, "_debugger", debugger) + + def __call__(self, *args, **kwargs): + debugger: ContextDebugger = object.__getattribute__(self, "_debugger") + real = object.__getattribute__(self, "_real") + tag = "compression" if _compression_active.get() else "main" + # Compression calls are this tool's primary subject: always capture + # the verbatim prompt and output. Main calls follow + # capture_full_messages so the trace stays lean by default. + full = tag == "compression" or debugger.capture_full_messages + + # Extract messages from first arg (smolagents calling convention) + input_messages = args[0] if args else kwargs.get("messages") + debugger._emit( + "llm_call_begin", + { + "tag": tag, + "input_messages": _messages_digest(input_messages, full=full), + "stop_sequences": kwargs.get("stop_sequences"), + }, + ) + + start = time.time() + try: + result = real(*args, **kwargs) + elapsed_ms = int((time.time() - start) * 1000) + + output_content = getattr(result, "content", None) + output_text = ( + output_content + if isinstance(output_content, str) + else (str(output_content) if output_content is not None else "") + ) + token_usage = getattr(result, "token_usage", None) + end_data = { + "tag": tag, + "duration_ms": elapsed_ms, + "output_preview": _truncate_text(output_text, head=600, tail=400), + "output_chars": len(output_text), + "input_tokens": getattr(token_usage, "input_tokens", None) if token_usage else None, + "output_tokens": getattr(token_usage, "output_tokens", None) if token_usage else None, + } + # full=True keeps the verbatim output (no truncation), so the + # exact compression summary is recoverable. 
+ if full: + end_data["output_full"] = output_text + debugger._emit("llm_call_end", end_data) + return result + except Exception as exc: + elapsed_ms = int((time.time() - start) * 1000) + debugger._emit( + "llm_call_end", + { + "tag": tag, + "duration_ms": elapsed_ms, + "error": str(exc), + }, + ) + raise + + def __getattr__(self, name: str): + return getattr(object.__getattribute__(self, "_real"), name) + + def __setattr__(self, name: str, value: Any) -> None: + setattr(object.__getattribute__(self, "_real"), name, value) + + +class _PyExecutorProxy: + """Wraps python_executor; logs each code execution call.""" + + def __init__(self, real_executor, debugger: ContextDebugger): + object.__setattr__(self, "_real", real_executor) + object.__setattr__(self, "_debugger", debugger) + + def __call__(self, code, *args, **kwargs): + debugger: ContextDebugger = object.__getattribute__(self, "_debugger") + real = object.__getattribute__(self, "_real") + + code_str = code if isinstance(code, str) else str(code) + debugger._emit( + "code_execute_begin", + { + "code_preview": _truncate_text(code_str, head=400, tail=400), + "code_chars": len(code_str), + }, + ) + + start = time.time() + try: + result = real(code, *args, **kwargs) + elapsed_ms = int((time.time() - start) * 1000) + output = getattr(result, "output", None) + logs = getattr(result, "logs", None) + debugger._emit( + "code_execute_end", + { + "duration_ms": elapsed_ms, + "output_preview": _truncate_text( + str(output) if output is not None else "", + head=400, + tail=200, + ), + "logs_preview": _truncate_text( + str(logs) if logs is not None else "", + head=400, + tail=200, + ), + "is_final_answer": getattr(result, "is_final_answer", None), + }, + ) + return result + except Exception as exc: + elapsed_ms = int((time.time() - start) * 1000) + debugger._emit( + "code_execute_end", + { + "duration_ms": elapsed_ms, + "error": str(exc), + }, + ) + raise + + def __getattr__(self, name: str): + return 
getattr(object.__getattribute__(self, "_real"), name) + + def __setattr__(self, name: str, value: Any) -> None: + setattr(object.__getattribute__(self, "_real"), name, value) + + +# ============================================================ +# Attachment functions +# ============================================================ + +def _wrap_compress_if_needed(cm, debugger: ContextDebugger) -> None: + """Wrap cm.compress_if_needed with begin/end hooks + compression contextvar.""" + if getattr(cm, "_debugger", None) is debugger: + return # already wrapped by this debugger + original_compress = cm.compress_if_needed + + def wrapped(model, memory, original_messages, current_run_start_idx): + debugger.on_compress_begin(cm, memory, original_messages, current_run_start_idx) + token = _compression_active.set(True) + try: + result = original_compress( + model, memory, original_messages, current_run_start_idx + ) + debugger.on_compress_end(cm, result, success=True) + return result + except Exception as exc: + debugger.on_compress_end(cm, None, success=False, error=str(exc)) + raise + finally: + _compression_active.reset(token) + + cm.compress_if_needed = wrapped + cm._debugger = debugger + + +def _wrap_tool_forward(tool, name: str, debugger: ContextDebugger) -> None: + """Wrap a single tool's forward method on the instance. + + Tool.__call__ -> self.forward(...), so instance-level wrap of forward + intercepts every actual call without breaking isinstance checks. 
+ """ + original_forward = getattr(tool, "forward", None) + if original_forward is None: + return + + def wrapped_forward(*args, **kwargs): + debugger._emit( + "tool_call_begin", + {"tool": name, **_digest_call_args(args, kwargs)}, + ) + start = time.time() + try: + result = original_forward(*args, **kwargs) + elapsed_ms = int((time.time() - start) * 1000) + debugger._emit( + "tool_call_end", + { + "tool": name, + "duration_ms": elapsed_ms, + "return_preview": _safe_repr(result, head=400, tail=200), + "return_type": type(result).__name__, + }, + ) + return result + except Exception as exc: + elapsed_ms = int((time.time() - start) * 1000) + debugger._emit( + "tool_call_end", + { + "tool": name, + "duration_ms": elapsed_ms, + "error": str(exc), + "error_type": type(exc).__name__, + }, + ) + raise + + tool.forward = wrapped_forward + + +def _tap_observer(observer, debugger: ContextDebugger) -> None: + """Mirror every observer.add_message call into the debugger trace. + + Original add_message is still called, so the front-end stream is + untouched. + """ + if getattr(observer, "_debugger_tapped", False): + return + original_add_message = observer.add_message + + def wrapped_add_message(agent_name, process_type, content, **kwargs): + # All debugger-side work is guarded so the observed agent's + # add_message call always runs, even if trace emission fails. 
+ try: + pt_value = ( + process_type.value if hasattr(process_type, "value") + else str(process_type) + ) + debugger.update_step_from_observer(pt_value, content) + debugger._emit( + "observer_event", + { + "agent_name": agent_name, + "process_type": pt_value, + "content_preview": _truncate_text( + str(content) if content is not None else "", + head=600, + tail=300, + ), + "content_chars": len(str(content)) if content is not None else 0, + "extra_kwargs": list(kwargs.keys()) if kwargs else [], + }, + ) + except Exception: + pass + return original_add_message(agent_name, process_type, content, **kwargs) + + observer.add_message = wrapped_add_message + observer._debugger_tapped = True + + +def _snapshot_agent_static(agent, debugger: ContextDebugger) -> None: + """Emit a one-shot agent_init event with system prompt + tools + config.""" + agent_name = ( + getattr(agent, "name", None) + or getattr(agent, "agent_name", None) + or type(agent).__name__ + ) + system_prompt = getattr(agent, "system_prompt", None) + system_prompt_chars = len(system_prompt) if isinstance(system_prompt, str) else 0 + + tools_info: List[dict] = [] + tools = getattr(agent, "tools", None) or {} + for tname, tool in tools.items(): + tools_info.append( + { + "name": tname, + "description": _truncate_text( + getattr(tool, "description", None), head=400, tail=200 + ), + "inputs": getattr(tool, "inputs", None), + } + ) + + cm = getattr(agent, "context_manager", None) + cm_config = None + if cm is not None and getattr(cm, "config", None) is not None: + cfg = cm.config + cm_config = { + "enabled": getattr(cfg, "enabled", None), + "token_threshold": getattr(cfg, "token_threshold", None), + "keep_recent_pairs": getattr(cfg, "keep_recent_pairs", None), + "keep_recent_steps": getattr(cfg, "keep_recent_steps", None), + "max_observation_length": getattr(cfg, "max_observation_length", None), + } + + debugger._emit( + "agent_init", + { + "agent_name": agent_name, + "agent_class": type(agent).__name__, + 
"max_steps": getattr(agent, "max_steps", None), + "system_prompt": _truncate_text(system_prompt, head=2000, tail=500), + "system_prompt_chars": system_prompt_chars, + "tools": tools_info, + "context_manager_config": cm_config, + }, + ) + + +def _resolve_target(target) -> tuple: + """Return (agent, cm) given either an agent or a ContextManager.""" + if hasattr(target, "compress_if_needed"): + return None, target + cm = getattr(target, "context_manager", None) + return target, cm + + +def attach_debugger( + target, + trace_path: Optional[str] = None, + run_id: Optional[str] = None, + capture_full_summary: bool = True, + capture_full_messages: bool = False, + layers: Optional[Iterable[str]] = None, + append: bool = False, + existing: Optional[ContextDebugger] = None, +) -> Optional[ContextDebugger]: + """Attach the debugger to an agent or a ContextManager. + + Args: + target: Either a Nexent agent (CoreAgent/NexentAgent) or a ContextManager. + trace_path: Output JSONL path. Falls back to env var NEXENT_CONTEXT_DEBUG. + run_id: Optional explicit run id (auto-generated otherwise). + capture_full_summary: Include full summary text in compression events. + capture_full_messages: Also store verbatim message text for main LLM + calls. Compression LLM calls are always captured verbatim + regardless of this flag. + layers: Subset of {"compression", "model", "observer", "tools", "executor"}. + Default: all available layers. + append: Append to an existing trace file instead of truncating. + existing: Reuse this ContextDebugger instead of creating a new one. + Lets an interactive session share one trace/run_id across many + agent instances (one per conversation turn). + + Returns: + The ContextDebugger, or None if no trace path resolved. 
+ """ + agent, cm = _resolve_target(target) + enabled_layers = set(layers) if layers is not None else set(DEFAULT_LAYERS) + + if existing is not None: + debugger = existing + else: + resolved_path = trace_path or os.environ.get("NEXENT_CONTEXT_DEBUG") + if not resolved_path: + return None + debugger = ContextDebugger( + trace_path=resolved_path, + run_id=run_id, + capture_full_summary=capture_full_summary, + capture_full_messages=capture_full_messages, + append=append, + ) + + if agent is not None: + try: + _snapshot_agent_static(agent, debugger) + except Exception as exc: + debugger._emit("debug_error", {"phase": "agent_init", "error": str(exc)}) + + if cm is not None and "compression" in enabled_layers: + try: + _wrap_compress_if_needed(cm, debugger) + except Exception as exc: + debugger._emit("debug_error", {"phase": "wrap_compress", "error": str(exc)}) + + if agent is not None and "model" in enabled_layers: + model = getattr(agent, "model", None) + if model is not None and not isinstance(model, _ModelProxy): + try: + agent.model = _ModelProxy(model, debugger) + except Exception as exc: + debugger._emit("debug_error", {"phase": "wrap_model", "error": str(exc)}) + + if agent is not None and "observer" in enabled_layers: + observer = getattr(agent, "observer", None) + if observer is not None: + try: + _tap_observer(observer, debugger) + except Exception as exc: + debugger._emit("debug_error", {"phase": "tap_observer", "error": str(exc)}) + + if agent is not None and "tools" in enabled_layers: + tools = getattr(agent, "tools", None) or {} + for tname, tool in list(tools.items()): + try: + _wrap_tool_forward(tool, tname, debugger) + except Exception as exc: + debugger._emit( + "debug_error", + {"phase": "wrap_tool", "tool": tname, "error": str(exc)}, + ) + + if agent is not None and "executor" in enabled_layers: + executor = getattr(agent, "python_executor", None) + if executor is not None and not isinstance(executor, _PyExecutorProxy): + try: + 
agent.python_executor = _PyExecutorProxy(executor, debugger) + except Exception as exc: + debugger._emit( + "debug_error", {"phase": "wrap_executor", "error": str(exc)} + ) + + agent_or_cm = agent if agent is not None else cm + if agent_or_cm is not None: + try: + agent_or_cm._debugger = debugger + except Exception: + pass + + return debugger diff --git a/sdk/ctx_debugger/example_with_benchmark.py b/sdk/ctx_debugger/example_with_benchmark.py new file mode 100644 index 000000000..3f6c80f67 --- /dev/null +++ b/sdk/ctx_debugger/example_with_benchmark.py @@ -0,0 +1,90 @@ +"""Run the existing benchmark with ContextDebugger attached, full layers, +without touching benchmark or SDK source. + +Strategy: monkey-patch the smolagents agent class so every newly-created +agent auto-attaches a debugger after __init__. The compression-only entry +point (attaching to ContextManager directly) is no longer needed in this +example because attaching to the agent picks up the cm anyway. + +Run from this directory (sdk/ctx_debugger); ../../ is the nexent repo root: + ../../backend/.venv/bin/python example_with_benchmark.py + +Trace lands at $NEXENT_CONTEXT_DEBUG or /tmp/nexent_ctx_trace.jsonl by default. +""" + +import asyncio +import os +import sys + +HERE = os.path.dirname(os.path.abspath(__file__)) +SDK_DIR = os.path.dirname(HERE) +BENCHMARK_DIR = os.path.join(SDK_DIR, "benchmark") + +for p in (SDK_DIR, BENCHMARK_DIR): + if p not in sys.path: + sys.path.insert(0, p) + +TRACE_PATH = os.environ.get( + "NEXENT_CONTEXT_DEBUG", "/tmp/nexent_ctx_trace.jsonl" +) +os.environ["NEXENT_CONTEXT_DEBUG"] = TRACE_PATH + + +def _install_auto_attach(): + """Wrap CoreAgent.__init__ so every agent auto-attaches a debugger, AND + CoreAgent.__setattr__ so a later assignment of `context_manager` wires the + compression layer using the agent's existing debugger (single run_id). 
+ + This avoids the dual-patch fragmentation: a ContextManager assigned to an + agent that already has a debugger reuses that debugger's run_id, so + compress_* events and llm_call(tag=compression) events live in the same + run. + """ + from nexent.core.agents.core_agent import CoreAgent + from ctx_debugger import attach_debugger + from ctx_debugger.debugger import _wrap_compress_if_needed + import logging + log = logging.getLogger(__name__) + + original_agent_init = CoreAgent.__init__ + + def patched_agent_init(self, *args, **kwargs): + original_agent_init(self, *args, **kwargs) + try: + attach_debugger(self, append=True) + except Exception as exc: + log.warning("Agent auto-attach failed: %s", exc, exc_info=True) + + def patched_setattr(self, name, value): + object.__setattr__(self, name, value) + if ( + name == "context_manager" + and value is not None + and getattr(value.config, "enabled", False) + ): + existing_dbg = getattr(self, "_debugger", None) + if existing_dbg is None: + return + if getattr(value, "_debugger", None) is existing_dbg: + return + try: + _wrap_compress_if_needed(value, existing_dbg) + except Exception as exc: + log.warning("Compression layer attach failed: %s", exc, exc_info=True) + + CoreAgent.__init__ = patched_agent_init + CoreAgent.__setattr__ = patched_setattr + + +def main(): + _install_auto_attach() + + os.chdir(BENCHMARK_DIR) + from test_benchmark import main as bench_main + + asyncio.run(bench_main()) + print(f"\n[ctx_debugger] Trace written to: {TRACE_PATH}") + + +if __name__ == "__main__": + main() diff --git a/sdk/ctx_debugger/example_with_eventqa.py b/sdk/ctx_debugger/example_with_eventqa.py new file mode 100644 index 000000000..6f283cf78 --- /dev/null +++ b/sdk/ctx_debugger/example_with_eventqa.py @@ -0,0 +1,52 @@ +"""Run the EventQA benchmark with ContextDebugger attached (all layers). + +Same auto-attach strategy as example_with_benchmark.py, but targets the +EventQA runner (sdk/benchmark/eventqa_eval/run_eventqa.py). 
Every CLI argument +after the script name is forwarded straight to run_eventqa. + +Run from this directory (sdk/ctx_debugger); ../../ is the nexent repo root: + + NEXENT_CONTEXT_DEBUG=/tmp/eventqa_trace.jsonl \\ + ../../backend/.venv/bin/python example_with_eventqa.py \\ + --book_index 0 --limit 1 --max_ingest_chars 200000 + +The trace lands at $NEXENT_CONTEXT_DEBUG (default /tmp/nexent_eventqa_trace.jsonl). +Export it to Langfuse with: + python -m ctx_debugger.langfuse_export +""" + +import asyncio +import os +import sys + +HERE = os.path.dirname(os.path.abspath(__file__)) +SDK_DIR = os.path.dirname(HERE) +BENCHMARK_DIR = os.path.join(SDK_DIR, "benchmark") +EVENTQA_DIR = os.path.join(BENCHMARK_DIR, "eventqa_eval") + +for p in (SDK_DIR, BENCHMARK_DIR, EVENTQA_DIR): + if p not in sys.path: + sys.path.insert(0, p) + +TRACE_PATH = os.environ.get( + "NEXENT_CONTEXT_DEBUG", "/tmp/nexent_eventqa_trace.jsonl" +) +os.environ["NEXENT_CONTEXT_DEBUG"] = TRACE_PATH + +# Reuse the CoreAgent auto-attach monkey-patch from the sibling example. +from example_with_benchmark import _install_auto_attach + + +def main(): + _install_auto_attach() + + os.chdir(EVENTQA_DIR) + from run_eventqa import main as eventqa_main, _build_arg_parser + + args = _build_arg_parser().parse_args() + asyncio.run(eventqa_main(args)) + print(f"\n[ctx_debugger] Trace written to: {TRACE_PATH}") + + +if __name__ == "__main__": + main() diff --git a/sdk/ctx_debugger/inspector.py b/sdk/ctx_debugger/inspector.py new file mode 100644 index 000000000..4df2b86dc --- /dev/null +++ b/sdk/ctx_debugger/inspector.py @@ -0,0 +1,440 @@ +"""CLI inspector for ctx_debugger JSONL traces. 
+ +Usage: + python -m ctx_debugger.inspector summary + python -m ctx_debugger.inspector runs + python -m ctx_debugger.inspector timeline [--run RUN_ID] + python -m ctx_debugger.inspector compress + python -m ctx_debugger.inspector llm [--tag main|compression] + python -m ctx_debugger.inspector step --step N [--run RUN_ID] + +Requires `rich` (already a transitive dep of smolagents/Nexent). +""" + +import argparse +import json +import sys +from collections import Counter, defaultdict +from typing import Any, Dict, List, Optional + +try: + from rich import box + from rich.console import Console + from rich.panel import Panel + from rich.table import Table +except ImportError: + sys.stderr.write("ERROR: rich is required. pip install rich\n") + sys.exit(1) + + +class Trace: + """Indexed view over a JSONL trace file.""" + + def __init__(self, path: str): + self.path = path + self.events: List[dict] = [] + with open(path, encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + self.events.append(json.loads(line)) + + def runs(self) -> List[str]: + seen, order = set(), [] + for e in self.events: + r = e["run_id"] + if r not in seen: + seen.add(r) + order.append(r) + return order + + +# ============================================================ +# Per-event one-line detail formatters +# ============================================================ + +def _fmt_detail(event: str, d: dict) -> str: + if event == "agent_init": + return f"agent={d.get('agent_name')}, tools={len(d.get('tools', []))}" + if event == "observer_event": + pt = d.get("process_type", "") + cp = (d.get("content_preview") or "").replace("\n", " ")[:55] + return f"[{pt}] {cp}" + if event == "llm_call_begin": + return f"tag={d.get('tag')} msgs={len(d.get('input_messages', []))}" + if event == "llm_call_end": + if d.get("error"): + return f"tag={d.get('tag')} ERROR: {d['error'][:60]}" + return ( + f"tag={d.get('tag')} dur={d.get('duration_ms')}ms " + 
f"in={d.get('input_tokens')} out={d.get('output_tokens')}" + ) + if event == "compress_begin": + pd = d.get("predicted_decision", {}) + et = d.get("estimated_tokens", {}) + return ( + f"branch={pd.get('branch')} " + f"eff={et.get('effective')}/{et.get('threshold')} " + f"P={pd.get('compress_prev')} C={pd.get('compress_curr')}" + ) + if event == "compression_call": + return ( + f"type={d.get('call_type')} cache={d.get('cache_hit')} " + f"in={d.get('input_tokens')} out={d.get('output_tokens')}" + ) + if event == "compress_end": + tc = d.get("token_counts") or {} + sc = d.get("summary_changed") or {} + return ( + f"unc={tc.get('last_uncompressed')}→comp={tc.get('last_compressed')} " + f"prev_changed={sc.get('previous_changed')}" + ) + if event == "code_execute_begin": + return f"code_chars={d.get('code_chars')}" + if event == "code_execute_end": + return f"dur={d.get('duration_ms')}ms final_answer={d.get('is_final_answer')}" + if event == "tool_call_begin": + return f"tool={d.get('tool')}" + if event == "tool_call_end": + return f"tool={d.get('tool')} dur={d.get('duration_ms')}ms" + if event == "run_begin": + return f"pid={d.get('pid')}" + if event == "debug_error": + return f"phase={d.get('phase')}: {d.get('error')}" + return "" + + +# ============================================================ +# Commands +# ============================================================ + +def cmd_summary(trace: Trace, args) -> None: + console = Console() + events = Counter(e["event"] for e in trace.events) + + main_calls = [e for e in trace.events + if e["event"] == "llm_call_end" and e["data"].get("tag") == "main"] + comp_calls = [e for e in trace.events + if e["event"] == "llm_call_end" and e["data"].get("tag") == "compression"] + + def _sum(events_, key): + return sum((e["data"].get(key) or 0) for e in events_) + + t = Table(title=f"Trace Summary — {trace.path}", box=box.SIMPLE_HEAD) + t.add_column("Metric", style="cyan") + t.add_column("Value", justify="right") + 
t.add_row("Total events", str(len(trace.events))) + t.add_row("Total runs", str(len(trace.runs()))) + t.add_row("Compression cycles", str(events.get("compress_begin", 0))) + t.add_row("Main LLM calls", str(len(main_calls))) + t.add_row("Compression LLM calls", str(len(comp_calls))) + t.add_row( + "Main tokens (in / out)", + f"{_sum(main_calls, 'input_tokens'):,} / {_sum(main_calls, 'output_tokens'):,}", + ) + t.add_row( + "Compression tokens (in / out)", + f"{_sum(comp_calls, 'input_tokens'):,} / {_sum(comp_calls, 'output_tokens'):,}", + ) + t.add_row("Main LLM time", f"{_sum(main_calls, 'duration_ms')/1000:.1f}s") + t.add_row("Compression LLM time", f"{_sum(comp_calls, 'duration_ms')/1000:.1f}s") + if trace.events: + span = trace.events[-1]["ts"] - trace.events[0]["ts"] + t.add_row("Wall-clock span", f"{span:.1f}s") + console.print(t) + + # Event histogram + h = Table(title="Event histogram", box=box.SIMPLE) + h.add_column("Event") + h.add_column("Count", justify="right") + for ev, n in events.most_common(): + h.add_row(ev, str(n)) + console.print(h) + + +def cmd_runs(trace: Trace, args) -> None: + console = Console() + by_run: Dict[str, List[dict]] = defaultdict(list) + for e in trace.events: + by_run[e["run_id"]].append(e) + + t = Table(title="Runs", box=box.SIMPLE_HEAD) + t.add_column("Run ID") + t.add_column("Start ts", justify="right") + t.add_column("Events", justify="right") + t.add_column("Compress?", justify="center") + t.add_column("Agent?", justify="center") + t.add_column("Agent name") + + for run_id, evts in sorted(by_run.items(), key=lambda x: x[1][0]["ts"]): + has_compress = any(e["event"] == "compress_begin" for e in evts) + agent_init = next((e for e in evts if e["event"] == "agent_init"), None) + agent_name = (agent_init["data"].get("agent_name") if agent_init else "") or "" + t.add_row( + run_id, + f"{evts[0]['ts']:.2f}", + str(len(evts)), + "✓" if has_compress else "", + "✓" if agent_init else "", + agent_name, + ) + console.print(t) + + +def 
cmd_timeline(trace: Trace, args) -> None: + console = Console() + events = trace.events + if args.run: + events = [e for e in events if e["run_id"] == args.run] + + if not events: + console.print(f"[red]No events for run={args.run}[/]") + return + + title = f"Timeline {f'(run={args.run})' if args.run else '(all runs)'}" + t = Table(title=title, box=box.SIMPLE) + t.add_column("seq", justify="right") + t.add_column("ts", justify="right") + t.add_column("step", justify="right") + if not args.run: + t.add_column("run") + t.add_column("event", style="cyan") + t.add_column("detail") + + for e in events: + detail = _fmt_detail(e["event"], e["data"]) + row = [ + str(e["seq"]), + f"{e['ts']:.1f}", + str(e.get("agent_step") if e.get("agent_step") is not None else "-"), + ] + if not args.run: + row.append(e["run_id"][-8:]) + row.append(e["event"]) + row.append(detail) + t.add_row(*row) + console.print(t) + + +def cmd_compress(trace: Trace, args) -> None: + """Group events into begin → compression_calls → end cycles.""" + console = Console() + by_run: Dict[str, List[dict]] = defaultdict(list) + for e in trace.events: + by_run[e["run_id"]].append(e) + + t = Table(title="Compression Cycles", box=box.SIMPLE_HEAD) + t.add_column("Run") + t.add_column("Begin seq", justify="right") + t.add_column("Step", justify="right") + t.add_column("Branch") + t.add_column("PC") + t.add_column("Eff/Thr", justify="right") + t.add_column("Calls", justify="right") + t.add_column("Cache hits", justify="right") + t.add_column("LLM in→out", justify="right") + t.add_column("Unc→Comp", justify="right") + t.add_column("Δ tok %", justify="right") + + any_row = False + for run_id, evts in by_run.items(): + i = 0 + while i < len(evts): + e = evts[i] + if e["event"] != "compress_begin": + i += 1 + continue + j = i + 1 + calls, end = [], None + while j < len(evts): + if evts[j]["event"] == "compression_call": + calls.append(evts[j]) + elif evts[j]["event"] == "compress_end": + end = evts[j] + break + j += 1 + 
+ pd = e["data"].get("predicted_decision") or {} + et = e["data"].get("estimated_tokens") or {} + tc = (end["data"].get("token_counts") if end else {}) or {} + unc, comp = tc.get("last_uncompressed"), tc.get("last_compressed") + # Signed delta: negative = shrank, positive = grew. + ratio = "" + if unc and comp: + ratio = f"{(comp - unc) / unc * 100:+.0f}%" + + llm_io = "" + if calls: + in_sum = sum((c["data"].get("input_tokens") or 0) for c in calls) + out_sum = sum((c["data"].get("output_tokens") or 0) for c in calls) + llm_io = f"{in_sum}→{out_sum}" + + cache_hits = sum(1 for c in calls if c["data"].get("cache_hit")) + pc_flag = ( + ("P" if pd.get("compress_prev") else "-") + + ("C" if pd.get("compress_curr") else "-") + ) + + t.add_row( + run_id[-8:], + str(e["seq"]), + str(e.get("agent_step") or "-"), + pd.get("branch", "?"), + pc_flag, + f"{et.get('effective')}/{et.get('threshold')}", + str(len(calls)), + str(cache_hits), + llm_io or "-", + f"{unc}→{comp}" if unc else "-", + ratio, + ) + any_row = True + i = j + 1 if end else j + + if not any_row: + console.print("[yellow]No compression cycles in this trace.[/]") + return + console.print(t) + + +def cmd_llm(trace: Trace, args) -> None: + console = Console() + pending: Dict[str, dict] = {} + rows = [] + for e in trace.events: + run = e["run_id"] + if e["event"] == "llm_call_begin": + pending[run] = e + elif e["event"] == "llm_call_end": + begin = pending.pop(run, None) + tag = e["data"].get("tag", "?") + if args.tag and tag != args.tag: + continue + rows.append((begin, e)) + + t = Table(title=f"LLM Calls {f'(tag={args.tag})' if args.tag else ''}", + box=box.SIMPLE_HEAD) + t.add_column("Run") + t.add_column("Step", justify="right") + t.add_column("Seq", justify="right") + t.add_column("Tag") + t.add_column("Dur(ms)", justify="right") + t.add_column("In tok", justify="right") + t.add_column("Out tok", justify="right") + t.add_column("Input head", overflow="ellipsis", max_width=40) + t.add_column("Output head", 
overflow="ellipsis", max_width=40) + + for begin, end in rows: + in_msgs = (begin["data"].get("input_messages") or []) if begin else [] + first = (in_msgs[0]["preview"] if in_msgs else "").replace("\n", " ")[:40] + last_user = "" + for m in reversed(in_msgs): + if m.get("role") == "user": + last_user = (m.get("preview") or "").replace("\n", " ")[:40] + break + out = (end["data"].get("output_preview") or end["data"].get("error") or "") + out = out.replace("\n", " ")[:40] + d = end["data"] + t.add_row( + end["run_id"][-8:], + str(end.get("agent_step") or "-"), + str(end["seq"]), + d.get("tag", "?"), + str(d.get("duration_ms") or "-"), + str(d.get("input_tokens") or "-"), + str(d.get("output_tokens") or "-"), + last_user or first, + out, + ) + console.print(t) + + +def cmd_step(trace: Trace, args) -> None: + console = Console() + events = trace.events + if args.run: + events = [e for e in events if e["run_id"] == args.run] + events = [e for e in events if e.get("agent_step") == args.step] + + if not events: + console.print( + f"[red]No events match step={args.step}" + f"{' run=' + args.run if args.run else ''}[/]" + ) + return + + for e in events: + title = ( + f"seq={e['seq']} | {e['event']} | " + f"run={e['run_id'][-8:]} | step={e.get('agent_step')}" + ) + content = json.dumps(e["data"], ensure_ascii=False, indent=2) + if len(content) > 3500: + content = content[:3500] + f"\n...[{len(content) - 3500} chars elided]..." 
+ console.print(Panel(content, title=title, expand=False, border_style="cyan")) + + +# ============================================================ +# Argparse +# ============================================================ + +def _build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="ctx-inspect", + description="Inspect a ctx_debugger JSONL trace.", + ) + sub = p.add_subparsers(dest="cmd", required=True) + + s = sub.add_parser("summary", help="Overall stats and event histogram.") + s.add_argument("trace") + + r = sub.add_parser("runs", help="List runs in the trace.") + r.add_argument("trace") + + tl = sub.add_parser("timeline", help="Chronological event list.") + tl.add_argument("trace") + tl.add_argument("--run", help="Filter to one run_id (suffix match supported below).") + + c = sub.add_parser("compress", help="All compression cycles with stats.") + c.add_argument("trace") + + l = sub.add_parser("llm", help="LLM calls with durations and tokens.") + l.add_argument("trace") + l.add_argument("--tag", choices=["main", "compression"]) + + st = sub.add_parser("step", help="Dump every event for one agent step.") + st.add_argument("trace") + st.add_argument("--step", type=int, required=True) + st.add_argument("--run") + + return p + + +def main() -> None: + args = _build_parser().parse_args() + trace = Trace(args.trace) + + # Allow --run to match by suffix (8-char short form) + if getattr(args, "run", None): + runs = trace.runs() + if args.run not in runs: + matches = [r for r in runs if r.endswith(args.run)] + if len(matches) == 1: + args.run = matches[0] + elif len(matches) > 1: + print(f"Ambiguous --run {args.run}: matches {matches}", file=sys.stderr) + sys.exit(2) + + { + "summary": cmd_summary, + "runs": cmd_runs, + "timeline": cmd_timeline, + "compress": cmd_compress, + "llm": cmd_llm, + "step": cmd_step, + }[args.cmd](trace, args) + + +if __name__ == "__main__": + main() diff --git a/sdk/ctx_debugger/interactive.py 
b/sdk/ctx_debugger/interactive.py new file mode 100644 index 000000000..50ab53c83 --- /dev/null +++ b/sdk/ctx_debugger/interactive.py @@ -0,0 +1,772 @@ +"""Interactive context debugger REPL. + +Type user messages one at a time. Each line runs one agent turn against an +accumulating conversation history with a shared ContextManager, so compression +triggers naturally as the history grows. After every turn a debug panel shows +how the context was built and compressed. + +Run from this directory (sdk/ctx_debugger); ../../ is the nexent repo root: + ../../backend/.venv/bin/python interactive.py + +Slash commands: + /help list commands + /context [N] context the main LLM received last turn (post-compression) + /history raw accumulated session ledger (pre-compression) + /summary current compression summary (full text) + /compress last turn's compression LLM input prompt + output summary + /tokens per-turn token timeline + /stats session-wide compression stats (LLM compression call count) + /trace [N] raw trace events from the last N turns (default 1) + /step N dump every event of agent step N in the last turn + /config show ContextManagerConfig + /reset [threshold] clear history + compression state (optional new threshold) + /quit /q exit +""" + +import asyncio +import contextlib +import io +import json +import os +import sys +from collections import Counter + +try: + # Importing readline transparently gives input() shell-style line editing + # and up/down-arrow history recall. 
+ import readline +except ImportError: # pragma: no cover - readline is stdlib on Linux/macOS + readline = None + +HERE = os.path.dirname(os.path.abspath(__file__)) +SDK_DIR = os.path.dirname(HERE) +BENCHMARK_DIR = os.path.join(SDK_DIR, "benchmark") +for _p in (SDK_DIR, BENCHMARK_DIR): + if _p not in sys.path: + sys.path.insert(0, _p) + +from rich import box +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from agent_runner import build_agent_run_info, run_agent_with_tracking + +# agent_runner rebinds sys.stdout to a UTF-8 TextIOWrapper over the same +# terminal buffer. Use that wrapper for our console. Do NOT restore the +# previous stdout: restoring would orphan the wrapper, and closing it on GC +# would close the shared underlying buffer, breaking output entirely. +_OUT = sys.stdout + +from nexent.core.agents.agent_context import ContextManager, ContextManagerConfig +from nexent.core.agents.agent_model import AgentHistory +from nexent.core.utils.token_estimation import estimate_tokens_text + +from ctx_debugger import ContextDebugger, attach_debugger + +TRACE_PATH = os.environ.get("NEXENT_CONTEXT_DEBUG", "/tmp/nexent_ctx_interactive.jsonl") +# Shell-style persistent command history, kept across sessions like ~/.bash_history. +HISTORY_FILE = os.path.expanduser("~/.nexent_ctx_debugger_history") +# readline needs non-printing escape sequences wrapped in \001..\002 so it +# measures the prompt width correctly when redrawing on history navigation. +_PROMPT = "\n\001\033[1;36m\002you>\001\033[0m\002 " +console = Console(file=_OUT) + + +def _sum(events, key): + return sum((e["data"].get(key) or 0) for e in events) + + +def _strip_surrogates(s): + """Drop lone surrogate code points from a string. + + Terminal line-editing of multi-byte characters (e.g. backspacing over + CJK input in WSL / some terminals) can leave half a UTF-8 sequence, + which stdin decodes via surrogateescape into lone surrogates. 
Those + cannot be UTF-8 encoded and crash both the agent and the trace writer. + """ + if not isinstance(s, str): + return s + return s.encode("utf-8", errors="ignore").decode("utf-8") + + +def _clean_input(raw): + """Sanitize a REPL input line; warn the user if anything was removed.""" + cleaned = _strip_surrogates(raw) + if cleaned != raw: + console.print( + "[yellow]·[/] [dim]removed invalid characters from your input " + "(terminal line-editing artifact — retype if it looks wrong)[/]" + ) + return cleaned + + +def _init_history(): + """Load shell-style command history (up/down-arrow recall) from disk.""" + if readline is None: + return + try: + readline.read_history_file(HISTORY_FILE) + except (FileNotFoundError, OSError): + pass + readline.set_history_length(2000) + + +def _save_history(): + """Persist command history so it survives across sessions, like a shell.""" + if readline is None: + return + try: + readline.write_history_file(HISTORY_FILE) + except OSError: + pass + + +class Session: + """One interactive debugging session: shared cm + debugger + history.""" + + def __init__(self, token_threshold=3000, keep_recent_pairs=1, + keep_recent_steps=4, max_steps=5): + self.max_steps = max_steps + self.cm_config = ContextManagerConfig( + enabled=True, + token_threshold=token_threshold, + keep_recent_pairs=keep_recent_pairs, + keep_recent_steps=keep_recent_steps, + ) + self.history = [] # list[AgentHistory] + self.turn = 0 + self.turn_tokens = [] # list of dict per turn + self.last_turn_events = [] # events of the most recent turn + self._last_seq = 0 + + self.shared_cm = ContextManager(config=self.cm_config, max_steps=max_steps) + # capture_full_messages=True so /context can show the verbatim + # post-compression context the main LLM received, not just a digest. + self.debugger = ContextDebugger( + trace_path=TRACE_PATH, capture_full_messages=True) + + # Wrap the shared cm's compression layer once, up front. 
+ attach_debugger(self.shared_cm, existing=self.debugger, layers={"compression"}) + self._install_agent_patch() + + def _install_agent_patch(self): + """Patch CoreAgent.__init__ so each turn's fresh agent wires its + model/observer/tools/executor layers onto this session's debugger.""" + from nexent.core.agents.core_agent import CoreAgent + + dbg = self.debugger + if getattr(CoreAgent, "_ctxdbg_orig_init", None) is None: + CoreAgent._ctxdbg_orig_init = CoreAgent.__init__ + + orig_init = CoreAgent._ctxdbg_orig_init + + def patched_init(agent_self, *args, **kwargs): + orig_init(agent_self, *args, **kwargs) + try: + attach_debugger( + agent_self, + existing=dbg, + layers={"model", "observer", "tools", "executor"}, + ) + except Exception as exc: + console.print(f"[yellow]layer attach failed: {exc}[/]") + + CoreAgent.__init__ = patched_init + + async def _run_turn_async(self, user_msg): + info = build_agent_run_info( + user_msg, + list(self.history), + max_steps=self.max_steps, + context_manager_config=self.cm_config, + ) + info.context_manager = self.shared_cm + buf = io.StringIO() + with contextlib.redirect_stdout(buf): + result = await run_agent_with_tracking(info) + return result + + def run_turn(self, user_msg): + self.turn += 1 + # Defense in depth: keep the conversation history surrogate-free so a + # single bad turn cannot poison every later replay. 
+ user_msg = _strip_surrogates(user_msg) + result = asyncio.run(self._run_turn_async(user_msg)) + result.final_answer = _strip_surrogates(result.final_answer or "") + self.history.append(AgentHistory(role="user", content=user_msg)) + self.history.append(AgentHistory(role="assistant", content=result.final_answer)) + self.last_turn_events = self._drain_events() + self._record_tokens() + return result + + def _drain_events(self): + events = [] + try: + with open(TRACE_PATH, encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + e = json.loads(line) + if e["seq"] > self._last_seq: + events.append(e) + except FileNotFoundError: + return [] + if events: + self._last_seq = max(e["seq"] for e in events) + return events + + def _record_tokens(self): + evs = self.last_turn_events + main = [e for e in evs if e["event"] == "llm_call_end" + and e["data"].get("tag") == "main"] + comp = [e for e in evs if e["event"] == "llm_call_end" + and e["data"].get("tag") == "compression"] + self.turn_tokens.append({ + "turn": self.turn, + "main_in": _sum(main, "input_tokens"), + "main_out": _sum(main, "output_tokens"), + "comp_in": _sum(comp, "input_tokens"), + "comp_out": _sum(comp, "output_tokens"), + }) + + +# ============================================================ +# Rendering +# ============================================================ + +def render_turn(session, result, events): + answer = result.final_answer or "(no answer)" + console.print(Panel( + answer.strip(), + title=f"Turn {session.turn} · agent final answer (main LLM)", + border_style="green", + expand=False, + )) + + main = [e for e in events if e["event"] == "llm_call_end" + and e["data"].get("tag") == "main"] + comp = [e for e in events if e["event"] == "llm_call_end" + and e["data"].get("tag") == "compression"] + steps = [e for e in events if e["event"] == "observer_event" + and e["data"].get("process_type") == "step_count"] + cbegins = [e for e in events if e["event"] == 
"compress_begin"] + cends = [e for e in events if e["event"] == "compress_end"] + tools = [e for e in events if e["event"] == "tool_call_end"] + code = [e for e in events if e["event"] == "code_execute_end"] + + t = Table(box=box.SIMPLE, show_header=False) + t.add_column("k", style="cyan", no_wrap=True) + t.add_column("v") + + t.add_row("agent steps", str(len(steps))) + if main: + t.add_row( + "main LLM", + f"×{len(main)} {_sum(main,'input_tokens')}→{_sum(main,'output_tokens')} tok" + f" {_sum(main,'duration_ms')/1000:.1f}s [dim](API)[/]", + ) + if comp: + t.add_row( + "compression LLM", + f"×{len(comp)} {_sum(comp,'input_tokens')}→{_sum(comp,'output_tokens')} tok" + f" {_sum(comp,'duration_ms')/1000:.1f}s [dim](API)[/]", + ) + t.add_row( + "", + "[dim]↳ separate LLM call (not the answer above) — " + "/compress shows its prompt + summary[/]", + ) + + if cbegins: + for cb, ce in zip(cbegins, cends): + pd = cb["data"].get("predicted_decision") or {} + tc = ce["data"].get("token_counts") or {} + unc, cmp_ = tc.get("last_uncompressed"), tc.get("last_compressed") + # Signed delta: negative = shrank, positive = grew. Compression + # can grow the count when a regenerated summary plus the retained + # recent steps outweigh the original slice. 
+ ratio = f" ({(cmp_-unc)/unc*100:+.0f}%)" if unc and cmp_ else "" + sc = ce["data"].get("summary_changed") or {} + changed = [] + if sc.get("previous_changed"): + changed.append("previous") + if sc.get("current_changed"): + changed.append("current") + t.add_row( + "compression", + f"[bold]TRIGGERED[/] branch={pd.get('branch')} " + f"{unc}→{cmp_} tok{ratio} [dim](est.)[/]", + ) + if changed: + t.add_row("", f"summary updated: {', '.join(changed)}") + else: + t.add_row("compression", "[dim]not triggered[/]") + + if code: + t.add_row("code exec", f"×{len(code)}") + if tools: + names = ", ".join(e["data"].get("tool", "?") for e in tools) + t.add_row("tool calls", names) + + errors = [e for e in events if e["event"] == "debug_error"] + if errors: + t.add_row("debug errors", f"[red]{len(errors)}[/] (see /trace)") + + console.print(Panel(t, title="context construction", border_style="blue", + expand=False)) + + +# ============================================================ +# Slash commands +# ============================================================ + +def _print_config(session): + c = session.cm_config + t = Table(box=box.SIMPLE, show_header=False) + t.add_column("k", style="cyan") + t.add_column("v") + t.add_row("token_threshold", str(c.token_threshold)) + t.add_row("keep_recent_pairs", str(c.keep_recent_pairs)) + t.add_row("keep_recent_steps", str(c.keep_recent_steps)) + t.add_row("max_steps", str(session.max_steps)) + t.add_row("trace file", TRACE_PATH) + console.print(Panel(t, title="ContextManagerConfig", border_style="dim", + expand=False)) + + +def _cmd_history(session): + """Raw accumulated session ledger — every user message and final answer, + verbatim, never compressed. This is the REPL's bookkeeping (the input to + the next turn), NOT what the model sees. 
See /context for that.""" + if not session.history: + console.print("[dim](no history yet)[/]") + return + t = Table(box=box.SIMPLE) + t.add_column("#", justify="right") + t.add_column("role", style="cyan") + t.add_column("content") + for i, h in enumerate(session.history): + content = h.content if isinstance(h.content, str) else str(h.content) + if len(content) > 200: + content = content[:200] + f" …[+{len(content)-200} chars]" + t.add_row(str(i), h.role, content.replace("\n", " ")) + console.print(Panel( + t, + title=f"Session ledger — pre-compression ({len(session.history)} msgs)", + border_style="blue", expand=False, + )) + + +def _is_summary_msg(text): + """Nexent injects the compression summary as a user message with this + marker prefix. Used to flag the compressed slice in /context.""" + return isinstance(text, str) and text.startswith("Summary of earlier steps") + + +def _cmd_context(session, arg=None): + """Show what the main LLM actually received last turn — the + post-compression context (system prompt + summary + recent turns), not + the raw session ledger (see /history for that).""" + evs = session.last_turn_events + if not evs: + console.print("[dim](no events from last turn — run a turn first)[/]") + return + # Pair main-tagged LLM calls (begin -> end) in chronological order. 
+ pairs = [] + pending = None + for e in evs: + if e["event"] == "llm_call_begin" and e["data"].get("tag") == "main": + pending = e + elif e["event"] == "llm_call_end" and e["data"].get("tag") == "main": + pairs.append((pending, e)) + pending = None + if pending is not None: + pairs.append((pending, None)) + if not pairs: + console.print("[dim](no main LLM call in the last turn)[/]") + return + + idx = 1 + if arg: + try: + idx = int(arg) + except ValueError: + console.print("[red]usage: /context [N] (N = which main LLM call)[/]") + return + if not (1 <= idx <= len(pairs)): + console.print( + f"[red]turn made {len(pairs)} main LLM call(s); pick 1..{len(pairs)}[/]") + return + + begin, end = pairs[idx - 1] + msgs = (begin["data"].get("input_messages")) or [] + has_summary = False + est_total = 0 + + t = Table(box=box.SIMPLE) + t.add_column("#", justify="right") + t.add_column("role", style="cyan", no_wrap=True) + t.add_column("tokens", justify="right") + t.add_column("content") + for i, m in enumerate(msgs): + body = m.get("text") or m.get("preview") or "" + # estimate_tokens_text is Nexent's own estimator — same primitive the + # ContextManager uses, so these line up with the threshold logic. + toks = estimate_tokens_text(body) + est_total += toks + role = m.get("role", "?") + is_summary = _is_summary_msg(body) + if is_summary: + has_summary = True + role = "user · summary" + flat = body.replace("\n", " ") + if len(flat) > 280: + flat = flat[:280] + f" …[+{len(flat)-280} chars]" + t.add_row( + str(i), role, str(toks), + f"[yellow]{flat}[/]" if is_summary else flat, + ) + + title = (f"Context fed to main LLM — turn {session.turn}, " + f"call {idx}/{len(pairs)} ({len(msgs)} msgs" + f"{', incl. 
compression summary' if has_summary else ''})") + console.print(Panel(t, title=title, border_style="blue", expand=False)) + + real_in = end["data"].get("input_tokens") if end else None + footer = f"[dim]· ~{est_total} tokens estimated" + if real_in: + footer += f" · {real_in} reported by the API" + console.print(footer + "[/]") + if has_summary: + console.print( + "[dim]· the [yellow]summary[/] row replaced earlier turns — " + "/summary for its full text, /history for the raw ledger[/]") + else: + console.print( + "[dim]· no summary yet — model still sees the full history " + "verbatim (compression has not collapsed anything)[/]") + # These rows are the INPUT to the call. The model's reply is the call's + # output (the agent answer panel), not a context message — so the table + # ending at the user's question is correct, nothing is missing. + out_chars = end["data"].get("output_chars") if end else None + reply_note = f" ({out_chars} chars)" if out_chars else "" + console.print( + f"[dim]· these are the INPUT to the call; the model's reply{reply_note} " + f"is its output — see the agent answer panel above[/]") + if len(pairs) > 1: + console.print( + f"[dim]· turn made {len(pairs)} main LLM calls (one per step); " + f"/context N for call N[/]") + + +def _cmd_summary(session): + s = session.shared_cm.export_summary() + prev = s.get("previous_summary") + curr = s.get("current_summary") + if not prev and not curr: + console.print("[dim](no compression summary yet — nothing compressed)[/]") + return + if prev: + console.print(Panel(prev, title="previous_summary", border_style="yellow", + expand=False)) + if curr: + console.print(Panel(curr, title="current_summary", border_style="yellow", + expand=False)) + boundary = s.get("compression_boundary") or {} + console.print(f"[dim]boundary: {boundary}[/]") + + +def _cmd_compress(session): + """Show the compression LLM's input prompt and output summary for the + last turn. 
+ + Makes the three things in a compression turn distinguishable: + - what was fed INTO the compression LLM (cyan panels) + - what the compression LLM PRODUCED (yellow panel — the summary) + - the main agent answer is the separate green panel from render_turn. + """ + evs = session.last_turn_events + if not evs: + console.print("[dim](no events from last turn)[/]") + return + + # Pair compression-tagged LLM calls in chronological order. + pairs = [] + pending = None + for e in evs: + if e["event"] == "llm_call_begin" and e["data"].get("tag") == "compression": + pending = e + elif e["event"] == "llm_call_end" and e["data"].get("tag") == "compression": + pairs.append((pending, e)) + pending = None + + if not pairs: + console.print( + "[dim](no compression LLM call last turn — compression did not " + "run, or resolved without invoking the LLM)[/]" + ) + return + + # call_type labels come from compression_call records. Cache hits skip the + # LLM, so only non-cache-hit records line up with the LLM calls above. 
+ call_types = [ + e["data"].get("call_type") + for e in evs + if e["event"] == "compression_call" and not e["data"].get("cache_hit") + ] + + for idx, (begin, end) in enumerate(pairs): + ctype = call_types[idx] if idx < len(call_types) else None + header = f"compression call #{idx + 1}" + if ctype: + header += f" · {ctype}" + console.print(f"\n[bold]{header}[/]") + + in_msgs = (begin["data"].get("input_messages") if begin else None) or [] + for m in in_msgs: + body = m.get("text") or m.get("preview") or "(empty)" + console.print(Panel( + body, + title=(f"→ fed to compression LLM [{m.get('role')}] " + f"{m.get('chars')} chars"), + border_style="cyan", + expand=False, + )) + + d = end["data"] + out = d.get("output_full") or d.get("output_preview") or "(empty)" + console.print(Panel( + out, + title=(f"← compression LLM produced (summary) " + f"{d.get('output_chars')} chars {d.get('duration_ms')}ms"), + border_style="yellow", + expand=False, + )) + + +def _cmd_tokens(session): + if not session.turn_tokens: + console.print("[dim](no turns yet)[/]") + return + t = Table(box=box.SIMPLE_HEAD, title="Token timeline") + t.add_column("Turn", justify="right") + t.add_column("Main in", justify="right") + t.add_column("Main out", justify="right") + t.add_column("Comp in", justify="right") + t.add_column("Comp out", justify="right") + for tk in session.turn_tokens: + t.add_row( + str(tk["turn"]), + str(tk["main_in"]), str(tk["main_out"]), + str(tk["comp_in"] or "-"), str(tk["comp_out"] or "-"), + ) + console.print(t) + + +def _cmd_stats(session): + """Session-wide compression stats — chiefly how many semantic + (LLM-invoking) compressions have run so far, plus cache hits and cost. 
+ + Source is the shared ContextManager's compression_calls_log, which + accumulates across every turn of the session (cleared only by /reset).""" + cm = session.shared_cm + try: + stats = cm.get_all_compression_stats() + except Exception as exc: + console.print(f"[red]could not read compression stats: {exc}[/]") + return + + log = list(getattr(cm, "compression_calls_log", []) or []) + llm_by_type = Counter(r.call_type for r in log if not r.cache_hit) + cache_by_type = Counter(r.call_type for r in log if r.cache_hit) + + t = Table(box=box.SIMPLE, show_header=False) + t.add_column("k", style="cyan") + t.add_column("v") + t.add_row("turns run", str(session.turn)) + t.add_row("LLM compression calls", f"[bold]{stats.get('total_calls', 0)}[/]") + t.add_row("cache hits (no LLM call)", str(stats.get("total_cache_hits", 0))) + t.add_row("total compression attempts", str(stats.get("total_attempts", 0))) + t.add_row( + "compression tokens in→out", + f"{stats.get('total_input_tokens', 0)}→" + f"{stats.get('total_output_tokens', 0)} [dim](API)[/]", + ) + console.print(Panel(t, title="Compression stats — session-wide", + border_style="blue", expand=False)) + if llm_by_type: + bd = " ".join(f"{k}×{n}" for k, n in llm_by_type.items()) + console.print(f"[dim]· LLM compression calls by type: {bd}[/]") + if cache_by_type: + bd = " ".join(f"{k}×{n}" for k, n in cache_by_type.items()) + console.print(f"[dim]· cache-hit (no-LLM) compressions by type: {bd}[/]") + + +def _cmd_trace(session, arg): + events = session.last_turn_events + if not events: + console.print("[dim](no events from last turn)[/]") + return + t = Table(box=box.SIMPLE, title="Last turn — raw events") + t.add_column("seq", justify="right") + t.add_column("step", justify="right") + t.add_column("event", style="cyan") + t.add_column("detail") + for e in events: + d = e["data"] + ev = e["event"] + if ev == "llm_call_end": + detail = (f"tag={d.get('tag')} dur={d.get('duration_ms')}ms " + f"in={d.get('input_tokens')} 
out={d.get('output_tokens')}") + elif ev == "compress_begin": + pd = d.get("predicted_decision") or {} + detail = f"branch={pd.get('branch')}" + elif ev == "compression_call": + detail = (f"type={d.get('call_type')} cache={d.get('cache_hit')} " + f"in={d.get('input_tokens')} out={d.get('output_tokens')}") + elif ev == "compress_end": + tc = d.get("token_counts") or {} + detail = f"{tc.get('last_uncompressed')}→{tc.get('last_compressed')}" + elif ev == "observer_event": + detail = f"[{d.get('process_type')}]" + elif ev == "code_execute_end": + detail = f"dur={d.get('duration_ms')}ms final={d.get('is_final_answer')}" + elif ev == "tool_call_end": + detail = f"tool={d.get('tool')} dur={d.get('duration_ms')}ms" + elif ev == "debug_error": + detail = f"[red]{d.get('phase')}: {d.get('error')}[/]" + else: + detail = "" + t.add_row(str(e["seq"]), str(e.get("agent_step") or "-"), ev, detail) + console.print(t) + + +def _cmd_step(session, arg): + try: + step_n = int(arg) + except (ValueError, TypeError): + console.print("[red]usage: /step N[/]") + return + events = [e for e in session.last_turn_events + if e.get("agent_step") == step_n] + if not events: + console.print(f"[dim](no events at step {step_n} in last turn)[/]") + return + for e in events: + content = json.dumps(e["data"], ensure_ascii=False, indent=2) + if len(content) > 3000: + content = content[:3000] + f"\n…[+{len(content)-3000} chars]" + console.print(Panel(content, title=f"seq={e['seq']} {e['event']}", + border_style="cyan", expand=False)) + + +HELP = """[bold]Commands[/] + /help this help + /context [N] context the main LLM received last turn (post-compression) + /history raw session ledger (every turn verbatim, pre-compression) + /summary current compression summary (full text) + /compress last turn's compression LLM input prompt + output summary + /tokens per-turn token timeline + /stats session-wide compression stats (LLM compression call count) + /trace raw trace events from the last turn + /step N dump 
every event of agent step N (last turn) + /config show ContextManagerConfig + /reset [threshold] fresh session, optionally new token_threshold + /quit /q exit + +Anything not starting with / is sent to the agent as a user turn.""" + + +def handle_command(session, line): + """Return (new_session_or_None, should_quit).""" + parts = line.split() + cmd = parts[0].lower() + arg = parts[1] if len(parts) > 1 else None + + if cmd in ("/quit", "/q", "/exit"): + return None, True + if cmd == "/help": + console.print(Panel(HELP, border_style="magenta", expand=False)) + elif cmd == "/context": + _cmd_context(session, arg) + elif cmd == "/history": + _cmd_history(session) + elif cmd == "/summary": + _cmd_summary(session) + elif cmd == "/compress": + _cmd_compress(session) + elif cmd == "/tokens": + _cmd_tokens(session) + elif cmd == "/stats": + _cmd_stats(session) + elif cmd == "/trace": + _cmd_trace(session, arg) + elif cmd == "/step": + _cmd_step(session, arg) + elif cmd == "/config": + _print_config(session) + elif cmd == "/reset": + threshold = session.cm_config.token_threshold + if arg: + try: + threshold = int(arg) + except ValueError: + console.print("[red]threshold must be an integer[/]") + return session, False + new = Session(token_threshold=threshold) + console.print(f"[green]session reset[/] (token_threshold={threshold})") + return new, False + else: + console.print(f"[red]unknown command: {cmd}[/] (/help)") + return session, False + + +def main(): + console.print(Panel( + "Nexent Context Debugger — interactive REPL\n" + "Type a message to run one agent turn. /help for commands.\n" + "Up/down arrows recall earlier input (history kept across sessions).", + border_style="magenta", expand=False, + )) + session = Session() + _print_config(session) + _init_history() + + while True: + try: + # Builtin input() (not console.input) so readline owns the prompt + # and up/down-arrow history recall works cleanly. 
+ raw = input(_PROMPT) + except (EOFError, KeyboardInterrupt): + console.print("\n[dim]bye.[/]") + break + _save_history() + + line = _clean_input(raw).strip() + + if not line: + continue + + if line.startswith("/"): + session, should_quit = handle_command(session, line) + if should_quit: + console.print("[dim]bye.[/]") + break + continue + + with console.status("[dim]running agent turn…[/]"): + try: + result = session.run_turn(line) + except Exception as exc: + console.print(f"[red]turn failed: {exc}[/]") + import traceback + traceback.print_exc(file=_OUT) + continue + + render_turn(session, result, session.last_turn_events) + + +if __name__ == "__main__": + main() diff --git a/sdk/ctx_debugger/langfuse/.gitignore b/sdk/ctx_debugger/langfuse/.gitignore new file mode 100644 index 000000000..18b219590 --- /dev/null +++ b/sdk/ctx_debugger/langfuse/.gitignore @@ -0,0 +1,2 @@ +# Instance secrets — keep local, do not commit. +.env diff --git a/sdk/ctx_debugger/langfuse/docker-compose.yml b/sdk/ctx_debugger/langfuse/docker-compose.yml new file mode 100644 index 000000000..ad79bed5f --- /dev/null +++ b/sdk/ctx_debugger/langfuse/docker-compose.yml @@ -0,0 +1,169 @@ +# Self-hosted Langfuse for ctx_debugger trace analysis. +# +# Adapted from the official langfuse/langfuse docker-compose.yml. Isolation +# choices for running alongside the Nexent stack: +# - separate compose project (run with: docker compose -p langfuse ...) +# - ONLY langfuse-web is published, on 127.0.0.1:3100 (Nexent web owns 3000) +# - postgres / clickhouse / redis / minio / worker have NO published ports; +# they talk over the isolated `langfuse_default` network only +# Secrets and init credentials live in the sibling .env file. 
+services: + langfuse-worker: + image: docker.io/langfuse/langfuse-worker:3 + restart: always + depends_on: &langfuse-depends-on + postgres: + condition: service_healthy + minio: + condition: service_healthy + redis: + condition: service_healthy + clickhouse: + condition: service_healthy + environment: &langfuse-worker-env + NEXTAUTH_URL: ${NEXTAUTH_URL:-http://localhost:3100} + DATABASE_URL: ${DATABASE_URL:-postgresql://postgres:postgres@postgres:5432/postgres} + SALT: ${SALT:-mysalt} + ENCRYPTION_KEY: ${ENCRYPTION_KEY:-0000000000000000000000000000000000000000000000000000000000000000} + TELEMETRY_ENABLED: ${TELEMETRY_ENABLED:-true} + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: ${LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES:-false} + CLICKHOUSE_MIGRATION_URL: ${CLICKHOUSE_MIGRATION_URL:-clickhouse://clickhouse:9000} + CLICKHOUSE_URL: ${CLICKHOUSE_URL:-http://clickhouse:8123} + CLICKHOUSE_USER: ${CLICKHOUSE_USER:-clickhouse} + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-clickhouse} + CLICKHOUSE_CLUSTER_ENABLED: ${CLICKHOUSE_CLUSTER_ENABLED:-false} + LANGFUSE_USE_AZURE_BLOB: ${LANGFUSE_USE_AZURE_BLOB:-false} + LANGFUSE_USE_OCI_NATIVE_OBJECT_STORAGE: ${LANGFUSE_USE_OCI_NATIVE_OBJECT_STORAGE:-false} + LANGFUSE_OCI_AUTH_TYPE: ${LANGFUSE_OCI_AUTH_TYPE:-workload_identity} + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: ${LANGFUSE_S3_EVENT_UPLOAD_BUCKET:-langfuse} + LANGFUSE_S3_EVENT_UPLOAD_REGION: ${LANGFUSE_S3_EVENT_UPLOAD_REGION:-auto} + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID:-minio} + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY:-miniosecret} + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: ${LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT:-http://minio:9000} + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: ${LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE:-true} + LANGFUSE_S3_EVENT_UPLOAD_PREFIX: ${LANGFUSE_S3_EVENT_UPLOAD_PREFIX:-events/} + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: ${LANGFUSE_S3_MEDIA_UPLOAD_BUCKET:-langfuse} + 
LANGFUSE_S3_MEDIA_UPLOAD_REGION: ${LANGFUSE_S3_MEDIA_UPLOAD_REGION:-auto} + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID:-minio} + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY:-miniosecret} + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: ${LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT:-http://minio:9000} + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: ${LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE:-true} + LANGFUSE_S3_MEDIA_UPLOAD_PREFIX: ${LANGFUSE_S3_MEDIA_UPLOAD_PREFIX:-media/} + LANGFUSE_S3_BATCH_EXPORT_ENABLED: ${LANGFUSE_S3_BATCH_EXPORT_ENABLED:-false} + LANGFUSE_S3_BATCH_EXPORT_BUCKET: ${LANGFUSE_S3_BATCH_EXPORT_BUCKET:-langfuse} + LANGFUSE_S3_BATCH_EXPORT_PREFIX: ${LANGFUSE_S3_BATCH_EXPORT_PREFIX:-exports/} + LANGFUSE_S3_BATCH_EXPORT_REGION: ${LANGFUSE_S3_BATCH_EXPORT_REGION:-auto} + LANGFUSE_S3_BATCH_EXPORT_ENDPOINT: ${LANGFUSE_S3_BATCH_EXPORT_ENDPOINT:-http://minio:9000} + LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT: ${LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT:-http://minio:9000} + LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID: ${LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID:-minio} + LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY: ${LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY:-miniosecret} + LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE: ${LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE:-true} + LANGFUSE_INGESTION_QUEUE_DELAY_MS: ${LANGFUSE_INGESTION_QUEUE_DELAY_MS:-} + LANGFUSE_INGESTION_CLICKHOUSE_WRITE_INTERVAL_MS: ${LANGFUSE_INGESTION_CLICKHOUSE_WRITE_INTERVAL_MS:-} + REDIS_HOST: ${REDIS_HOST:-redis} + REDIS_PORT: ${REDIS_PORT:-6379} + REDIS_AUTH: ${REDIS_AUTH:-myredissecret} + REDIS_TLS_ENABLED: ${REDIS_TLS_ENABLED:-false} + EMAIL_FROM_ADDRESS: ${EMAIL_FROM_ADDRESS:-} + SMTP_CONNECTION_URL: ${SMTP_CONNECTION_URL:-} + + langfuse-web: + image: docker.io/langfuse/langfuse:3 + restart: always + depends_on: *langfuse-depends-on + # Only this service is published. 
Bound to all interfaces so a Windows + # netsh portproxy (host:3100 -> WSL:3100) can reach it from the LAN. + # All other Langfuse components stay internal — no other port is exposed. + ports: + - "3100:3000" + environment: + <<: *langfuse-worker-env + NEXTAUTH_SECRET: ${NEXTAUTH_SECRET:-mysecret} + LANGFUSE_INIT_ORG_ID: ${LANGFUSE_INIT_ORG_ID:-} + LANGFUSE_INIT_ORG_NAME: ${LANGFUSE_INIT_ORG_NAME:-} + LANGFUSE_INIT_PROJECT_ID: ${LANGFUSE_INIT_PROJECT_ID:-} + LANGFUSE_INIT_PROJECT_NAME: ${LANGFUSE_INIT_PROJECT_NAME:-} + LANGFUSE_INIT_PROJECT_PUBLIC_KEY: ${LANGFUSE_INIT_PROJECT_PUBLIC_KEY:-} + LANGFUSE_INIT_PROJECT_SECRET_KEY: ${LANGFUSE_INIT_PROJECT_SECRET_KEY:-} + LANGFUSE_INIT_USER_EMAIL: ${LANGFUSE_INIT_USER_EMAIL:-} + LANGFUSE_INIT_USER_NAME: ${LANGFUSE_INIT_USER_NAME:-} + LANGFUSE_INIT_USER_PASSWORD: ${LANGFUSE_INIT_USER_PASSWORD:-} + + clickhouse: + image: docker.io/clickhouse/clickhouse-server + restart: always + user: "101:101" + environment: + CLICKHOUSE_DB: default + CLICKHOUSE_USER: ${CLICKHOUSE_USER:-clickhouse} + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-clickhouse} + volumes: + - langfuse_clickhouse_data:/var/lib/clickhouse + - langfuse_clickhouse_logs:/var/log/clickhouse-server + healthcheck: + test: wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1 + interval: 5s + timeout: 5s + retries: 10 + start_period: 1s + + minio: + image: cgr.dev/chainguard/minio + restart: always + entrypoint: sh + # create the 'langfuse' bucket before starting the service + command: -c 'mkdir -p /data/langfuse && minio server --address ":9000" --console-address ":9001" /data' + environment: + MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minio} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-miniosecret} + volumes: + - langfuse_minio_data:/data + healthcheck: + test: ["CMD", "mc", "ready", "local"] + interval: 1s + timeout: 5s + retries: 5 + start_period: 1s + + redis: + image: docker.io/redis:7 + restart: always + command: > + --requirepass 
${REDIS_AUTH:-myredissecret} + --maxmemory-policy noeviction + volumes: + - langfuse_redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 3s + timeout: 10s + retries: 10 + + postgres: + image: docker.io/postgres:${POSTGRES_VERSION:-17} + restart: always + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 3s + timeout: 3s + retries: 10 + environment: + POSTGRES_USER: ${POSTGRES_USER:-postgres} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres} + POSTGRES_DB: ${POSTGRES_DB:-postgres} + TZ: UTC + PGTZ: UTC + volumes: + - langfuse_postgres_data:/var/lib/postgresql/data + +volumes: + langfuse_postgres_data: + driver: local + langfuse_clickhouse_data: + driver: local + langfuse_clickhouse_logs: + driver: local + langfuse_minio_data: + driver: local + langfuse_redis_data: + driver: local diff --git a/sdk/ctx_debugger/langfuse_eval_assessment.md b/sdk/ctx_debugger/langfuse_eval_assessment.md new file mode 100644 index 000000000..cade3395c --- /dev/null +++ b/sdk/ctx_debugger/langfuse_eval_assessment.md @@ -0,0 +1,78 @@ +# Langfuse Evaluation Capability Adaptation Assessment + +For the three benchmarks in this repo (`sdk/benchmark/`) — `manual_cases` / `acon_eval` / `eventqa_eval` — evaluate feasibility and gaps of using Langfuse's built-in **Evaluation / Scores / LLM-as-a-Judge / Human Annotation / Datasets** as the main evaluation framework. + +> Scope: Only evaluate Langfuse evaluation features. We already use Langfuse's trace visualization and session grouping (`ctx_debugger/langfuse_export.py`), that part not discussed here. + +--- + +## 1. 
Langfuse Evaluation Capabilities vs This Repo's Needs + +| Langfuse Feature | Design Purpose | Where suitable in this repo | +|---|---|---| +| **Scores** | Attach numeric/category metrics to trace / observation / session | ✅ Attach each question's correct/incorrect / retention / token_reduction; dashboard cross-session comparison | +| **LLM-as-a-Judge** | Let a judge LLM score open-ended answers | ⚠️ Most evaluation here is deterministic (MCQ, EM/F1, keywords); a judge would instead introduce noise | +| **Human Annotation** | Queue traces for manual annotation | ⚠️ Only useful for open-ended output/quality subjective judgment | +| **Datasets** | Collection of input + expected output pairs, run experiment | ⚠️ Dataset and task model mismatch (see below) | + +--- + +## 2. (a) Overall Benchmark Adaptation Assessment + +Three benchmarks' evaluation methods: + +| Benchmark | Evaluation Method | Langfuse Replacement Feasibility | +|---|---|---| +| `manual_cases` | `eval_text(text, check)` keyword `must_contain` / `must_contain_any` | Keyword check done externally is cheaper and more accurate; **but switching the summary inspection layer to LLM-as-a-Judge has value**—current `must_contain` only verifies "appeared or not", while a judge can ask "does this summary retain key states" | +| `acon_eval` | EM / F1 (deterministic string) | ❌ No need for judge / annotation | +| `eventqa_eval` | Six-choice string match | ❌ No need for judge / annotation | + +**Structural gap**: Langfuse's Experiment framework follows a **"one input → one LLM call → one output"** model. Our task is an **entire agent run + multi-turn ingest + multiple probes**—it doesn't match Langfuse Dataset/Experiment's "task per item". Forcing it in amounts to splitting `run_*.py` into a pile of Langfuse callbacks: complexity rises and the benefit is small. + +**Real incremental value** in two areas: + +1. 
**Scores push (high priority)**: Extend `langfuse_export.py`, attach each probe trace with `correctness: 0/1` score, attach entire session with aggregate `accuracy` / `retention` / `token_reduction`. Dashboard can visualize time-series comparison of different params/schema/models. **Highest ROI integration**. +2. **LLM-as-a-Judge only for `manual_cases` summary inspection layer**: Current `summary_checks` uses `must_contain` keyword check, which misses synonymous rewrites. Switching to a judge that evaluates "does summary retain X info" is more robust. But don't touch acon/eventqa—on MCQ a judge would instead introduce misjudgments. + +--- + +## 3. (b) EventQA Individual Assessment + +| Dimension | Langfuse Replacement | Evaluation | +|---|---|---| +| Probe MCQ scoring | Langfuse Scores | ✅ **Feasible and recommended**—attach each probe trace with `correctness: 0/1`, `match_type: exact/containment/fuzzy/no_answer` | +| Token reduction | Langfuse built-in token tracking | ✅ Langfuse **has a built-in per-call token count** (input/output/cost), more precise than "take last turn get_token_counts"; can use ingest phase LLM calls total tokens as Score | +| Retention (compressed/baseline) | Langfuse cross-session aggregation | ⚠️ Langfuse **does not compute retention automatically**—it only shows each arm's own acc; the ratio needs external calculation and is then pushed as a Score | +| LLM-as-a-Judge | — | ❌ **Not needed**—MCQ gold is one of six options, deterministic match sufficient; judge introduces unnecessary LLM calls | +| Human Annotation | — | ❌ **Not needed**—same as above | +| Datasets | Put 100 questions into Langfuse Dataset | ⚠️ **Duplicate data storage**—we already have `data/eventqa_full.jsonl`; unless running Langfuse Experiment flow, pure duplication | + +### EventQA Specific Gaps + +1. **Cannot "end-to-end run EventQA in Langfuse"**—its task model is "one input → one LLM call → one output". 
EventQA's "input" is entire novel (needs 24 turns of ingest to compress), "output" is 100 question answers. 
Forcing the entire ingest+probe flow into a Langfuse Experiment is unnatural—we would still need the external `run_eventqa.py` to run it and then import the results. + +2. **Retention is cross-arm ratio**: Langfuse has no "cross session/trace automatic comparison" concept. To get compressed_acc / baseline_acc we must calculate it externally and then push it. + +3. **Per-probe context cost**: Langfuse's token count is the LLM's actual input/output tokens, **more precise than the `manual_cases`-style "take last turn effective tokens"**. We can switch to the Langfuse-reported real token cost to replace the single-point estimate. + +--- + +## 4. Implementation Priority + +By descending benefit: + +| Priority | Action | Benefit | Effort | +|---|---|---|---| +| **High (already implemented)** | Extend `langfuse_export.py`: Add `--benchmarkqa-outputs <dir>`; each probe trace attach `correctness` (NUMERIC 0/1) + `match_type` (CATEGORICAL), score metadata contains arm / schema / qid. Langfuse UI auto aggregates `correctness` by session, filter by `metadata.arm` can split compressed / baseline. `retention` / `token_reduction` are **not pushed**—they are already in `outputs/<run>/summary.json`, and pushing them to Langfuse would instead require creating a phantom "session-summary" trace that pollutes the trace list. | Dashboard directly see time-series / cross-session comparison; foundation for other features | ~80 lines | +| **Medium** | Add LLM-as-a-Judge evaluator for `manual_cases`' summary_checks (doesn't miss synonymous rewrites) | Real complement to `must_contain` keyword method | ~100 lines + judge prompt design | +| **Low** | Move EventQA data into Langfuse Dataset | Not much new value—already have jsonl | ~30 lines | +| **Don't do** | Move EventQA evaluation main flow to Langfuse Experiments | Model mismatch—forcing it amounts to splitting `run_eventqa.py` into a pile of callbacks | × | +| **Don't do** | LLM-as-a-Judge / Human Annotation on MCQ | Introduces noise, no benefit | × | + +--- + +## 5. 
Summary + +- Langfuse's evaluation framework **cannot replace main flow** (agent multi-turn ingest + probe + cross-arm retention structure doesn't match its task model) +- **Only high ROI integration is Scores push**—push existing evaluation results into Langfuse for visualization, convenient cross-session comparison of params/model/schema tuning +- LLM-as-a-Judge / Human Annotation / Datasets only have marginal value, for a small part of `manual_cases`' summary inspection; for acon/eventqa deterministic evaluation they introduce noise \ No newline at end of file diff --git a/sdk/ctx_debugger/langfuse_export.py b/sdk/ctx_debugger/langfuse_export.py new file mode 100644 index 000000000..f5509eb09 --- /dev/null +++ b/sdk/ctx_debugger/langfuse_export.py @@ -0,0 +1,543 @@ +"""Export a ctx_debugger JSONL trace into Langfuse for visual analysis. + +This is the "option 1" adapter: instead of building a custom web UI, map the +trace onto a self-hosted Langfuse instance and get nested traces, drill-down, +token/cost views and session grouping for free. + +Mapping: + each agent turn (an `agent_init` event) -> one Langfuse trace + llm_call_begin/end -> a generation + compress_begin/end -> a span wrapping its + compression generations + tool_call_begin/end -> a tool observation + code_execute_begin/end -> a span + the whole file -> one Langfuse session + +Usage (from sdk/): + python -m ctx_debugger.langfuse_export <trace.jsonl> [options] + +Options: + --session-id ID Langfuse session id (default: <trace-stem>-<MMDD-HHMMSS>) + --dry-run Print the mapped trace tree; do not contact Langfuse + --host URL Langfuse host (else $LANGFUSE_HOST) + +Langfuse credentials are read from the environment, the standard way: + LANGFUSE_HOST, LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY + +Known limitation: observations are created at export time, so each one's +duration is faithful but absolute placement on the Langfuse timeline is the +export moment, not the original wall-clock time. 
+""" + +import argparse +import contextlib +import json +import os +import re +import sys +import time +from typing import Any, Dict, List, Optional + +# Begin event -> its matching end event. Everything else is standalone. +BEGIN_TO_END = { + "compress_begin": "compress_end", + "llm_call_begin": "llm_call_end", + "code_execute_begin": "code_execute_end", + "tool_call_begin": "tool_call_end", +} +END_EVENTS = set(BEGIN_TO_END.values()) + + +class Obs: + """One Langfuse observation built from a begin/end event pair.""" + + __slots__ = ("as_type", "name", "input", "output", "metadata", + "usage", "duration_ms", "children") + + def __init__(self, as_type: str, name: str): + self.as_type = as_type + self.name = name + self.input: Any = None + self.output: Any = None + self.metadata: Dict[str, Any] = {} + self.usage: Optional[Dict[str, int]] = None + self.duration_ms: Optional[float] = None + self.children: List["Obs"] = [] + + +# ============================================================ +# Trace file -> per-turn segments +# ============================================================ + +def _load(path: str) -> List[dict]: + events = [] + with open(path, encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + events.append(json.loads(line)) + return events + + +# ============================================================ +# Benchmark probe-score helpers (optional --benchmarkqa-outputs) +# ============================================================ +# When the caller points us at a benchmark outputs// directory +# (currently eventqa_eval; longmemeval & others can plug in later as long as +# they emit a compatible predictions.jsonl), we read its predictions.jsonl +# and attach Langfuse Scores to each probe trace: +# * name=correctness (NUMERIC 0/1) + name=match_type (CATEGORICAL), +# with arm/schema in score metadata +# AND session-level aggregates (read from summary.json) pushed directly +# to the session_id: +# * baseline_accuracy / 
compressed_accuracy_ / +# memory_retention_ / token_reduction_ +# These show up in the Langfuse project session list as per-session +# aggregates — visible alongside session name without drilling into traces. + +def _qnum(qid: Optional[str]) -> int: + m = re.search(r"no(\d+)$", qid or "") + return int(m.group(1)) if m else -1 + + +def _load_benchmark_outputs(out_dir: Optional[str]) -> Optional[dict]: + if not out_dir: + return None + pred_p = os.path.join(out_dir, "predictions.jsonl") + sum_p = os.path.join(out_dir, "summary.json") + if not os.path.exists(pred_p): + return None + preds = [] + with open(pred_p, encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + preds.append(json.loads(line)) + preds.sort(key=lambda p: _qnum(p.get("qid"))) + summary = None + if os.path.exists(sum_p): + with open(sum_p, encoding="utf-8") as f: + summary = json.load(f) + return {"predictions": preds, "summary": summary} + + +def _push_session_aggregates(client, session_id: str, summary: dict) -> int: + """Push session-level aggregates (baseline_accuracy / compressed_accuracy_ + / memory_retention_ / token_reduction_) directly to the + session — no host trace required. Despite my earlier failed API queries, + these scores DO persist in Langfuse v4 and show up in the project session + list as per-session aggregates (visible in the UI alongside the session + name, no need to drill into a trace). 
+ """ + if not summary: + return 0 + pushed = 0 + + def _push(name, value): + nonlocal pushed + if value is None: + return + try: + client.create_score(session_id=session_id, name=name, + value=float(value), data_type="NUMERIC") + pushed += 1 + except Exception as e: + print(f" warn: failed to push {name}={value}: {e}", file=sys.stderr) + + _push("baseline_accuracy", (summary.get("baseline") or {}).get("accuracy")) + for schema, c in (summary.get("compressed") or {}).items(): + _push(f"compressed_accuracy_{schema}", c.get("accuracy")) + _push(f"memory_retention_{schema}", c.get("memory_retention")) + _push(f"token_reduction_{schema}", c.get("token_reduction")) + return pushed + + +def _classify_probe_arm(events: List[dict]) -> str: + """compressed vs baseline — detect by the 'Here is the novel' marker.""" + for ev in events: + if ev.get("event") != "llm_call_begin": + continue + for m in ev.get("data", {}).get("input_messages", []) or []: + txt = m.get("text") or m.get("preview") or "" + if "Here is the novel" in txt: + return "baseline" + break + return "compressed" + + +def _split_turns(events: List[dict]) -> List[dict]: + """Split a flat event list into per-turn segments, one per agent_init.""" + turns: List[dict] = [] + current: Optional[dict] = None + orphan: List[dict] = [] + for e in events: + ev = e["event"] + if ev == "run_begin": + continue + if ev == "agent_init": + if current is not None: + turns.append(current) + current = {"init": e, "events": []} + elif current is None: + orphan.append(e) + else: + current["events"].append(e) + if current is not None: + turns.append(current) + if orphan: + if turns: + turns[0]["events"] = orphan + turns[0]["events"] + else: + turns.append({"init": None, "events": orphan}) + return turns + + +# ============================================================ +# Events -> intermediate observation tree +# ============================================================ + +def _chat(input_messages: Any) -> List[dict]: + 
"""Render captured input_messages as a chat list for Langfuse.""" + out = [] + for m in input_messages or []: + out.append({ + "role": m.get("role"), + "content": m.get("text") or m.get("preview") or "", + }) + return out + + +def _begin_obs(begin_ev: str, data: dict) -> Obs: + if begin_ev == "llm_call_begin": + tag = data.get("tag", "?") + o = Obs("generation", f"{tag} LLM call") + o.input = _chat(data.get("input_messages")) + o.metadata = {"tag": tag, "stop_sequences": data.get("stop_sequences")} + return o + if begin_ev == "compress_begin": + o = Obs("span", "compression") + o.input = { + "predicted_decision": data.get("predicted_decision"), + "estimated_tokens": data.get("estimated_tokens"), + } + o.metadata = { + "compression_step": data.get("compression_step"), + "config": data.get("config"), + "summary_before": data.get("summary_before"), + } + return o + if begin_ev == "code_execute_begin": + o = Obs("span", "code execution") + o.input = data.get("code_preview") + o.metadata = {"code_chars": data.get("code_chars")} + return o + if begin_ev == "tool_call_begin": + o = Obs("tool", f"tool: {data.get('tool', '?')}") + o.input = {"args": data.get("args"), "kwargs": data.get("kwargs")} + return o + return Obs("span", begin_ev) + + +def _finish_obs(obs: Obs, begin_ev: str, begin_e: dict, end_e: dict) -> None: + d = end_e["data"] + obs.duration_ms = round((end_e["ts"] - begin_e["ts"]) * 1000, 1) + if begin_ev == "llm_call_begin": + obs.output = d.get("output_full") or d.get("output_preview") + it, ot = d.get("input_tokens"), d.get("output_tokens") + if it is not None or ot is not None: + obs.usage = {"input": it or 0, "output": ot or 0} + if d.get("error"): + obs.metadata["error"] = d["error"] + elif begin_ev == "compress_begin": + obs.output = { + "token_counts": d.get("token_counts"), + "summary_changed": d.get("summary_changed"), + "summary_after": d.get("summary_after"), + } + obs.metadata["success"] = d.get("success") + obs.metadata["step_stats"] = 
d.get("step_stats") + elif begin_ev == "code_execute_begin": + obs.output = { + "output": d.get("output_preview"), + "logs": d.get("logs_preview"), + } + obs.metadata["is_final_answer"] = d.get("is_final_answer") + elif begin_ev == "tool_call_begin": + obs.output = d.get("return_preview") + obs.metadata["return_type"] = d.get("return_type") + + +def _build_tree(events: List[dict]) -> List[Obs]: + """Pair begin/end events into a nested observation tree.""" + roots: List[Obs] = [] + stack: List[tuple] = [] # (obs, begin_event, begin_ev_name) + for e in events: + ev = e["event"] + if ev in BEGIN_TO_END: + obs = _begin_obs(ev, e["data"]) + (stack[-1][0].children if stack else roots).append(obs) + stack.append((obs, e, ev)) + elif ev in END_EVENTS: + for i in range(len(stack) - 1, -1, -1): + obs, begin_e, begin_ev = stack[i] + if BEGIN_TO_END[begin_ev] == ev: + _finish_obs(obs, begin_ev, begin_e, e) + del stack[i:] # close it (and any left wrongly open) + break + elif ev == "compression_call": + for obs, _be, begin_ev in reversed(stack): + if begin_ev == "compress_begin": + obs.metadata.setdefault("compression_calls", []).append(e["data"]) + break + elif ev == "debug_error": + target = stack[-1][0].metadata if stack else None + if target is not None: + target.setdefault("debug_errors", []).append(e["data"]) + # observer_event and others are intentionally skipped (noise). 
+ return roots + + +def _init_payload(init: Optional[dict]): + if not init: + return None, {} + d = init["data"] + inp = { + "agent": d.get("agent_name"), + "agent_class": d.get("agent_class"), + "tools": [t.get("name") for t in d.get("tools", [])], + } + meta = { + "system_prompt": d.get("system_prompt"), + "system_prompt_chars": d.get("system_prompt_chars"), + "max_steps": d.get("max_steps"), + "context_manager_config": d.get("context_manager_config"), + } + return inp, meta + + +# ============================================================ +# Dry-run printer +# ============================================================ + +def _print_turns(turns: List[dict]) -> None: + for i, turn in enumerate(turns, 1): + roots = _build_tree(turn["events"]) + init = turn["init"] + agent = (init["data"].get("agent_name") if init else None) or "agent" + print(f"\n● trace: turn {i} · {agent}") + for o in roots: + _print_obs(o, 1) + + +def _print_obs(o: Obs, depth: int) -> None: + pad = " " * depth + dur = f"{o.duration_ms / 1000:.1f}s" if o.duration_ms else "-" + extra = "" + if o.usage: + extra = f" in={o.usage['input']} out={o.usage['output']} tok" + print(f"{pad}{o.name} [{o.as_type}] {dur}{extra}") + for c in o.children: + _print_obs(c, depth + 1) + + +# ============================================================ +# Langfuse push +# ============================================================ + +def _clean(d: Optional[dict]) -> dict: + return {k: v for k, v in (d or {}).items() if v is not None} + + +def _emit(parent, o: Obs) -> None: + """Recursively create a Langfuse observation and its children.""" + start_ns = time.time_ns() + kwargs: Dict[str, Any] = {"name": o.name, "as_type": o.as_type} + if o.input is not None: + kwargs["input"] = o.input + md = _clean(o.metadata) + if md: + kwargs["metadata"] = md + if o.usage and o.as_type == "generation": + kwargs["usage_details"] = o.usage + child = parent.start_observation(**kwargs) + for c in o.children: + _emit(child, c) + if 
o.output is not None: + child.update(output=o.output) + # Explicit end_time so the displayed duration matches the recorded one. + child.end(end_time=start_ns + int((o.duration_ms or 0) * 1e6)) + + +def _push_probe_score(client, turn: dict, trace_id: str, benchmark_data: dict, + comp_idx: int, base_idx: int) -> tuple: + """If this turn is a benchmark probe agent, attach correctness + match_type + scores to the just-created trace. Returns updated (comp_idx, base_idx).""" + init = turn.get("init") or {} + agent_name = (init.get("data") or {}).get("agent_name") or "" + # Currently recognises eventqa_answerer; longmemeval / other benchmarks + # can plug in here once their probe agent uses an *_answerer name. + if "answerer" not in agent_name: + return comp_idx, base_idx + if not trace_id: + return comp_idx, base_idx + + arm = _classify_probe_arm(turn["events"]) + preds = benchmark_data["predictions"] + idx = comp_idx if arm == "compressed" else base_idx + if idx >= len(preds): + return comp_idx, base_idx # out of probes — skip silently + + row = preds[idx] + if arm == "compressed": + compressed_block = row.get("compressed") or {} + # First schema present (single-schema case) — for multi-schema use the + # session-level score breakdown to disambiguate. 
def _export(turns: List[dict], session_id: str,
            benchmark_data: Optional[dict] = None) -> None:
    """Upload each turn as a Langfuse root span and optionally attach scores.

    Args:
        turns: Turn dicts; each is read for its ``"events"`` list and its
            ``"init"`` event (which may be falsy).
        session_id: Langfuse session id used to group the exported turns.
        benchmark_data: When not None, per-probe scores are pushed for every
            turn and session-level aggregates are pushed once at the end,
            using ``benchmark_data["summary"]``.

    The root span is always ended and the client is always flushed, even when
    emitting children or pushing scores raises, so a failing turn does not
    leave an open span or silently drop already-buffered observations.
    """
    from langfuse import Langfuse
    try:
        from langfuse import propagate_attributes
    except ImportError:  # pragma: no cover - older/newer SDK layout
        propagate_attributes = None

    client = Langfuse()
    comp_idx = 0
    base_idx = 0

    try:
        for i, turn in enumerate(turns, 1):
            roots = _build_tree(turn["events"])
            init = turn["init"]
            agent = (init["data"].get("agent_name") if init else None) or "agent"
            inp, meta = _init_payload(init)

            # The span duration is the distance between the earliest and the
            # latest recorded event timestamp of this turn.
            all_ev = turn["events"] + ([init] if init else [])
            t0 = min((e["ts"] for e in all_ev), default=time.time())
            t1 = max((e["ts"] for e in all_ev), default=t0)

            ctx = (propagate_attributes(session_id=session_id, trace_name=f"turn-{i}")
                   if propagate_attributes else contextlib.nullcontext())
            with ctx:
                start_ns = time.time_ns()
                root = client.start_observation(
                    name=f"turn {i}: {agent}", as_type="span",
                    input=inp, metadata=_clean(meta),
                )
                try:
                    for o in roots:
                        _emit(root, o)

                    # Attach per-probe correctness scores using the explicit
                    # trace_id of the just-created root observation. Doesn't
                    # depend on OTEL "current span" context (start_observation
                    # does NOT make the span current — would need
                    # start_as_current_observation).
                    if benchmark_data is not None:
                        comp_idx, base_idx = _push_probe_score(
                            client, turn, getattr(root, "trace_id", None),
                            benchmark_data, comp_idx, base_idx,
                        )
                finally:
                    # End relative to the export start so the uploaded span
                    # keeps the original turn duration.
                    root.end(end_time=start_ns + int((t1 - t0) * 1e9))

        if benchmark_data is not None:
            # Per-probe scores attached above. Now push session-level
            # aggregates directly to the session_id — these show up in the
            # Langfuse project session list as per-session aggregates without
            # needing a phantom 'session-summary' trace.
            n = _push_session_aggregates(client, session_id,
                                         benchmark_data.get("summary") or {})
            print(f" scores: {comp_idx} compressed + {base_idx} baseline "
                  f"correctness on probe traces + {n} session aggregates")
    finally:
        client.flush()


# ============================================================
# CLI
# ============================================================

def main() -> None:
    """Parse the command line and export (or dry-run print) a JSONL trace.

    Exits with a message when the trace contains no turns, or when a real
    export is requested without LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY set.
    """
    ap = argparse.ArgumentParser(
        prog="ctx-langfuse-export",
        description="Export a ctx_debugger JSONL trace into Langfuse.",
    )
    ap.add_argument("trace", help="Path to a ctx_debugger JSONL trace file.")
    ap.add_argument("--session-id", help="Langfuse session id to group turns.")
    ap.add_argument("--dry-run", action="store_true",
                    help="Print the mapped trace tree; do not contact Langfuse.")
    ap.add_argument("--host", help="Langfuse host (else $LANGFUSE_HOST).")
    ap.add_argument(
        "--benchmarkqa-outputs", default=None,
        help=("Optional path to a benchmark outputs// directory "
              "(e.g. eventqa_eval/outputs/eventqa_full_book0). When set, the "
              "export attaches per-probe Langfuse Scores: name=correctness "
              "(NUMERIC 0/1) + name=match_type (CATEGORICAL), with arm/schema "
              "in score metadata. Langfuse UI rolls these up into per-session "
              "averages automatically (filter by metadata.arm). When NOT set, "
              "the export does plain trace upload — identical to before."),
    )
    args = ap.parse_args()

    events = _load(args.trace)
    turns = _split_turns(events)
    if not turns:
        sys.exit("No turns (agent_init events) found in this trace.")

    # Default session id: trace file stem plus a month/day-time suffix.
    stem = os.path.splitext(os.path.basename(args.trace))[0]
    session_id = args.session_id or f"{stem}-{time.strftime('%m%d-%H%M%S')}"

    if args.dry_run:
        print(f"DRY RUN — {len(turns)} turn(s), session_id={session_id}")
        _print_turns(turns)
        return

    if args.host:
        os.environ["LANGFUSE_HOST"] = args.host
    if not (os.environ.get("LANGFUSE_PUBLIC_KEY")
            and os.environ.get("LANGFUSE_SECRET_KEY")):
        sys.exit("ERROR: set LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY "
                 "(and LANGFUSE_HOST), or use --dry-run.")

    benchmark_data = _load_benchmark_outputs(args.benchmarkqa_outputs)
    if args.benchmarkqa_outputs and not benchmark_data:
        print(f" warn: --benchmarkqa-outputs={args.benchmarkqa_outputs} did not "
              f"yield predictions.jsonl; skipping score upload.",
              file=sys.stderr)

    _export(turns, session_id, benchmark_data=benchmark_data)
    print(f"Exported {len(turns)} turn(s) to Langfuse — session_id={session_id}")


if __name__ == "__main__":
    main()
TokenBudgetStrategy, + BufferedStrategy, + PriorityWeightedStrategy, + ComponentType, +) from .agent_context import ContextManager, SummaryTaskStep from .summary_cache import PreviousSummaryCache, CurrentSummaryCache, CompressionCallRecord -from .summary_config import ContextManagerConfig +from .summary_config import ContextManagerConfig, StrategyType __all__ = [ "CoreAgent", @@ -17,4 +37,19 @@ "CurrentSummaryCache", "CompressionCallRecord", "ContextManagerConfig", + "StrategyType", + "ContextComponent", + "SystemPromptComponent", + "ToolsComponent", + "SkillsComponent", + "MemoryComponent", + "KnowledgeBaseComponent", + "ManagedAgentsComponent", + "ExternalAgentsComponent", + "ContextStrategy", + "FullStrategy", + "TokenBudgetStrategy", + "BufferedStrategy", + "PriorityWeightedStrategy", + "ComponentType", ] \ No newline at end of file diff --git a/sdk/nexent/core/agents/agent_context.py b/sdk/nexent/core/agents/agent_context.py index 134110c5f..0b40d325c 100644 --- a/sdk/nexent/core/agents/agent_context.py +++ b/sdk/nexent/core/agents/agent_context.py @@ -2,6 +2,11 @@ Provides ContextManager for token-aware compression of agent memory, supporting incremental summarization with cache-based optimization. 
# ============================================================
# Standalone utilities (no ContextManager state required)
# ============================================================

# Lower-cased substrings that mark an exception as a context-length failure.
_CONTEXT_LENGTH_MARKERS = (
    "context_length", "context length", "maximum context", "maximum context length",
    "prompt is too long", "reduce the length", "too many tokens",
    "token limit", "exceeds the maximum", "input is too long",
    "input length", "exceeds context", "context window",
)


def format_summary_output(raw_output: str) -> Optional[str]:
    """Clean and validate LLM summary output.

    Strips a surrounding markdown code fence (the ``json`` tag is matched
    case-insensitively), then tries to parse the remainder as JSON and
    re-serialises it with a stable layout. Non-JSON text is returned as-is.

    Args:
        raw_output: Raw text produced by the model; ``None`` is treated as empty.

    Returns:
        Normalised JSON text, the cleaned plain text when it is not valid
        JSON, or ``None`` when nothing is left after cleaning.
    """
    cleaned = (raw_output or "").strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?\s*\n?", "", cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r"\n?```\s*$", "", cleaned).strip()
    if not cleaned:
        return None
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        logger.warning("Summary output is not valid JSON; using as plain text")
        return cleaned
    return json.dumps(parsed, ensure_ascii=False, indent=2)


def _is_context_length_error(err: Exception) -> bool:
    """Check if an exception indicates a context length / token limit error."""
    msg = str(err).lower()
    return any(marker in msg for marker in _CONTEXT_LENGTH_MARKERS)


def _response_text(response) -> str:
    """Flatten a model response's ``content`` into plain text.

    A list content is reduced to its ``{"type": "text"}`` dict blocks joined
    with spaces; ``None`` becomes an empty string so it is not summarised as
    the literal text "None".
    """
    content = response.content
    if content is None:
        return ""
    if isinstance(content, list):
        content = " ".join(
            block.get("text", "")
            for block in content
            if isinstance(block, dict) and block.get("type") == "text"
        )
    return content if isinstance(content, str) else str(content)


def _tail_chars(text: str, max_chars: int) -> str:
    """Return at most the last ``max_chars`` characters of ``text``.

    A non-positive budget yields an empty string; a bare ``text[-0:]`` slice
    would return the whole text instead.
    """
    if max_chars <= 0:
        return ""
    return text[-max_chars:] if len(text) > max_chars else text


def compress_history_offline(
    pairs: List[Tuple[str, str]],
    model,
    config: "Optional[ContextManagerConfig]" = None,
    previous_summary: Optional[str] = None,
) -> dict:
    """Compress conversation history offline, without ContextManager or AgentMemory.

    Standalone helper for static compression inspection in benchmarks: it
    turns plain-text (user, assistant) pairs into a summary using the prompts
    and schema from ``config``, with no cache or agent runtime involved.

    Args:
        pairs: (user_text, assistant_text) tuples to compress.
        model: Callable invoked as ``model(messages, stop_sequences=[])`` whose
            result exposes ``.content``.
        config: Provides prompts, schema and token budgets. Defaults to a fresh
            ``ContextManagerConfig()``. The object is never modified: budgets
            left at ``<= 0`` are derived locally from ``token_threshold``
            (x1.2 for summary input, x0.2 for the fallback reduction).
        previous_summary: Existing summary text. A non-empty value selects the
            incremental prompt; ``None`` or ``""`` selects a fresh summary.

    Returns:
        dict with:
            - "summary": summary text, or None when there was nothing to compress
            - "is_incremental": whether the incremental prompt was used
            - "is_fallback": whether hard truncation replaced the LLM summary
            - "input_text": text fed to the first LLM attempt (for debugging)
            - "input_chars": character count of that text
    """
    config = config or ContextManagerConfig()
    pairs = list(pairs or [])
    if not pairs and not previous_summary:
        return {
            "summary": None,
            "is_incremental": False,
            "is_fallback": False,
            "input_text": "",
            "input_chars": 0,
        }

    # Resolve budgets into locals instead of writing them back onto a config
    # object the caller may be sharing with other components.
    max_input_tokens = config.max_summary_input_tokens
    if max_input_tokens <= 0:
        max_input_tokens = int(config.token_threshold * 1.2)
    max_reduce_tokens = config.max_summary_reduce_tokens
    if max_reduce_tokens <= 0:
        max_reduce_tokens = int(config.token_threshold * 0.2)

    pairs_text = "\n\n".join(
        f"user: {user_text}\nassistant: {assistant_text}"
        for user_text, assistant_text in pairs
    )

    # Same truthiness test as the early return above, so an empty
    # previous_summary does not produce an "update" prompt with nothing in it.
    is_incremental = bool(previous_summary)
    if is_incremental:
        input_text = (
            f"## Previous Summary\n{previous_summary}\n\n"
            f"## New Conversations\n{pairs_text}"
        )
    else:
        input_text = pairs_text

    # Truncate if exceeds budget (simple tail-truncation for offline mode).
    from ..utils.token_estimation import estimate_tokens_text
    if estimate_tokens_text(input_text) > max_input_tokens:
        approx_chars = int(max_input_tokens * config.chars_per_token * 0.9)
        input_text = "...[Earlier content truncated]...\n" + _tail_chars(input_text, approx_chars)

    schema_desc = json.dumps(config.summary_json_schema, ensure_ascii=False, indent=2)
    if is_incremental:
        # Mirror _do_generate_summary: fall back to the initial prompt when the
        # dedicated incremental prompt is empty.
        system_prompt = (
            config.incremental_summary_system_prompt or config.summary_system_prompt
        )
    else:
        system_prompt = config.summary_system_prompt

    def _summarize(text: str) -> Optional[str]:
        """Run one LLM call over ``text`` and normalise its output."""
        if is_incremental:
            user_prompt = (
                f"Update the summary following this JSON structure:\n{schema_desc}\n\n"
                f"{text}"
            )
        else:
            user_prompt = (
                f"Create a structured checkpoint summary following this JSON structure:\n{schema_desc}\n\n"
                f"TURNS TO SUMMARIZE:\n{text}"
            )
        messages = [
            ChatMessage(role=MessageRole.SYSTEM,
                        content=[{"type": "text", "text": system_prompt}]),
            ChatMessage(role=MessageRole.USER,
                        content=[{"type": "text", "text": user_prompt}]),
        ]
        return format_summary_output(_response_text(model(messages, stop_sequences=[])))

    summary = None
    try:
        summary = _summarize(input_text)
    except Exception as e:  # model backends raise heterogeneous exception types
        if _is_context_length_error(e):
            logger.warning("Offline compression exceeds context limit; retrying with 2/3 budget")
            # 0.6 is two thirds of the 0.9 factor used for the first attempt.
            retry_chars = int(max_input_tokens * config.chars_per_token * 0.6)
            try:
                summary = _summarize(_tail_chars(input_text, retry_chars))
            except Exception as e2:
                logger.error(f"Offline compression retry still failed: {e2}")
        else:
            # Previously swallowed without a trace before falling back.
            logger.error(f"Offline compression failed: {e}")

    is_fallback = summary is None
    if is_fallback:
        # L3 fallback: hard truncation
        first_task = pairs[0][0][:200] if pairs else ""
        reduced_text = _tail_chars(
            pairs_text, int(max_reduce_tokens * config.chars_per_token)
        )
        summary = (
            "[CONTEXT COMPACTION — REFERENCE ONLY] Earlier steps were removed to free context space. "
            "The removed content cannot be summarized. Continue based on the steps below.\n\n"
            f"Original task: {first_task}\n\n"
            f"Steps removed: {len(pairs)} of {len(pairs)}\n\n"
            "Remaining compressed history:\n"
            + reduced_text
        )

    return {
        "summary": summary,
        "is_incremental": is_incremental,
        "is_fallback": is_fallback,
        "input_text": input_text,
        "input_chars": len(input_text),
    }
+ self._last_uncompressed_token_count: Optional[int] = None + self._last_compressed_token_count: Optional[int] = None + if self.config.max_summary_input_tokens <= 0: self.config.max_summary_input_tokens = int(self.config.token_threshold * 1.2) if self.config.max_summary_reduce_tokens <= 0: self.config.max_summary_reduce_tokens = int(self.config.token_threshold * 0.2) + self._components: List = [] + # ============================================================ # Cache validation # ============================================================ @@ -241,9 +456,13 @@ def compress_if_needed( # G1 if not self.config.enabled: return original_messages - + if self._estimate_tokens(memory) <= self.config.token_threshold: - return original_messages + # No compression needed; record that compressed == uncompressed + # so benchmark token_reduction reads as zero rather than stale. + self._last_uncompressed_token_count = self._msg_token_count(original_messages) + self._last_compressed_token_count = self._last_uncompressed_token_count + return original_messages with self._lock: # Run detection @@ -296,12 +515,17 @@ def compress_if_needed( self.compression_calls_log.append(record) self._step_local_log.append(record) - return self._build_messages( + compressed_msgs = self._build_messages( memory, prev_summary_step, prev_tail_steps, curr_kept_steps - ) + ) + self._last_uncompressed_token_count = self._msg_token_count(original_messages) + self._last_compressed_token_count = self._msg_token_count(compressed_msgs) + return compressed_msgs self._step_local_log.clear() + self._last_uncompressed_token_count = self._msg_token_count(original_messages) + prev_steps = memory.steps[:current_run_start_idx] curr_steps = memory.steps[current_run_start_idx:] @@ -334,7 +558,7 @@ def compress_if_needed( pairs_to_compress, model ) if summary_text: - if "Truncated" in summary_text: + if "[CONTEXT COMPACTION" in summary_text: prev_summary_step = SummaryTaskStep(task=summary_text, prefix="Context fallback, 
Truncated raw history:") else: prev_summary_step = SummaryTaskStep(task=summary_text) @@ -376,7 +600,7 @@ def compress_if_needed( curr_task, actions_to_compress, model ) if curr_summary_text: - if "Truncated" in curr_summary_text: + if "[CONTEXT COMPACTION" in curr_summary_text: curr_summary_step = SummaryTaskStep(task=curr_summary_text, prefix="Truncated recent action steps:") else: curr_summary_step = SummaryTaskStep(task=curr_summary_text) @@ -399,6 +623,7 @@ def compress_if_needed( memory, prev_summary_step, prev_tail_steps, curr_kept_steps ) final_tokens = self._msg_token_count(final_messages) + self._last_compressed_token_count = final_tokens # This situation is unlikely to occur unless the threshold itself is set unreasonably small if final_tokens > int(self.config.token_threshold * 1.1): logger.warning( @@ -463,7 +688,8 @@ def _compress_previous_with_cache( if input_tokens <= self.config.max_summary_input_tokens: summary_text = self._generate_summary( incremental_input, model, - call_type="previous_incremental" + call_type="previous_incremental", + prompt_type="incremental", ) if summary_text: last_t, last_a = pairs_to_compress[-1] @@ -534,10 +760,19 @@ def _summarize_pairs( logger.warning("previous full/truncated history summary generation failed, triggering L3 fallback truncation") reduced_pairs = self._trim_pairs_to_budget(pairs, self.config.max_summary_reduce_tokens, False) - reduced_text = "Truncated: " + self._render_steps_with_truncation( + reduced_text = self._render_steps_with_truncation( reduced_pairs, fmt="pair", max_tokens=self.config.max_summary_reduce_tokens ) - return reduced_text, False + first_task = pairs[0][0].task[:200] if pairs and pairs[0][0].task else "" + fallback_text = ( + "[CONTEXT COMPACTION — REFERENCE ONLY] Earlier steps were removed to free context space. " + "The removed content cannot be summarized. 
Continue based on the steps below.\n\n" + f"Original task: {first_task}\n\n" + f"Steps removed: {len(pairs) - len(reduced_pairs)} of {len(pairs)}\n\n" + "Remaining compressed history:\n" + + reduced_text + ) + return fallback_text, False # ============================================================ @@ -577,7 +812,9 @@ def _compress_current_with_cache( input_tokens = self._estimate_text_tokens(incremental_input) if input_tokens <= self.config.max_summary_input_tokens: summary_text = self._generate_summary( - incremental_input, model, call_type="current_incremental" + incremental_input, model, + call_type="current_incremental", + prompt_type="incremental", ) if summary_text: self._current_summary_cache = CurrentSummaryCache( @@ -622,8 +859,14 @@ def _compress_current_with_cache( actions_text = self._render_steps_with_truncation( reduced_actions, fmt="action", max_tokens=self.config.max_summary_reduce_tokens ) - reduced_text = "Truncated action steps: " + actions_text - return reduced_text + fallback_text = ( + "[CONTEXT COMPACTION — REFERENCE ONLY] Some recent action steps were removed to free context space. 
" + "Continue based on the remaining steps below.\n\n" + f"Steps removed: {len(actions_to_compress) - len(reduced_actions)} of {len(actions_to_compress)}\n\n" + "Remaining steps:\n" + + actions_text + ) + return fallback_text def _actions_to_text(self, actions: List[ActionStep]) -> str: parts = [] @@ -755,17 +998,12 @@ def _action_fingerprint(action: ActionStep) -> str: # ============================================================ def _is_context_length_error(self, err: Exception) -> bool: - msg = str(err).lower() - return any(k in msg for k in ( - "context_length", "context length", "maximum context", "maximum context length", - "prompt is too long", "reduce the length", "too many tokens", - "token limit", "exceeds the maximum", "input is too long", - "input length", "exceeds context", "context window", - )) - - def _generate_summary(self, text: str, model, call_type: str = "summary") -> Optional[str]: + return _is_context_length_error(err) + + def _generate_summary(self, text: str, model, call_type: str = "summary", + prompt_type: str = "initial") -> Optional[str]: try: - return self._do_generate_summary(text, model, call_type) + return self._do_generate_summary(text, model, call_type, prompt_type) except Exception as e: if self._is_context_length_error(e): logger.warning(f"{call_type} exceeds context limit; retrying with 2/3 budget truncation") @@ -773,24 +1011,62 @@ def _generate_summary(self, text: str, model, call_type: str = "summary") -> Opt text, int(self.config.max_summary_input_tokens * 0.66) ) try: - return self._do_generate_summary(shrunk, model, call_type + "_retry") + return self._do_generate_summary(shrunk, model, call_type + "_retry", prompt_type) except Exception as e2: + self._record_failed_compression(call_type + "_retry_failed", str(e2)) logger.error(f"Retry still failed: {e2}") return None + self._record_failed_compression(call_type + "_failed", str(e)) logger.error(f"Summary generation exception: {e}") return None - def 
def _record_failed_compression(self, call_type: str, error_msg: str):
    """Log a compression attempt that produced no summary.

    Appends one zero-sized, non-cache-hit record carrying the error text to
    both the global call log and the per-step log, so failed attempts are
    still visible in the compression statistics.

    Args:
        call_type: Label of the failed call (callers pass e.g. a
            ``"..._failed"`` / ``"..._retry_failed"`` suffixed type).
        error_msg: String form of the exception that caused the failure.
    """
    failure = CompressionCallRecord(
        call_type=call_type,
        input_tokens=0,
        output_tokens=0,
        input_chars=0,
        output_chars=0,
        cache_hit=False,
        details={"error": error_msg},
    )
    for call_log in (self.compression_calls_log, self._step_local_log):
        call_log.append(failure)
# ============================================================
# Benchmark export APIs
# ============================================================

def build_compressed_snapshot(
    self, model, memory: "AgentMemory", current_run_start_idx: int,
) -> "Tuple[List[ChatMessage], dict]":
    """Build a frozen compressed message snapshot for probe evaluation.

    Runs ``compress_if_needed`` over the full message list rebuilt from
    ``memory`` and then restores both summary caches and both call logs to
    the values captured on entry, so repeated probes start from the same
    cache state. Only those four attributes are restored.

    Returns:
        ``(compressed_messages, metadata)`` where metadata holds
        ``token_counts``, ``summary`` and ``compression_stats`` captured
        before the state is rolled back.
    """
    cache_backup = (self._previous_summary_cache, self._current_summary_cache)
    step_log_backup = list(self._step_local_log)
    calls_log_backup = list(self.compression_calls_log)

    try:
        original_messages = memory.system_prompt.to_messages() if memory.system_prompt else []
        for step in memory.steps:
            original_messages.extend(step.to_messages())

        compressed_messages = self.compress_if_needed(
            model, memory, original_messages, current_run_start_idx
        )
        metadata = {
            "token_counts": self.get_token_counts(),
            "summary": self.export_summary(),
            "compression_stats": self.get_step_compression_stats(),
        }
        return compressed_messages, metadata
    finally:
        self._previous_summary_cache, self._current_summary_cache = cache_backup
        self._step_local_log = step_log_backup
        self.compression_calls_log = calls_log_backup


def get_token_counts(self) -> dict:
    """Return token counts from the most recent compression pass.

    Returns:
        Dict with ``last_uncompressed`` and ``last_compressed``; benchmarks
        derive ``token_reduction = 1 - compressed/uncompressed`` from it.
        Both values hold whatever the instance attributes currently contain
        (``None`` until something has recorded a count).
    """
    with self._lock:
        uncompressed = self._last_uncompressed_token_count
        compressed = self._last_compressed_token_count
    return {"last_uncompressed": uncompressed, "last_compressed": compressed}


def export_summary(self) -> dict:
    """Export current compression summary state for benchmark inspection.

    Returns:
        Dict with the cached summary texts, per-cache info (including an
        ``is_fallback`` flag set when the text carries the
        "[CONTEXT COMPACTION" marker) and a ``compression_boundary`` block
        built from the cache coverage and the configured keep_recent_*
        values.
    """
    fallback_marker = "[CONTEXT COMPACTION"
    with self._lock:
        prev_cache = self._previous_summary_cache
        curr_cache = self._current_summary_cache

    previous_info = None
    if prev_cache:
        previous_info = {
            "covered_pairs": prev_cache.covered_pairs,
            "is_fallback": fallback_marker in (prev_cache.summary_text or ""),
        }
    current_info = None
    if curr_cache:
        current_info = {
            "end_steps": curr_cache.end_steps,
            "is_fallback": fallback_marker in (curr_cache.summary_text or ""),
        }

    return {
        "previous_summary": prev_cache.summary_text if prev_cache else None,
        "current_summary": curr_cache.summary_text if curr_cache else None,
        "previous_cache_info": previous_info,
        "current_cache_info": current_info,
        "compression_boundary": {
            "config_keep_recent_pairs": self.config.keep_recent_pairs,
            "config_keep_recent_steps": self.config.keep_recent_steps,
            "previous_compressed_pairs": prev_cache.covered_pairs if prev_cache else 0,
            "previous_retained_pairs": self.config.keep_recent_pairs,
            "current_compressed_steps": curr_cache.end_steps if curr_cache else 0,
            "current_retained_steps": self.config.keep_recent_steps,
        },
    }


# ============================================================
# Context Component Management
# ============================================================

def register_component(self, component) -> None:
    """Register a context component for system prompt assembly.

    A component whose ``token_estimate`` is 0 gets it filled in from
    ``component.estimate_tokens(config.chars_per_token)`` before being stored.

    Args:
        component: Object exposing ``token_estimate``, ``estimate_tokens()``
            and ``to_messages()`` (e.g. a ContextComponent).
    """
    with self._lock:
        if component.token_estimate == 0:
            component.token_estimate = component.estimate_tokens(
                self.config.chars_per_token
            )
        self._components.append(component)


def clear_components(self) -> None:
    """Remove every registered context component."""
    with self._lock:
        self._components.clear()


def get_registered_components(self) -> List:
    """Return a shallow copy of the registered components."""
    with self._lock:
        return list(self._components)


def _get_strategy(self):
    """Instantiate the selection strategy named by ``config.strategy``.

    Unknown names fall back to ``TokenBudgetStrategy``.
    """
    from .agent_model import (
        FullStrategy, TokenBudgetStrategy, BufferedStrategy, PriorityWeightedStrategy
    )
    name = self.config.strategy
    strategy_class = {
        "full": FullStrategy,
        "token_budget": TokenBudgetStrategy,
        "buffered": BufferedStrategy,
        "priority": PriorityWeightedStrategy,
    }.get(name, TokenBudgetStrategy)

    if name == "buffered":
        return strategy_class(buffer_size=self.config.buffer_size_per_component)
    if name == "priority":
        return strategy_class(relevance_threshold=0.5)
    return strategy_class()


def build_system_prompt(self, token_budget: Optional[int] = None) -> List:
    """Build system prompt messages from registered components.

    Asks the configured strategy to select components for the budget, then
    concatenates their messages, dropping exact duplicates.

    Args:
        token_budget: Maximum tokens for all components. ``None`` means "use
            the configured component budgets (excluding conversation_history)";
            an explicit ``0`` is passed to the strategy as-is.

    Returns:
        List of message dicts with 'role' and 'content' keys; empty when no
        component is registered.
    """
    if not self._components:
        return []

    # `token_budget or ...` would silently replace an explicit 0 budget with
    # the full configured budget, so test for None instead.
    budget = (
        token_budget if token_budget is not None
        else self._calculate_component_budget()
    )
    selected = self._get_strategy().select_components(
        self._components, budget, self.config.component_budgets
    )

    messages = []
    for comp in selected:
        for msg in comp.to_messages():
            if not self._message_already_present(messages, msg):
                messages.append(msg)
    return messages


def _calculate_component_budget(self) -> int:
    """Calculate total token budget for components (excluding conversation_history)."""
    return sum(
        tokens
        for name, tokens in self.config.component_budgets.items()
        if name != "conversation_history"
    )


def _message_already_present(self, messages: List, new_msg: dict) -> bool:
    """Return True when a message with the same role and content is in ``messages``."""
    role, content = new_msg.get("role"), new_msg.get("content")
    return any(
        existing.get("role") == role and existing.get("content") == content
        for existing in messages
    )
description="Model provider identifier (e.g., openai, modelengine)", default=None ) + extra_body: Optional[Dict[str, Any]] = Field( + description=( + "Optional dict merged into every OpenAI-compatible " + "chat.completions.create request body. Used for provider-specific " + 'switches such as Qwen3 chat_template_kwargs={"enable_thinking": false}. ' + "Defaults to None so production behaviour is unchanged." + ), + default=None, + ) + max_tokens: Optional[int] = Field( + description=( + "Per-call completion output cap forwarded to chat.completions.create. " + "Defaults to None so production keeps the provider's own default " + "(typically the model's max output). Benchmarks set this explicitly " + "(e.g. 4096) to bound pathological generation loops where a model " + "regurgitates context." + ), + default=None, + ) timeout_seconds: Optional[float] = Field( description="Request timeout in seconds. If None, uses provider default.", default=None ) concurrency_limit: Optional[int] = Field( description="Maximum concurrent requests for this model. 
# =============================================================================
# Context Component System - Building blocks for system prompt assembly
# =============================================================================

ComponentType = Literal["system_prompt", "tools", "skills", "memory", "knowledge_base", "managed_agents", "external_a2a_agents"]


class ContextComponent(BaseModel, ABC):
    """Abstract base for all context components.

    Each component knows how to convert itself to LLM message format via
    to_messages(). Follows smolagents MemoryStep.to_messages() pattern.
    """
    component_type: ComponentType = Field(description="Type identifier for this component")
    priority: int = Field(description="Selection priority (higher = more important)", default=10)
    token_estimate: int = Field(description="Estimated token count", default=0)
    metadata: Dict[str, Any] = Field(description="Additional metadata", default_factory=dict)

    @abstractmethod
    def to_messages(self) -> List[Dict[str, str]]:
        """Convert component content to message format for LLM.

        Returns:
            List of message dicts with 'role' and 'content' keys.
        """
        pass

    def estimate_tokens(self, chars_per_token: float = 1.5) -> int:
        """Estimate token count from content length.

        Args:
            chars_per_token: Average characters per token ratio; must be > 0.

        Returns:
            Total content characters across to_messages() divided by
            ``chars_per_token``, truncated to an int. Messages with a
            missing or ``None`` content count as zero characters.

        Raises:
            ValueError: If ``chars_per_token`` is not positive.
        """
        if chars_per_token <= 0:
            # Fail with a clear message instead of ZeroDivisionError or a
            # negative estimate.
            raise ValueError("chars_per_token must be positive")
        total_chars = sum(len(m.get("content") or "") for m in self.to_messages())
        return int(total_chars / chars_per_token)


class SystemPromptComponent(ContextComponent):
    """System prompt component - base instructions for the agent."""
    component_type: ComponentType = Field(default="system_prompt")
    content: str = Field(description="Rendered system prompt content")
    template_name: Optional[str] = Field(description="Source template name", default=None)

    def to_messages(self) -> List[Dict[str, str]]:
        """Return the rendered prompt as a single system message."""
        return [{"role": "system", "content": self.content}]
a tool definition.""" + self.tools.append({ + "name": name, + "description": description, + "inputs": inputs, + "output_type": output_type + }) + + +class SkillsComponent(ContextComponent): + """Skill summaries component - available skills for the agent.""" + component_type: ComponentType = Field(default="skills") + skills: List[Dict[str, Any]] = Field(description="List of skill definitions", default_factory=list) + formatted_description: str = Field(description="Pre-formatted skill summaries text", default="") + + def to_messages(self) -> List[Dict[str, str]]: + if self.formatted_description: + return [{"role": "system", "content": self.formatted_description}] + return [] + + def add_skill(self, name: str, description: str, examples: List[str] = None) -> None: + """Add a skill definition.""" + self.skills.append({ + "name": name, + "description": description, + "examples": examples or [] + }) + + +class MemoryComponent(ContextComponent): + """Memory context component - long-term memory (mem0) search results.""" + component_type: ComponentType = Field(default="memory") + memories: List[Dict[str, Any]] = Field(description="Memory search results", default_factory=list) + formatted_content: str = Field(description="Pre-formatted memory context text", default="") + search_query: Optional[str] = Field(description="Query used to search memory", default=None) + + def to_messages(self) -> List[Dict[str, str]]: + if self.formatted_content: + return [{"role": "system", "content": self.formatted_content}] + return [] + + def add_memory(self, content: str, memory_type: str = "user", metadata: Dict[str, Any] = None) -> None: + """Add a memory entry.""" + self.memories.append({ + "content": content, + "memory_type": memory_type, + "metadata": metadata or {} + }) + + +class KnowledgeBaseComponent(ContextComponent): + """Knowledge base component - KB summary context.""" + component_type: ComponentType = Field(default="knowledge_base") + summary: str = Field(description="Knowledge 
base summary text", default="") + kb_ids: List[str] = Field(description="Knowledge base IDs used", default_factory=list) + + def to_messages(self) -> List[Dict[str, str]]: + if self.summary: + return [{"role": "system", "content": self.summary}] + return [] + + +class ManagedAgentsComponent(ContextComponent): + """Managed agents component - internal sub-agent definitions.""" + component_type: ComponentType = Field(default="managed_agents") + agents: List[Dict[str, Any]] = Field(description="Managed agent definitions", default_factory=list) + formatted_description: str = Field(description="Pre-formatted agent descriptions", default="") + + def to_messages(self) -> List[Dict[str, str]]: + if self.formatted_description: + return [{"role": "system", "content": self.formatted_description}] + return [] + + def add_agent(self, name: str, description: str, tools: List[str] = None) -> None: + """Add a managed agent definition.""" + self.agents.append({ + "name": name, + "description": description, + "tools": tools or [] + }) + + +class ExternalAgentsComponent(ContextComponent): + """External A2A agents component - external agent definitions.""" + component_type: ComponentType = Field(default="external_a2a_agents") + agents: List[Dict[str, Any]] = Field(description="External A2A agent definitions", default_factory=list) + formatted_description: str = Field(description="Pre-formatted agent descriptions", default="") + + def to_messages(self) -> List[Dict[str, str]]: + if self.formatted_description: + return [{"role": "system", "content": self.formatted_description}] + return [] + + def add_agent(self, agent_id: str, name: str, description: str, url: str) -> None: + """Add an external A2A agent definition.""" + self.agents.append({ + "agent_id": agent_id, + "name": name, + "description": description, + "url": url + }) + + +# ============================================================================= +# Context Strategy System - Pluggable component selection algorithms +# 
============================================================================= + +class ContextStrategy(ABC): + """Abstract base for context component selection strategies.""" + + @abstractmethod + def select_components( + self, + components: List[ContextComponent], + token_budget: int, + component_budgets: Dict[str, int] + ) -> List[ContextComponent]: + """Select components to include within constraints. + + Args: + components: All available context components. + token_budget: Maximum total tokens allowed. + component_budgets: Per-type token limits. + + Returns: + Selected components in priority order. + """ + pass + + @abstractmethod + def get_strategy_name(self) -> str: + """Return strategy identifier.""" + pass + + +class FullStrategy(ContextStrategy): + """Keep all components - for unlimited context models.""" + + def select_components( + self, + components: List[ContextComponent], + token_budget: int, + component_budgets: Dict[str, int] + ) -> List[ContextComponent]: + return sorted(components, key=lambda c: c.priority, reverse=True) + + def get_strategy_name(self) -> str: + return "full" + + +class TokenBudgetStrategy(ContextStrategy): + """Select components within total token budget by priority.""" + + def select_components( + self, + components: List[ContextComponent], + token_budget: int, + component_budgets: Dict[str, int] + ) -> List[ContextComponent]: + sorted_components = sorted(components, key=lambda c: c.priority, reverse=True) + selected: List[ContextComponent] = [] + total_tokens = 0 + type_totals: Dict[str, int] = {} + + for comp in sorted_components: + comp_tokens = comp.token_estimate or comp.estimate_tokens() + comp_budget = component_budgets.get(comp.component_type, token_budget) + current_type_total = type_totals.get(comp.component_type, 0) + + fits_total = total_tokens + comp_tokens <= token_budget + fits_type = current_type_total + comp_tokens <= comp_budget + + if fits_total and fits_type: + selected.append(comp) + total_tokens += 
comp_tokens + type_totals[comp.component_type] = current_type_total + comp_tokens + else: + # Surface the drop so operators can see when the prompt is + # being silently truncated by budget pressure. Identifying + # which constraint tripped (global vs per-type) is the most + # useful detail when tuning component_budgets. + reason = ( + "total_budget" + if not fits_total else "type_budget" + ) + logger.warning( + "TokenBudgetStrategy dropped component type=%s priority=%d " + "tokens=%d reason=%s (total %d/%d, type %d/%d)", + comp.component_type, comp.priority, comp_tokens, reason, + total_tokens, token_budget, + current_type_total, comp_budget, + ) + + return selected + + def get_strategy_name(self) -> str: + return "token_budget" + + +class BufferedStrategy(ContextStrategy): + """Keep last N components per type.""" + + def __init__(self, buffer_size: int = 10): + self.buffer_size = buffer_size + + def select_components( + self, + components: List[ContextComponent], + token_budget: int, + component_budgets: Dict[str, int] + ) -> List[ContextComponent]: + type_buckets: Dict[str, List[ContextComponent]] = {} + + for comp in components: + type_buckets.setdefault(comp.component_type, []).append(comp) + + selected: List[ContextComponent] = [] + for comp_type, bucket in type_buckets.items(): + recent = bucket[-self.buffer_size:] + dropped = len(bucket) - len(recent) + if dropped > 0: + logger.warning( + "BufferedStrategy dropped %d component(s) of type=%s " + "(buffer_size=%d, total=%d)", + dropped, comp_type, self.buffer_size, len(bucket), + ) + selected.extend(recent) + + return sorted(selected, key=lambda c: c.priority, reverse=True) + + def get_strategy_name(self) -> str: + return "buffered" + + +class PriorityWeightedStrategy(ContextStrategy): + """Select by weighted importance + relevance scores.""" + + def __init__(self, relevance_threshold: float = 0.5): + self.relevance_threshold = relevance_threshold + + def select_components( + self, + components: 
List[ContextComponent], + token_budget: int, + component_budgets: Dict[str, int] + ) -> List[ContextComponent]: + scored_components: List[Tuple[ContextComponent, float]] = [] + + for comp in components: + relevance = comp.metadata.get("relevance_score", 1.0) + score = comp.priority * 0.7 + relevance * 0.3 * 100 + if relevance >= self.relevance_threshold: + scored_components.append((comp, score)) + else: + logger.warning( + "PriorityWeightedStrategy dropped component type=%s " + "priority=%d relevance=%.3f str: + return "priority" + + + + + AgentConfig.model_rebuild() -AgentRunInfo.model_rebuild() diff --git a/sdk/nexent/core/agents/core_agent.py b/sdk/nexent/core/agents/core_agent.py index abf8c1a26..88462b3f7 100644 --- a/sdk/nexent/core/agents/core_agent.py +++ b/sdk/nexent/core/agents/core_agent.py @@ -418,6 +418,21 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: observation += "Last output from code snippet:\n" + truncated_output memory_step.observations = observation + # Pre-truncate observations when ContextManager is enabled. Keeps the + # head + tail of long outputs around a truncation marker so downstream + # compression sees bounded-length step records and the model can still + # search/read for the elided portion. + if self.context_manager and self.context_manager.config.enabled: + max_obs = self.context_manager.config.max_observation_length + if max_obs > 0 and memory_step.observations and len(memory_step.observations) > max_obs: + obs_text = memory_step.observations + half = max_obs // 2 + truncation_marker = ( + f"\n...[Output truncated to {max_obs} characters. 
" + f"Use search or read tools to find specific results.]\n" + ) + memory_step.observations = obs_text[:half] + truncation_marker + obs_text[-half:] + if not code_output.is_final_answer and truncated_output is not None: execution_outputs_console += [ Text( @@ -458,8 +473,16 @@ def run(self, task: str, stream: bool = False, reset: bool = True, images: Optio You have been provided with these additional arguments, that you can access using the keys as variables in your python code: {str(additional_args)}.""" + system_prompt_content = self.system_prompt + if self.context_manager and self.context_manager.get_registered_components(): + component_messages = self.context_manager.build_system_prompt() + if component_messages: + system_prompt_content = "\n\n".join( + msg.get("content", "") for msg in component_messages if msg.get("role") == "system" + ) + self.memory.system_prompt = SystemPromptStep( - system_prompt=self.system_prompt) + system_prompt=system_prompt_content) if reset: self.memory.reset() self.monitor.reset() diff --git a/sdk/nexent/core/agents/nexent_agent.py b/sdk/nexent/core/agents/nexent_agent.py index 2ccf1d72a..a5efcbe82 100644 --- a/sdk/nexent/core/agents/nexent_agent.py +++ b/sdk/nexent/core/agents/nexent_agent.py @@ -184,6 +184,8 @@ def create_model(self, model_cite_name: str): ssl_verify=model_config.ssl_verify if model_config.ssl_verify is not None else True, model_factory=model_config.model_factory, display_name=model_config.cite_name, +extra_body=model_config.extra_body, + max_tokens=model_config.max_tokens, timeout_seconds=model_config.timeout_seconds, ) model.stop_event = self.stop_event @@ -423,6 +425,10 @@ def create_single_agent(self, agent_config: AgentConfig): config=ctx_config, max_steps=agent_config.max_steps ) + context_components = getattr(agent_config, 'context_components', None) + if context_components: + for component in context_components: + agent.context_manager.register_component(component) return agent except Exception as e: diff 
--git a/sdk/nexent/core/agents/summary_config.py b/sdk/nexent/core/agents/summary_config.py index a2a3e37dd..e271ddd34 100644 --- a/sdk/nexent/core/agents/summary_config.py +++ b/sdk/nexent/core/agents/summary_config.py @@ -1,12 +1,22 @@ -"""Configuration for agent context compression.""" +"""Configuration for agent context management and compression.""" from dataclasses import dataclass, field -from typing import Any, Dict +from typing import Any, Dict, Literal + + +StrategyType = Literal["full", "token_budget", "buffered", "priority"] @dataclass class ContextManagerConfig: - """Configuration for ContextManager compression behavior.""" + """Configuration for ContextManager - handles ALL context building. + + Extends existing compression config with: + - Strategy selection for component selection algorithms + - Injection flags to enable/disable individual context components + - Per-component token budgets for fine-grained control + """ + # === Compression Settings (existing) === enabled: bool = False token_threshold: int = 10000 keep_recent_steps: int = 4 @@ -21,6 +31,21 @@ class ContextManagerConfig: "pending items, and context to preserve. Output strict JSON format without markdown blocks." ) + # Separate prompt for INCREMENTAL summary updates ("here is the previous + # summary + new turns; produce an updated summary"). When empty the + # incremental compression path falls back to summary_system_prompt for + # backwards compatibility. + incremental_summary_system_prompt: str = ( + "You are a conversation summarization assistant updating an existing " + "structured summary. The input has two sections: '## Previous Summary' " + "(the prior compaction) and '## New Conversations' or '## New Steps' " + "(turns that occurred after the prior compaction). 
Produce an updated " + "JSON summary that PRESERVES information from the previous summary " + "(do not drop it unless clearly obsolete), MERGES the new turns into " + "the appropriate fields, and KEEPS the same JSON schema. Do not include " + "narration outside the JSON. No markdown code blocks." + ) + summary_json_schema: Dict[str, Any] = field(default_factory=lambda: { "task_overview": "User's core request and success criteria (<=150 words)", "completed_work": "Work completed, files or results produced (<=200 words)", @@ -32,4 +57,65 @@ class ContextManagerConfig: max_summary_input_tokens: int = 0 max_summary_reduce_tokens: int = 0 estimated_chunk_summary_tokens: int = 400 - chars_per_token: float = 1.5 \ No newline at end of file + chars_per_token: float = 1.5 + + # Pre-truncate single observations (model/tool outputs) longer than this + # character limit at execute_action time, before they reach memory. + # 0 = disabled (production default). Only takes effect when ``enabled`` + # is True, so production callers that do not opt in see no behaviour + # change. + max_observation_length: int = 0 + + # === NEW: Strategy Selection === + strategy: StrategyType = "token_budget" + """Context component selection strategy. 
+ + Options: + - 'full': Keep all components (for unlimited context models) + - 'token_budget': Select components within token budget by priority + - 'buffered': Keep last N components per type + - 'priority': Weight by importance + relevance scores + """ + + # === NEW: Component Injection Flags === + inject_system_prompt: bool = True + """Whether to inject system prompt into context.""" + + inject_tools: bool = True + """Whether to inject tool descriptions into system prompt.""" + + inject_skills: bool = True + """Whether to inject skill summaries into system prompt.""" + + inject_memory: bool = True + """Whether to search and inject long-term memory (mem0) into system prompt.""" + + inject_knowledge_base: bool = True + """Whether to inject knowledge base summaries into system prompt.""" + + inject_agent_definitions: bool = True + """Whether to inject sub-agent (managed_agents + external_a2a_agents) definitions.""" + + inject_app_context: bool = True + """Whether to inject APP_NAME, APP_DESCRIPTION, time, user_id.""" + + # === NEW: Per-Component Token Budgets === + component_budgets: Dict[str, int] = field(default_factory=lambda: { + "system_prompt": 4000, + "tools": 3000, + "skills": 1000, + "memory": 2000, + "knowledge_base": 1500, + "managed_agents": 500, + "external_a2a_agents": 500, + "conversation_history": 4000, # Reserved for conversation compression + }) + """Token budget for each context component type. + + Used by token_budget strategy to allocate tokens across components. + Total of all budgets should not exceed token_threshold. 
+ """ + + # === NEW: Buffered Strategy Settings === + buffer_size_per_component: int = 10 + """Number of items to keep per component type for 'buffered' strategy.""" \ No newline at end of file diff --git a/sdk/nexent/core/models/openai_llm.py b/sdk/nexent/core/models/openai_llm.py index 8e4833394..a9127595c 100644 --- a/sdk/nexent/core/models/openai_llm.py +++ b/sdk/nexent/core/models/openai_llm.py @@ -25,8 +25,11 @@ class OpenAIModel(OpenAIServerModel): def __init__(self, observer: MessageObserver = MessageObserver, temperature=0.2, top_p=0.95, - ssl_verify=True, timeout_seconds: Optional[float] = None, model_factory: Optional[str] = None, - display_name: Optional[str] = None, *args, **kwargs): +ssl_verify=True, model_factory: Optional[str] = None, + display_name: Optional[str] = None, + extra_body: Optional[Dict[str, Any]] = None, + max_tokens: Optional[int] = None, + timeout_seconds: Optional[float] = None, *args, **kwargs): """ Initialize OpenAI Model with observer and SSL verification option. @@ -39,6 +42,13 @@ def __init__(self, observer: MessageObserver = MessageObserver, temperature=0.2, timeout_seconds: Timeout in seconds for HTTP requests (default: None, uses client default). model_factory: Provider identifier (e.g., openai, modelengine) display_name: Human-readable display name for monitoring + extra_body: Optional dict merged into every chat.completions.create + request body. Defaults to None so production behaviour + is unchanged for callers that do not opt in. + max_tokens: Per-call completion output cap. Defaults to None so + production keeps the provider default (unbounded / + model max). Benchmarks set this explicitly (e.g. 4096) + to bound degenerate generation loops on long contexts. 
*args: Additional positional arguments for OpenAIServerModel **kwargs: Additional keyword arguments for OpenAIServerModel """ @@ -49,6 +59,8 @@ def __init__(self, observer: MessageObserver = MessageObserver, temperature=0.2, self._monitoring = get_monitoring_manager() self.model_factory = (model_factory or "").lower() self.display_name = display_name + self.extra_body = extra_body or None + self.max_tokens = max_tokens # Create http_client based on ssl_verify parameter and timeout if not ssl_verify or timeout_seconds is not None: @@ -160,6 +172,17 @@ def __call__(self, messages: List[Dict[str, Any]], stop_sequences: Optional[List completion_kwargs["stream_options"] = {"include_usage": True} + # Provider-specific extras (e.g. Qwen3 chat_template_kwargs) - only + # set when the caller actually supplied something so default OpenAI + # behaviour is unchanged for everyone else. + if self.extra_body: + completion_kwargs["extra_body"] = self.extra_body + + # Bound completion length unless the caller passed their own override + # via kwargs (which already landed in completion_kwargs above). 
+ if self.max_tokens is not None and "max_tokens" not in completion_kwargs: + completion_kwargs["max_tokens"] = self.max_tokens + current_request = self.client.chat.completions.create( stream=True, **completion_kwargs) diff --git a/test/backend/agents/test_create_agent_info.py b/test/backend/agents/test_create_agent_info.py index 20340f2ea..1cb6195ef 100644 --- a/test/backend/agents/test_create_agent_info.py +++ b/test/backend/agents/test_create_agent_info.py @@ -78,13 +78,22 @@ class ToolExecutionException(Exception): # Utilities --------------------------------------------------------------- def _create_stub_module(name: str, **attrs): - """Return a lightweight module stub with the provided attributes.""" module = types.ModuleType(name) - for attr_name, attr_value in attrs.items(): - setattr(module, attr_name, attr_value) + module.__dict__.update(attrs) + sys.modules[name] = module return module +def _create_stub_component_class(name: str): + class StubComponent: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + self.component_type = name.lower().replace("component", "") + StubComponent.__name__ = name + return StubComponent + + # Configure required constants via shared bootstrap env consts_const.MINIO_ENDPOINT = "http://localhost:9000" consts_const.MINIO_ACCESS_KEY = "test_access_key" @@ -106,10 +115,13 @@ def _create_stub_module(name: str, **attrs): utils_mock = MagicMock() utils_mock.auth_utils = MagicMock() utils_mock.auth_utils.get_current_user_id = MagicMock(return_value=("test_user_id", "test_tenant_id")) +utils_mock.context_utils = MagicMock() +utils_mock.context_utils.build_context_components = MagicMock(return_value=[]) # Add the mocked utils module to sys.modules sys.modules['utils'] = utils_mock sys.modules['utils.auth_utils'] = utils_mock.auth_utils +sys.modules['utils.context_utils'] = utils_mock.context_utils # Provide a stub for the `boto3` module so that it can be imported safely even # if the testing 
environment does not have it available. @@ -148,6 +160,14 @@ def _create_stub_module(name: str, **attrs): ExternalA2AAgentConfig=MagicMock(), AgentRunInfo=MagicMock(), MessageObserver=MagicMock(), + ContextComponent=_create_stub_component_class("ContextComponent"), + ToolsComponent=_create_stub_component_class("ToolsComponent"), + SkillsComponent=_create_stub_component_class("SkillsComponent"), + MemoryComponent=_create_stub_component_class("MemoryComponent"), + KnowledgeBaseComponent=_create_stub_component_class("KnowledgeBaseComponent"), + ManagedAgentsComponent=_create_stub_component_class("ManagedAgentsComponent"), + ExternalAgentsComponent=_create_stub_component_class("ExternalAgentsComponent"), + SystemPromptComponent=_create_stub_component_class("SystemPromptComponent"), ) sys.modules['nexent.core.agents.agent_context'] = _create_stub_module( "nexent.core.agents.agent_context", @@ -1456,13 +1476,14 @@ async def test_create_agent_config_basic(self): name="test_agent", description="test description", prompt_templates={"system_prompt": "populated_system_prompt"}, - tools=[], + tools=ANY, max_steps=5, model_name="test_model", provide_run_summary=True, managed_agents=[], external_a2a_agents=[], - context_manager_config=ANY + context_manager_config=ANY, + context_components=ANY ) @pytest.mark.asyncio @@ -1524,13 +1545,14 @@ async def test_create_agent_config_with_sub_agents(self): description="test description", prompt_templates={ "system_prompt": "populated_system_prompt"}, - tools=[], + tools=ANY, max_steps=5, model_name="test_model", provide_run_summary=True, managed_agents=[mock_sub_agent_config], external_a2a_agents=[], - context_manager_config=ANY + context_manager_config=ANY, + context_components=ANY ) @pytest.mark.asyncio @@ -1720,18 +1742,18 @@ async def test_create_agent_config_model_id_none(self): result = await create_agent_config("agent_1", "tenant_1", "user_1", "zh", "test query") - # Verify that AgentConfig was called with "main_model" as fallback 
mock_agent_config.assert_called_with( name="test_agent", description="test description", prompt_templates={"system_prompt": "populated_system_prompt"}, - tools=[], + tools=ANY, max_steps=5, - model_name="main_model", # Should fallback to "main_model" + model_name="main_model", provide_run_summary=True, managed_agents=[], external_a2a_agents=[], - context_manager_config=ANY + context_manager_config=ANY, + context_components=ANY ) @pytest.mark.asyncio diff --git a/test/backend/utils/test_context_utils.py b/test/backend/utils/test_context_utils.py new file mode 100644 index 000000000..66e789477 --- /dev/null +++ b/test/backend/utils/test_context_utils.py @@ -0,0 +1,229 @@ +import pytest +import sys +from pathlib import Path + +TEST_ROOT = Path(__file__).resolve().parents[2] +PROJECT_ROOT = TEST_ROOT.parent + +for _path in (str(PROJECT_ROOT), str(TEST_ROOT)): + if _path not in sys.path: + sys.path.insert(0, _path) + + +class TestFormatFunctions: + def test_format_tools_empty(self): + from backend.utils.context_utils import _format_tools_description + result = _format_tools_description({}, language="zh") + assert result == "- 当前没有可用的工具" + + def test_format_tools_single(self): + from backend.utils.context_utils import _format_tools_description + class MockTool: + name = "search" + description = "Search tool" + inputs = '{"query": "str"}' + output_type = "string" + source = "local" + result = _format_tools_description({"search": MockTool()}, language="zh") + assert "search" in result + assert "Search tool" in result + + def test_format_skills_empty(self): + from backend.utils.context_utils import _format_skills_description + result = _format_skills_description([], language="zh") + assert result == "" + + def test_format_skills_single(self): + from backend.utils.context_utils import _format_skills_description + skills = [{"name": "skill1", "description": "Test skill"}] + result = _format_skills_description(skills, language="zh") + assert "skill1" in result + assert "Test 
skill" in result + + def test_format_memory_empty(self): + from backend.utils.context_utils import _format_memory_context + result = _format_memory_context([], language="zh") + assert result == "" + + def test_format_memory_dict(self): + from backend.utils.context_utils import _format_memory_context + memory = [{"memory": "test memory", "memory_level": "user", "score": 0.9}] + result = _format_memory_context(memory, language="zh") + assert "test memory" in result + + def test_format_memory_string(self): + from backend.utils.context_utils import _format_memory_context + memory = [{"memory": "simple string", "memory_level": "user", "score": 0.5}] + result = _format_memory_context(memory, language="zh") + assert "simple string" in result + + def test_format_managed_agents_empty(self): + from backend.utils.context_utils import _format_managed_agents_description + result = _format_managed_agents_description({}, language="zh") + assert result == "" + + def test_format_managed_agents_single(self): + from backend.utils.context_utils import _format_managed_agents_description + class MockAgent: + name = "research" + description = "Research assistant" + result = _format_managed_agents_description({"research": MockAgent()}, language="zh") + assert "research" in result + + def test_format_external_agents_empty(self): + from backend.utils.context_utils import _format_external_agents_description + result = _format_external_agents_description({}, language="zh") + assert result == "" + + def test_format_external_agents_single(self): + from backend.utils.context_utils import _format_external_agents_description + class MockAgent: + agent_id = "ext-1" + name = "External" + description = "External agent" + result = _format_external_agents_description({"ext-1": MockAgent()}, language="zh") + assert "External" in result + + +class TestBuildComponents: + def test_build_tools_component_empty(self): + from backend.utils.context_utils import build_tools_component + comp = 
build_tools_component({}, language="zh") + assert comp.tools == [] + + def test_build_tools_component_with_tools(self): + from backend.utils.context_utils import build_tools_component + class MockTool: + name = "tool" + description = "desc" + inputs = "{}" + output_type = "str" + source = "local" + comp = build_tools_component({"tool": MockTool()}, language="zh") + assert len(comp.tools) == 1 + + def test_build_skills_component_empty(self): + from backend.utils.context_utils import build_skills_component + comp = build_skills_component([], language="zh") + assert comp.skills == [] + + def test_build_skills_component_with_skills(self): + from backend.utils.context_utils import build_skills_component + comp = build_skills_component([{"name": "skill"}], language="zh") + assert len(comp.skills) == 1 + + def test_build_memory_component_empty(self): + from backend.utils.context_utils import build_memory_component + comp = build_memory_component([], language="zh") + assert comp.memories == [] + + def test_build_memory_component_with_search_query(self): + from backend.utils.context_utils import build_memory_component + comp = build_memory_component([], search_query="test query", language="zh") + assert comp.search_query == "test query" + + def test_build_knowledge_base_component_empty(self): + from backend.utils.context_utils import build_knowledge_base_component + comp = build_knowledge_base_component("") + assert comp.summary == "" + + def test_build_knowledge_base_component_with_summary(self): + from backend.utils.context_utils import build_knowledge_base_component + comp = build_knowledge_base_component("KB text", kb_ids=["kb-1"]) + assert comp.summary == "KB text" + + def test_build_managed_agents_component_empty(self): + from backend.utils.context_utils import build_managed_agents_component + comp = build_managed_agents_component({}, language="zh") + assert comp.agents == [] + + def test_build_external_agents_component_empty(self): + from backend.utils.context_utils 
class TestBuildContextComponents:
    """Tests for build_context_components() and build_app_context_string()."""

    @staticmethod
    def _base_kwargs(few_shots="Q?"):
        """Return the keyword arguments every build_context_components() call here shares."""
        return dict(
            duty="Help users.",
            constraint="Be helpful.",
            few_shots=few_shots,
            app_name="Test",
            app_description="Test",
            time_str="2026-01-01",
            user_id="test",
            language="zh",
            is_manager=False,
        )

    @staticmethod
    def _single_tool_mapping():
        """Return a one-entry tools mapping holding a minimal tool stand-in."""

        class MockTool:
            name = "tool"
            description = "desc"
            inputs = "{}"
            output_type = "str"
            source = "local"

        return {"tool": MockTool()}

    def test_empty_inputs_produces_skeleton(self):
        from backend.utils.context_utils import build_context_components

        built = build_context_components(
            **self._base_kwargs(few_shots="Q: hi?\nA: Hello!")
        )
        assert "system_prompt" in [item.component_type for item in built]

    def test_with_tools_only(self):
        from backend.utils.context_utils import build_context_components

        built = build_context_components(
            tools=self._single_tool_mapping(),
            **self._base_kwargs(),
        )
        assert "tools" in [item.component_type for item in built]

    def test_include_flags_skip_tools(self):
        from backend.utils.context_utils import build_context_components

        # Tools are supplied but switched off through the include flag.
        built = build_context_components(
            tools=self._single_tool_mapping(),
            include_tools=False,
            **self._base_kwargs(),
        )
        assert "tools" not in [item.component_type for item in built]

    def test_app_context_string(self):
        from backend.utils.context_utils import build_app_context_string

        rendered = build_app_context_string("Nexent", "Platform", "user-1")
        for fragment in ("Nexent", "Platform", "user-1"):
            assert fragment in rendered
def _load_agent_model():
    """Load agent_model.py containing ContextComponent and ContextStrategy classes.

    The module is loaded straight from its file and registered in
    ``sys.modules`` under ``sdk.nexent.core.agents.agent_model``. If that
    name is already registered, the existing entry is returned untouched.

    Returns:
        The module object registered under the name above.

    Raises:
        ImportError: If no usable import spec/loader can be built for the
            located file.
        Exception: Whatever executing the module raises; the partially
            initialised module is removed from ``sys.modules`` first so a
            later call retries instead of returning a broken module.
    """
    module_name = "sdk.nexent.core.agents.agent_model"
    if module_name in sys.modules:
        return sys.modules[module_name]

    target = _locate_module("agent_model")
    spec = importlib.util.spec_from_file_location(module_name, target)
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot build an import spec for {module_name} from {target}",
            name=module_name,
        )

    module = importlib.util.module_from_spec(spec)
    module.__package__ = "sdk.nexent.core.agents"
    # Register before executing, as the importlib "import a source file
    # directly" recipe does.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-executed module behind: the early return above
        # would otherwise serve it to every later caller.
        sys.modules.pop(module_name, None)
        raise
    return module
+restore_real_smolagents() + +# ── 6. Re-export public names (mirrors original monolithic imports) ── ContextManager = _ctx_mod.ContextManager ContextManagerConfig = _ctx_mod.ContextManagerConfig @@ -251,4 +287,22 @@ def _load_agent_context(): ChatMessage = _ctx_mod.ChatMessage MessageRole = _ctx_mod.MessageRole CompressionCallRecord = _ctx_mod.CompressionCallRecord + +# Export ContextComponent classes +ContextComponent = _agent_model_mod.ContextComponent +SystemPromptComponent = _agent_model_mod.SystemPromptComponent +ToolsComponent = _agent_model_mod.ToolsComponent +SkillsComponent = _agent_model_mod.SkillsComponent +MemoryComponent = _agent_model_mod.MemoryComponent +KnowledgeBaseComponent = _agent_model_mod.KnowledgeBaseComponent +ManagedAgentsComponent = _agent_model_mod.ManagedAgentsComponent +ExternalAgentsComponent = _agent_model_mod.ExternalAgentsComponent + +# Export ContextStrategy classes +ContextStrategy = _agent_model_mod.ContextStrategy +FullStrategy = _agent_model_mod.FullStrategy +TokenBudgetStrategy = _agent_model_mod.TokenBudgetStrategy +BufferedStrategy = _agent_model_mod.BufferedStrategy +PriorityWeightedStrategy = _agent_model_mod.PriorityWeightedStrategy + from stubs import _SystemPromptStep as SystemPromptStep \ No newline at end of file diff --git a/test/sdk/core/agents/test_agent_context/stubs.py b/test/sdk/core/agents/test_agent_context/stubs.py index 799b8c3f9..41eb1917c 100644 --- a/test/sdk/core/agents/test_agent_context/stubs.py +++ b/test/sdk/core/agents/test_agent_context/stubs.py @@ -135,9 +135,6 @@ def register_smolagents_mocks() -> ModuleType: Idempotent: subsequent calls return the already-registered module. Returns the top-level mock module. 
def restore_real_smolagents() -> None:
    """
    Drop mock ``smolagents`` entries from ``sys.modules`` and try to import
    the real packages in their place.

    An entry counts as a mock when it has neither a ``__spec__`` nor a
    ``__file__``; anything else is left alone. Each of ``smolagents``,
    ``smolagents.memory``, ``smolagents.models`` and ``smolagents.agents`` is
    then imported again, and an ``ImportError`` from any of them is ignored.

    Objects that other modules already bound from the mocks are not touched;
    only the ``sys.modules`` entries change.
    """
    from importlib import import_module

    submodule_names = ("smolagents.memory", "smolagents.models", "smolagents.agents")

    def _looks_like_mock(candidate) -> bool:
        # Heuristic for mock: ModuleType without __spec__ and __file__.
        if candidate is None:
            return False
        if getattr(candidate, "__spec__", None) is not None:
            return False
        return not hasattr(candidate, "__file__")

    # Submodules are examined before the parent package.
    for dotted in (*submodule_names, "smolagents"):
        if _looks_like_mock(sys.modules.get(dotted)):
            del sys.modules[dotted]

    # Parent package first, then the submodules.
    for dotted in ("smolagents", *submodule_names):
        try:
            import_module(dotted)
        except ImportError:
            # Real smolagents may not have every submodule we mocked; tolerate.
            continue
class TestClearComponents:
    """Tests for clear_components() method."""

    def test_clear_removes_all_components(self):
        manager = ContextManager()
        for text in ("comp1", "comp2"):
            manager.register_component(MockComponent(content=text))

        manager.clear_components()

        assert manager.get_registered_components() == []

    def test_clear_on_empty_manager(self):
        # Clearing a manager that never had components must leave it empty.
        manager = ContextManager()

        manager.clear_components()

        assert manager.get_registered_components() == []

    def test_clear_allows_new_registration(self):
        manager = ContextManager()
        manager.register_component(MockComponent(content="old"))
        manager.clear_components()

        manager.register_component(MockComponent(content="new"))

        # Only the component registered after the clear may remain.
        assert len(manager.get_registered_components()) == 1
        assert manager.get_registered_components()[0]._content == "new"
class TestGetStrategy:
    """Tests for _get_strategy() method."""

    @staticmethod
    def _strategy_from(**config_fields):
        """Return the strategy chosen by a ContextManager.

        With no fields the manager is default-constructed; otherwise it is
        built from a ContextManagerConfig holding the given fields.
        """
        if not config_fields:
            return ContextManager()._get_strategy()
        return ContextManager(ContextManagerConfig(**config_fields))._get_strategy()

    def test_default_returns_token_budget_strategy(self):
        chosen = self._strategy_from()
        assert chosen.get_strategy_name() == "token_budget"

    def test_full_strategy(self):
        chosen = self._strategy_from(strategy="full")
        assert chosen.get_strategy_name() == "full"

    def test_buffered_strategy_with_custom_buffer_size(self):
        chosen = self._strategy_from(strategy="buffered", buffer_size_per_component=5)
        assert chosen.get_strategy_name() == "buffered"
        assert chosen.buffer_size == 5

    def test_priority_strategy(self):
        chosen = self._strategy_from(strategy="priority")
        assert chosen.get_strategy_name() == "priority"

    def test_unknown_strategy_defaults_to_token_budget(self):
        # An unrecognised strategy name is expected to fall back to the default.
        chosen = self._strategy_from(strategy="unknown")
        assert chosen.get_strategy_name() == "token_budget"
class TestCalculateComponentBudget:
    """Tests for _calculate_component_budget() method."""

    def test_excludes_conversation_history(self):
        manager = ContextManager()
        total = manager._calculate_component_budget()
        per_component = manager.config.component_budgets

        assert "conversation_history" in per_component
        counted = [
            amount
            for name, amount in per_component.items()
            if name != "conversation_history"
        ]
        assert total == sum(counted)

    def test_sum_of_non_excluded_budgets(self):
        manager = ContextManager()
        total = manager._calculate_component_budget()

        # Every budget key that is expected to count towards the total.
        counted_keys = (
            "system_prompt",
            "tools",
            "skills",
            "memory",
            "knowledge_base",
            "managed_agents",
            "external_a2a_agents",
        )
        per_component = manager.config.component_budgets
        assert total == sum(per_component[key] for key in counted_keys)
class TestComponentManagementWithConfig:
    """Tests for component management with custom ContextManagerConfig."""

    def test_strategy_selection_from_config(self):
        manager = ContextManager(ContextManagerConfig(strategy="full"))
        assert manager._get_strategy().get_strategy_name() == "full"

    def test_component_budgets_from_config(self):
        budgets = {"system_prompt": 2000, "tools": 1000, "conversation_history": 3000}
        manager = ContextManager(ContextManagerConfig(component_budgets=budgets))
        # 2000 + 1000: the conversation_history entry is expected to be left out.
        assert manager._calculate_component_budget() == 3000

    def test_chars_per_token_used_in_estimation(self):
        manager = ContextManager(ContextManagerConfig(chars_per_token=2.0))
        manager.register_component(MockComponent(content="test content"))

        stored = manager.get_registered_components()
        assert stored[0].token_estimate > 0
f3440e9dc..a0fcf0ff0 100644 --- a/test/sdk/core/agents/test_agent_context/unit/test_compress_with_cache_extra.py +++ b/test/sdk/core/agents/test_agent_context/unit/test_compress_with_cache_extra.py @@ -90,7 +90,7 @@ def test_P3_incremental_llm_none_falls_through_to_fresh(self): cm._previous_summary_cache = PreviousSummaryCache("old summary", 2, fp) call_count = [0] - def side_effect(text, model_, call_type="summary"): + def side_effect(text, model_, call_type="summary", prompt_type="initial"): call_count[0] += 1 if call_count[0] == 1: return None @@ -181,7 +181,7 @@ def test_C4_incremental_llm_none_falls_through_to_fresh(self): cm._current_summary_cache = CurrentSummaryCache("old summary", 2, fp) call_count = [0] - def side_effect(text, model_, call_type="summary"): + def side_effect(text, model_, call_type="summary", prompt_type="initial"): call_count[0] += 1 if call_count[0] == 1: return None @@ -232,7 +232,7 @@ def test_C6_fresh_llm_none_writes_none_to_cache(self): with patch.object(cm, '_generate_summary', return_value=None): result = cm._compress_current_with_cache(TaskStep(task="t"), actions, MagicMock()) - assert "Truncated" in result + assert "[CONTEXT COMPACTION" in result assert cm._current_summary_cache is None def test_C6_vs_previous_asymmetry(self): diff --git a/test/sdk/core/agents/test_context_component.py b/test/sdk/core/agents/test_context_component.py new file mode 100644 index 000000000..4b8b50ba4 --- /dev/null +++ b/test/sdk/core/agents/test_context_component.py @@ -0,0 +1,812 @@ +""" +Unit tests for sdk.nexent.core.agents ContextComponent and ContextStrategy classes. 
+ +This module tests: +- ContextComponent abstract base class +- SystemPromptComponent +- ToolsComponent +- SkillsComponent +- MemoryComponent +- KnowledgeBaseComponent +- ManagedAgentsComponent +- ExternalAgentsComponent +- ContextStrategy abstract base class +- FullStrategy +- TokenBudgetStrategy +- BufferedStrategy +- PriorityWeightedStrategy +- Extended ContextManagerConfig +""" +import os +import sys +import types +import importlib.util +from pathlib import Path +from types import ModuleType +from unittest.mock import MagicMock + +import pytest + +TEST_ROOT = Path(__file__).resolve().parents[2] +PROJECT_ROOT = TEST_ROOT.parent + +for _path in (str(PROJECT_ROOT), str(TEST_ROOT)): + if _path not in sys.path: + sys.path.insert(0, _path) + + +def _create_mock_smolagents(): + mock_smolagents = ModuleType("smolagents") + mock_smolagents.__dict__.update({}) + mock_smolagents.__path__ = [] + + mock_smolagents.ActionStep = MagicMock() + mock_smolagents.TaskStep = MagicMock() + mock_smolagents.Timing = MagicMock() + mock_smolagents.AgentText = MagicMock() + mock_smolagents.handle_agent_output_types = MagicMock() + + agents_mod = ModuleType("smolagents.agents") + for _name in ["CodeAgent", "populate_template", "handle_agent_output_types", "AgentError", "ActionOutput", "RunResult"]: + setattr(agents_mod, _name, MagicMock(name=f"smolagents.agents.{_name}")) + setattr(mock_smolagents, "agents", agents_mod) + + local_python_mod = ModuleType("smolagents.local_python_executor") + setattr(local_python_mod, "fix_final_answer_code", MagicMock(name="fix_final_answer_code")) + setattr(mock_smolagents, "local_python_executor", local_python_mod) + + memory_mod = ModuleType("smolagents.memory") + for _name in ["ActionStep", "ToolCall", "TaskStep", "SystemPromptStep", "PlanningStep", "FinalAnswerStep"]: + setattr(memory_mod, _name, MagicMock(name=f"smolagents.memory.{_name}")) + setattr(mock_smolagents, "memory", memory_mod) + + models_mod = ModuleType("smolagents.models") + 
setattr(models_mod, "ChatMessage", MagicMock(name="ChatMessage")) + setattr(models_mod, "MessageRole", MagicMock(name="MessageRole")) + setattr(models_mod, "CODEAGENT_RESPONSE_FORMAT", MagicMock(name="CODEAGENT_RESPONSE_FORMAT")) + setattr(models_mod, "OpenAIServerModel", MagicMock(name="OpenAIServerModel")) + setattr(mock_smolagents, "models", models_mod) + + monitoring_mod = ModuleType("smolagents.monitoring") + setattr(monitoring_mod, "LogLevel", MagicMock(name="LogLevel")) + setattr(monitoring_mod, "Timing", MagicMock(name="Timing")) + setattr(monitoring_mod, "YELLOW_HEX", MagicMock(name="YELLOW_HEX")) + setattr(monitoring_mod, "TokenUsage", MagicMock(name="TokenUsage")) + setattr(mock_smolagents, "monitoring", monitoring_mod) + + utils_mod = ModuleType("smolagents.utils") + for _name in ["AgentExecutionError", "AgentGenerationError", "AgentParsingError", + "AgentMaxStepsError", "truncate_content", "extract_code_from_text"]: + setattr(utils_mod, _name, MagicMock(name=f"smolagents.utils.{_name}")) + setattr(mock_smolagents, "utils", utils_mod) + + tools_mod = ModuleType("smolagents.tools") + mock_tool_class = MagicMock() + mock_tool_class.from_langchain = MagicMock() + setattr(tools_mod, "Tool", mock_tool_class) + setattr(mock_smolagents, "tools", tools_mod) + + return mock_smolagents + + +def _create_mock_modules(): + mock_smolagents = _create_mock_smolagents() + + mock_rich_console = ModuleType("rich.console") + mock_rich_text = ModuleType("rich.text") + mock_rich = ModuleType("rich") + setattr(mock_rich, "Group", MagicMock(side_effect=lambda *args: args)) + setattr(mock_rich_text, "Text", MagicMock()) + setattr(mock_rich, "console", mock_rich_console) + setattr(mock_rich, "text", mock_rich_text) + setattr(mock_rich_console, "Group", MagicMock(side_effect=lambda *args: args)) + + mock_jinja2 = ModuleType("jinja2") + setattr(mock_jinja2, "Template", MagicMock()) + setattr(mock_jinja2, "StrictUndefined", MagicMock()) + + mock_langchain_core = 
ModuleType("langchain_core") + mock_langchain_core.tools = ModuleType("langchain_core.tools") + setattr(mock_langchain_core.tools, "BaseTool", MagicMock()) + + mock_exa_py = ModuleType("exa_py") + setattr(mock_exa_py, "Exa", MagicMock()) + + mock_openai = ModuleType("openai") + mock_openai.types = ModuleType("openai.types") + mock_openai.types.chat = ModuleType("openai.types.chat") + setattr(mock_openai.types.chat, "chat_completion_message", MagicMock()) + setattr(mock_openai.types.chat, "chat_completion_message_param", MagicMock()) + + class ProcessType: + STEP_COUNT = "STEP_COUNT" + PARSE = "PARSE" + EXECUTION_LOGS = "EXECUTION_LOGS" + AGENT_NEW_RUN = "AGENT_NEW_RUN" + AGENT_FINISH = "AGENT_FINISH" + FINAL_ANSWER = "FINAL_ANSWER" + ERROR = "ERROR" + OTHER = "OTHER" + SEARCH_CONTENT = "SEARCH_CONTENT" + TOKEN_COUNT = "TOKEN_COUNT" + PICTURE_WEB = "PICTURE_WEB" + CARD = "CARD" + TOOL = "TOOL" + MEMORY_SEARCH = "MEMORY_SEARCH" + MODEL_OUTPUT_DEEP_THINKING = "MODEL_OUTPUT_DEEP_THINKING" + MODEL_OUTPUT_THINKING = "MODEL_OUTPUT_THINKING" + MODEL_OUTPUT_CODE = "MODEL_OUTPUT_CODE" + + class MessageObserver: + def __init__(self): + self.messages = [] + self.add_message = MagicMock() + + def add_message(self, agent_name=None, process_type=None, content=None): + self.messages.append({ + "agent_name": agent_name, + "process_type": process_type, + "content": content + }) + + mock_observer = ModuleType("sdk.nexent.core.utils.observer") + setattr(mock_observer, "MessageObserver", MessageObserver) + setattr(mock_observer, "ProcessType", ProcessType) + + mock_tools_common_message_module = ModuleType("nexent.core.utils.tools_common_message") + + mock_botocore_module = ModuleType("botocore") + mock_botocore_exceptions = ModuleType("botocore.exceptions") + mock_botocore_exceptions.ClientError = MagicMock() + mock_botocore_module.exceptions = mock_botocore_exceptions + mock_botocore_client = ModuleType("botocore.client") + mock_botocore_client.Config = MagicMock() + 
mock_botocore_args = ModuleType("botocore.args") + mock_botocore_args.ClientArgsCreator = MagicMock() + mock_botocore_regions = ModuleType("botocore.regions") + mock_botocore_regions.EndpointResolverBuiltins = MagicMock() + mock_botocore_crt = ModuleType("botocore.crt") + mock_botocore_crt.CRT_SUPPORTED_AUTH_TYPES = [] + + mock_a2a_agent_proxy = ModuleType("sdk.nexent.core.agents.a2a_agent_proxy") + mock_a2a_agent_proxy_class = MagicMock() + setattr(mock_a2a_agent_proxy, "A2AAgentInfo", mock_a2a_agent_proxy_class) + + return { + "smolagents": mock_smolagents, + "smolagents.agents": mock_smolagents.agents, + "smolagents.memory": mock_smolagents.memory, + "smolagents.models": mock_smolagents.models, + "smolagents.monitoring": mock_smolagents.monitoring, + "smolagents.utils": mock_smolagents.utils, + "smolagents.local_python_executor": mock_smolagents.local_python_executor, + "smolagents.tools": mock_smolagents.tools, + "rich.console": mock_rich_console, + "rich.text": mock_rich_text, + "rich": mock_rich, + "jinja2": mock_jinja2, + "langchain_core": mock_langchain_core, + "langchain_core.tools": mock_langchain_core.tools, + "exa_py": mock_exa_py, + "openai": mock_openai, + "openai.types": mock_openai.types, + "openai.types.chat": mock_openai.types.chat, + "sdk.nexent.core.utils.observer": mock_observer, + "sdk.nexent.core.utils.observer.MessageObserver": MessageObserver, + "sdk.nexent.core.utils.observer.ProcessType": ProcessType, + "nexent.core.utils.observer": mock_observer, + "nexent.core.utils.tools_common_message": mock_tools_common_message_module, + "botocore": mock_botocore_module, + "botocore.client": mock_botocore_client, + "botocore.exceptions": mock_botocore_exceptions, + "botocore.args": mock_botocore_args, + "botocore.regions": mock_botocore_regions, + "botocore.crt": mock_botocore_crt, + "sdk.nexent.core.agents.a2a_agent_proxy": mock_a2a_agent_proxy, + "paramiko": MagicMock(), + "boto3": MagicMock(), + "tiktoken": MagicMock(), + "aiohttp": MagicMock(), + 
def _restore_real_modules() -> None:
    """
    Roll back every sys.modules entry this file installed at import time so
    sibling test trees (e.g. test_context_utils.py) can still import the
    real packages. Objects that were already bound from the mocks elsewhere
    are not touched; only the sys.modules entries change.

    Strategy, for every name in ``_module_mocks``:

    1. Drop the entry from sys.modules if it still looks like a mock
       (no ``__spec__`` and no ``__file__``).
    2. If the entry was dropped and a module was registered under that name
       before the mocks were installed (saved in ``_original_modules``), put
       that exact object back. This keeps module identity stable for code
       that imported the real package earlier, instead of executing the
       package a second time.
    3. Otherwise try to import the name; any failure is ignored because
       several mocked names do not exist as real packages.
    """
    import importlib

    injected_names = list(_module_mocks)

    for key in injected_names:
        mod = sys.modules.get(key)
        if mod is not None and getattr(mod, "__spec__", None) is None and not hasattr(mod, "__file__"):
            del sys.modules[key]

    for key in injected_names:
        if key not in sys.modules and key in _original_modules:
            # Reinstate the pre-mock module object rather than re-importing it.
            sys.modules[key] = _original_modules[key]
            continue
        try:
            importlib.import_module(key)
        except Exception:
            # Best effort: some mocked names (e.g. botocore.crt,
            # sdk.nexent.core.agents.a2a_agent_proxy) may not exist as real
            # packages, and importing a real package can raise errors other
            # than ImportError. Failing here must not break test collection.
            continue


+ assert comp.priority == 20 + assert comp.template_name is None + + def test_to_messages_returns_system_role(self): + comp = agent_model_module.SystemPromptComponent( + content="Test prompt content" + ) + messages = comp.to_messages() + assert len(messages) == 1 + assert messages[0]["role"] == "system" + assert messages[0]["content"] == "Test prompt content" + + def test_with_template_name(self): + comp = agent_model_module.SystemPromptComponent( + content="Rendered content", + template_name="managed_system_prompt_template_en.yaml" + ) + assert comp.template_name == "managed_system_prompt_template_en.yaml" + + def test_estimate_tokens(self): + comp = agent_model_module.SystemPromptComponent( + content="This is a test prompt with some words." + ) + tokens = comp.estimate_tokens(chars_per_token=1.5) + assert tokens > 0 + assert tokens == int(len("This is a test prompt with some words.") / 1.5) + + def test_default_priority(self): + comp = agent_model_module.SystemPromptComponent(content="test") + assert comp.priority == 10 + + +class TestToolsComponent: + """Tests for ToolsComponent.""" + + def test_creation_empty(self): + comp = agent_model_module.ToolsComponent() + assert comp.component_type == "tools" + assert comp.tools == [] + assert comp.formatted_description == "" + + def test_creation_with_tools(self): + comp = agent_model_module.ToolsComponent( + tools=[{"name": "search", "description": "Web search"}], + formatted_description="Available tools: search, calculator" + ) + assert len(comp.tools) == 1 + assert comp.formatted_description == "Available tools: search, calculator" + + def test_to_messages_with_formatted_description(self): + comp = agent_model_module.ToolsComponent( + formatted_description="Tool descriptions here" + ) + messages = comp.to_messages() + assert len(messages) == 1 + assert messages[0]["role"] == "system" + + def test_to_messages_empty_returns_empty_list(self): + comp = agent_model_module.ToolsComponent() + messages = comp.to_messages() + 
assert messages == [] + + def test_add_tool(self): + comp = agent_model_module.ToolsComponent() + comp.add_tool("web_search", "Search the web", '{"query": "str"}', "string") + assert len(comp.tools) == 1 + assert comp.tools[0]["name"] == "web_search" + assert comp.tools[0]["description"] == "Search the web" + + def test_add_multiple_tools(self): + comp = agent_model_module.ToolsComponent() + comp.add_tool("tool1", "desc1", "input1", "output1") + comp.add_tool("tool2", "desc2", "input2", "output2") + assert len(comp.tools) == 2 + + +class TestSkillsComponent: + """Tests for SkillsComponent.""" + + def test_creation_empty(self): + comp = agent_model_module.SkillsComponent() + assert comp.component_type == "skills" + assert comp.skills == [] + assert comp.formatted_description == "" + + def test_creation_with_skills(self): + comp = agent_model_module.SkillsComponent( + skills=[{"name": "coding", "description": "Write code"}], + formatted_description="Skills: coding, debugging" + ) + assert len(comp.skills) == 1 + + def test_to_messages_with_content(self): + comp = agent_model_module.SkillsComponent(formatted_description="Skill summaries") + messages = comp.to_messages() + assert len(messages) == 1 + assert messages[0]["role"] == "system" + + def test_to_messages_empty(self): + comp = agent_model_module.SkillsComponent() + assert comp.to_messages() == [] + + def test_add_skill(self): + comp = agent_model_module.SkillsComponent() + comp.add_skill("python_coding", "Write Python code", ["example1", "example2"]) + assert len(comp.skills) == 1 + assert comp.skills[0]["name"] == "python_coding" + assert comp.skills[0]["examples"] == ["example1", "example2"] + + def test_add_skill_without_examples(self): + comp = agent_model_module.SkillsComponent() + comp.add_skill("skill_name", "skill desc") + assert comp.skills[0]["examples"] == [] + + +class TestMemoryComponent: + """Tests for MemoryComponent.""" + + def test_creation_empty(self): + comp = 
agent_model_module.MemoryComponent() + assert comp.component_type == "memory" + assert comp.memories == [] + assert comp.formatted_content == "" + assert comp.search_query is None + + def test_creation_with_memories(self): + comp = agent_model_module.MemoryComponent( + memories=[{"content": "User prefers Python"}], + formatted_content="Memory context: user preferences", + search_query="user preferences" + ) + assert len(comp.memories) == 1 + assert comp.search_query == "user preferences" + + def test_to_messages_with_content(self): + comp = agent_model_module.MemoryComponent(formatted_content="Retrieved memories") + messages = comp.to_messages() + assert len(messages) == 1 + + def test_to_messages_empty(self): + comp = agent_model_module.MemoryComponent() + assert comp.to_messages() == [] + + def test_add_memory(self): + comp = agent_model_module.MemoryComponent() + comp.add_memory("User likes dark mode", "user", {"timestamp": "2024-01-01"}) + assert len(comp.memories) == 1 + assert comp.memories[0]["content"] == "User likes dark mode" + assert comp.memories[0]["memory_type"] == "user" + + def test_add_memory_without_metadata(self): + comp = agent_model_module.MemoryComponent() + comp.add_memory("test memory", "agent") + assert comp.memories[0]["metadata"] == {} + + +class TestKnowledgeBaseComponent: + """Tests for KnowledgeBaseComponent.""" + + def test_creation_empty(self): + comp = agent_model_module.KnowledgeBaseComponent() + assert comp.component_type == "knowledge_base" + assert comp.summary == "" + assert comp.kb_ids == [] + + def test_creation_with_summary(self): + comp = agent_model_module.KnowledgeBaseComponent( + summary="KB summary content", + kb_ids=["kb-1", "kb-2"], + priority=15 + ) + assert comp.summary == "KB summary content" + assert comp.kb_ids == ["kb-1", "kb-2"] + assert comp.priority == 15 + + def test_to_messages_with_summary(self): + comp = agent_model_module.KnowledgeBaseComponent(summary="Knowledge base summary") + messages = 
comp.to_messages() + assert len(messages) == 1 + + def test_to_messages_empty(self): + comp = agent_model_module.KnowledgeBaseComponent() + assert comp.to_messages() == [] + + +class TestManagedAgentsComponent: + """Tests for ManagedAgentsComponent.""" + + def test_creation_empty(self): + comp = agent_model_module.ManagedAgentsComponent() + assert comp.component_type == "managed_agents" + assert comp.agents == [] + assert comp.formatted_description == "" + + def test_creation_with_agents(self): + comp = agent_model_module.ManagedAgentsComponent( + agents=[{"name": "sub_agent", "description": "A sub agent"}], + formatted_description="Sub-agents available" + ) + assert len(comp.agents) == 1 + + def test_to_messages_with_content(self): + comp = agent_model_module.ManagedAgentsComponent(formatted_description="Managed agents list") + messages = comp.to_messages() + assert len(messages) == 1 + + def test_to_messages_empty(self): + comp = agent_model_module.ManagedAgentsComponent() + assert comp.to_messages() == [] + + def test_add_agent(self): + comp = agent_model_module.ManagedAgentsComponent() + comp.add_agent("research_agent", "Research assistant", ["web_search", "read_file"]) + assert len(comp.agents) == 1 + assert comp.agents[0]["name"] == "research_agent" + assert comp.agents[0]["tools"] == ["web_search", "read_file"] + + def test_add_agent_without_tools(self): + comp = agent_model_module.ManagedAgentsComponent() + comp.add_agent("agent_name", "agent desc") + assert comp.agents[0]["tools"] == [] + + +class TestExternalAgentsComponent: + """Tests for ExternalAgentsComponent.""" + + def test_creation_empty(self): + comp = agent_model_module.ExternalAgentsComponent() + assert comp.component_type == "external_a2a_agents" + assert comp.agents == [] + assert comp.formatted_description == "" + + def test_creation_with_agents(self): + comp = agent_model_module.ExternalAgentsComponent( + agents=[{"agent_id": "ext-1", "name": "External Agent"}], + 
formatted_description="External A2A agents" + ) + assert len(comp.agents) == 1 + + def test_to_messages_with_content(self): + comp = agent_model_module.ExternalAgentsComponent(formatted_description="External agents") + messages = comp.to_messages() + assert len(messages) == 1 + + def test_to_messages_empty(self): + comp = agent_model_module.ExternalAgentsComponent() + assert comp.to_messages() == [] + + def test_add_agent(self): + comp = agent_model_module.ExternalAgentsComponent() + comp.add_agent("ext-agent-123", "External Helper", "An external A2A agent", "https://external.com/a2a") + assert len(comp.agents) == 1 + assert comp.agents[0]["agent_id"] == "ext-agent-123" + assert comp.agents[0]["url"] == "https://external.com/a2a" + + +class TestFullStrategy: + """Tests for FullStrategy.""" + + def test_select_components_returns_all(self): + strategy = agent_model_module.FullStrategy() + components = [ + agent_model_module.SystemPromptComponent(content="test1", priority=10), + agent_model_module.ToolsComponent(formatted_description="test2", priority=20), + agent_model_module.MemoryComponent(formatted_content="test3", priority=5), + ] + selected = strategy.select_components(components, 1000, {}) + assert len(selected) == 3 + + def test_select_components_sorted_by_priority(self): + strategy = agent_model_module.FullStrategy() + components = [ + agent_model_module.SystemPromptComponent(content="low", priority=5), + agent_model_module.ToolsComponent(formatted_description="high", priority=30), + agent_model_module.MemoryComponent(formatted_content="mid", priority=15), + ] + selected = strategy.select_components(components, 1000, {}) + assert selected[0].priority == 30 + assert selected[1].priority == 15 + assert selected[2].priority == 5 + + def test_get_strategy_name(self): + strategy = agent_model_module.FullStrategy() + assert strategy.get_strategy_name() == "full" + + +class TestTokenBudgetStrategy: + """Tests for TokenBudgetStrategy.""" + + def 
test_select_within_budget(self): + strategy = agent_model_module.TokenBudgetStrategy() + components = [ + agent_model_module.SystemPromptComponent(content="short", priority=10, token_estimate=100), + agent_model_module.ToolsComponent(formatted_description="medium", priority=20, token_estimate=300), + agent_model_module.MemoryComponent(formatted_content="large", priority=5, token_estimate=500), + ] + selected = strategy.select_components(components, 400, {}) + assert len(selected) == 2 + total_tokens = sum(c.token_estimate for c in selected) + assert total_tokens <= 400 + + def test_select_respects_component_budget(self): + strategy = agent_model_module.TokenBudgetStrategy() + components = [ + agent_model_module.SystemPromptComponent(content="test", priority=10, token_estimate=200), + agent_model_module.ToolsComponent(formatted_description="test", priority=20, token_estimate=200), + ] + component_budgets = {"system_prompt": 100} + selected = strategy.select_components(components, 1000, component_budgets) + system_comps = [c for c in selected if c.component_type == "system_prompt"] + assert len(system_comps) == 0 + + def test_select_empty_components(self): + strategy = agent_model_module.TokenBudgetStrategy() + selected = strategy.select_components([], 1000, {}) + assert selected == [] + + def test_get_strategy_name(self): + strategy = agent_model_module.TokenBudgetStrategy() + assert strategy.get_strategy_name() == "token_budget" + + def test_uses_estimate_tokens_when_no_token_estimate(self): + strategy = agent_model_module.TokenBudgetStrategy() + comp = agent_model_module.SystemPromptComponent(content="test content here") + comp.token_estimate = 0 + tokens = comp.estimate_tokens() + assert tokens > 0 + + +class TestBufferedStrategy: + """Tests for BufferedStrategy.""" + + def test_default_buffer_size(self): + strategy = agent_model_module.BufferedStrategy() + assert strategy.buffer_size == 10 + + def test_custom_buffer_size(self): + strategy = 
agent_model_module.BufferedStrategy(buffer_size=5) + assert strategy.buffer_size == 5 + + def test_select_keeps_last_n_per_type(self): + strategy = agent_model_module.BufferedStrategy(buffer_size=2) + components = [ + agent_model_module.ToolsComponent(formatted_description="tool1", priority=10), + agent_model_module.ToolsComponent(formatted_description="tool2", priority=11), + agent_model_module.ToolsComponent(formatted_description="tool3", priority=12), + agent_model_module.SkillsComponent(formatted_description="skill1", priority=20), + ] + selected = strategy.select_components(components, 1000, {}) + tools_selected = [c for c in selected if c.component_type == "tools"] + assert len(tools_selected) == 2 + + def test_select_empty_components(self): + strategy = agent_model_module.BufferedStrategy() + selected = strategy.select_components([], 1000, {}) + assert selected == [] + + def test_get_strategy_name(self): + strategy = agent_model_module.BufferedStrategy() + assert strategy.get_strategy_name() == "buffered" + + +class TestPriorityWeightedStrategy: + """Tests for PriorityWeightedStrategy.""" + + def test_default_relevance_threshold(self): + strategy = agent_model_module.PriorityWeightedStrategy() + assert strategy.relevance_threshold == 0.5 + + def test_custom_relevance_threshold(self): + strategy = agent_model_module.PriorityWeightedStrategy(relevance_threshold=0.8) + assert strategy.relevance_threshold == 0.8 + + def test_select_with_relevance_scores(self): + strategy = agent_model_module.PriorityWeightedStrategy(relevance_threshold=0.5) + components = [ + agent_model_module.SystemPromptComponent( + content="high relevance", priority=10, + metadata={"relevance_score": 0.9} + ), + agent_model_module.ToolsComponent( + formatted_description="low relevance", priority=20, + metadata={"relevance_score": 0.3} + ), + ] + selected = strategy.select_components(components, 1000, {}) + high_rel = [c for c in selected if c.metadata.get("relevance_score", 1.0) >= 0.5] + 
assert len(high_rel) >= 1 + + def test_select_filters_below_threshold(self): + strategy = agent_model_module.PriorityWeightedStrategy(relevance_threshold=0.7) + components = [ + agent_model_module.SystemPromptComponent( + content="below", priority=10, + metadata={"relevance_score": 0.5} + ), + agent_model_module.ToolsComponent( + formatted_description="above", priority=20, + metadata={"relevance_score": 0.8} + ), + ] + selected = strategy.select_components(components, 1000, {}) + for c in selected: + assert c.metadata.get("relevance_score", 1.0) >= 0.7 + + def test_get_strategy_name(self): + strategy = agent_model_module.PriorityWeightedStrategy() + assert strategy.get_strategy_name() == "priority" + + +class TestExtendedContextManagerConfig: + """Tests for extended ContextManagerConfig.""" + + def test_default_strategy(self): + config = summary_config_module.ContextManagerConfig() + assert config.strategy == "token_budget" + + def test_all_injection_flags_default_true(self): + config = summary_config_module.ContextManagerConfig() + assert config.inject_system_prompt is True + assert config.inject_tools is True + assert config.inject_skills is True + assert config.inject_memory is True + assert config.inject_knowledge_base is True + assert config.inject_agent_definitions is True + assert config.inject_app_context is True + + def test_component_budgets_defaults(self): + config = summary_config_module.ContextManagerConfig() + assert "system_prompt" in config.component_budgets + assert "tools" in config.component_budgets + assert config.component_budgets["system_prompt"] == 4000 + + def test_custom_strategy(self): + config = summary_config_module.ContextManagerConfig(strategy="full") + assert config.strategy == "full" + + def test_disable_injection_flags(self): + config = summary_config_module.ContextManagerConfig( + inject_memory=False, + inject_knowledge_base=False + ) + assert config.inject_memory is False + assert config.inject_knowledge_base is False + + def 
test_custom_component_budgets(self): + config = summary_config_module.ContextManagerConfig( + component_budgets={"system_prompt": 2000, "tools": 1500} + ) + assert config.component_budgets["system_prompt"] == 2000 + + def test_buffer_size_per_component(self): + config = summary_config_module.ContextManagerConfig() + assert config.buffer_size_per_component == 10 + + def test_existing_fields_preserved(self): + config = summary_config_module.ContextManagerConfig( + enabled=True, + token_threshold=5000, + keep_recent_steps=3 + ) + assert config.enabled is True + assert config.token_threshold == 5000 + assert config.keep_recent_steps == 3 + + +class TestAgentConfigWithContextComponents: + """Tests for AgentConfig with context_components field.""" + + def test_agent_config_with_context_components(self): + components = [ + agent_model_module.SystemPromptComponent(content="test prompt"), + agent_model_module.ToolsComponent(formatted_description="test tools"), + ] + config = agent_model_module.AgentConfig( + name="test_agent", + description="Test agent", + tools=[], + model_name="test-model", + context_components=components + ) + assert len(config.context_components) == 2 + assert config.context_components[0].component_type == "system_prompt" + + def test_agent_config_default_context_components_none(self): + config = agent_model_module.AgentConfig( + name="test_agent", + description="Test agent", + tools=[], + model_name="test-model" + ) + assert config.context_components is None + + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file diff --git a/test/sdk/core/agents/test_nexent_agent.py b/test/sdk/core/agents/test_nexent_agent.py index ab09b95b6..ff8da11f8 100644 --- a/test/sdk/core/agents/test_nexent_agent.py +++ b/test/sdk/core/agents/test_nexent_agent.py @@ -470,6 +470,8 @@ def test_create_model_success(nexent_agent_with_models, mock_model_config): top_p=mock_model_config.top_p, ssl_verify=True, display_name=mock_model_config.cite_name, + 
extra_body=mock_model_config.extra_body, + max_tokens=mock_model_config.max_tokens, timeout_seconds=mock_model_config.timeout_seconds, ) @@ -500,6 +502,8 @@ def test_create_model_deep_thinking_success(nexent_agent_with_models, mock_deep_ top_p=mock_deep_thinking_model_config.top_p, ssl_verify=True, display_name=mock_deep_thinking_model_config.cite_name, + extra_body=mock_deep_thinking_model_config.extra_body, + max_tokens=mock_deep_thinking_model_config.max_tokens, timeout_seconds=mock_deep_thinking_model_config.timeout_seconds, ) diff --git a/test/sdk/core/agents/test_nexent_agent_component_integration.py b/test/sdk/core/agents/test_nexent_agent_component_integration.py new file mode 100644 index 000000000..49483d94b --- /dev/null +++ b/test/sdk/core/agents/test_nexent_agent_component_integration.py @@ -0,0 +1,199 @@ +"""Integration tests for context component registration in NexentAgent and CoreAgent.""" + +import pytest +from unittest.mock import MagicMock, patch + +from sdk.nexent.core.agents.agent_model import ( + SystemPromptComponent, + ToolsComponent, + AgentConfig, + ToolConfig, +) +from sdk.nexent.core.agents.summary_config import ContextManagerConfig + + +STRATEGY_TOKEN_BUDGET = "token_budget" + + +class TestNexentAgentComponentRegistration: + """Tests for NexentAgent.register_context_components functionality.""" + + @pytest.fixture + def mock_context_manager(self): + manager = MagicMock() + manager._components = [] + manager.get_registered_components = lambda: list(manager._components) + manager.register_component = lambda c: manager._components.append(c) + return manager + + @pytest.fixture + def agent_config_with_components(self): + ctx_config = ContextManagerConfig( + token_threshold=1000, + strategy=STRATEGY_TOKEN_BUDGET, + component_budgets={"tools": 200, "skills": 100}, + ) + + components = [ + ToolsComponent(content="Tool descriptions", token_estimate=50), + SystemPromptComponent(content="System prompt", token_estimate=100), + ] + + return 
AgentConfig( + name="test_agent", + description="Test agent", + model_name="test-model", + tools=[], + context_manager_config=ctx_config, + context_components=components, + ) + + def test_context_manager_mounted_when_config_present(self, agent_config_with_components): + agent = MagicMock() + agent.context_manager = None + + ctx_config = getattr(agent_config_with_components, 'context_manager_config', None) + if ctx_config: + from sdk.nexent.core.agents.agent_context import ContextManager + agent.context_manager = ContextManager( + config=ctx_config, + max_steps=10 + ) + + components = getattr(agent_config_with_components, 'context_components', None) + if components: + for component in components: + agent.context_manager.register_component(component) + + assert agent.context_manager is not None + assert len(agent.context_manager.get_registered_components()) == 2 + + def test_no_context_manager_when_config_absent(self): + agent_config = AgentConfig( + name="test_agent", + description="Test agent", + model_name="test-model", + tools=[], + ) + + ctx_config = getattr(agent_config, 'context_manager_config', None) + agent = MagicMock() + agent.context_manager = None + + assert ctx_config is None + assert agent.context_manager is None + + def test_components_registered_in_order(self, mock_context_manager, agent_config_with_components): + components = getattr(agent_config_with_components, 'context_components', []) + + for component in components: + mock_context_manager.register_component(component) + + registered = mock_context_manager.get_registered_components() + assert len(registered) == 2 + assert registered[0].component_type == "tools" + assert registered[1].component_type == "system_prompt" + + +class TestCoreAgentSystemPromptAssembly: + """Tests for CoreAgent._build_system_prompt_from_components functionality.""" + + @pytest.fixture + def mock_context_manager_with_components(self): + manager = MagicMock() + manager.get_registered_components = lambda: [ + 
SystemPromptComponent(content="Base prompt", token_estimate=50), + ToolsComponent(content="Tool info", token_estimate=30), + ] + manager.build_system_prompt = lambda: [ + {"role": "system", "content": "Base prompt\n\nTool info"}, + ] + return manager + + def test_system_prompt_uses_components_when_registered(self, mock_context_manager_with_components): + base_prompt = "Original system prompt" + + if mock_context_manager_with_components and mock_context_manager_with_components.get_registered_components(): + component_messages = mock_context_manager_with_components.build_system_prompt() + if component_messages: + final_prompt = "\n\n".join( + msg.get("content", "") for msg in component_messages if msg.get("role") == "system" + ) + + assert final_prompt == "Base prompt\n\nTool info" + + def test_system_prompt_fallback_when_no_components(self): + base_prompt = "Original system prompt" + context_manager = MagicMock() + context_manager.get_registered_components = lambda: [] + + if context_manager and context_manager.get_registered_components(): + component_messages = context_manager.build_system_prompt() + if component_messages: + final_prompt = "\n\n".join( + msg.get("content", "") for msg in component_messages if msg.get("role") == "system" + ) + else: + final_prompt = base_prompt + else: + final_prompt = base_prompt + + assert final_prompt == "Original system prompt" + + def test_system_prompt_fallback_when_no_context_manager(self): + base_prompt = "Original system prompt" + context_manager = None + + if context_manager and context_manager.get_registered_components(): + component_messages = context_manager.build_system_prompt() + if component_messages: + final_prompt = "\n\n".join( + msg.get("content", "") for msg in component_messages if msg.get("role") == "system" + ) + else: + final_prompt = base_prompt + else: + final_prompt = base_prompt + + assert final_prompt == "Original system prompt" + + def test_empty_component_messages_fallback(self): + base_prompt = 
"Original system prompt" + context_manager = MagicMock() + context_manager.get_registered_components = lambda: [MagicMock()] + context_manager.build_system_prompt = lambda: [] + + if context_manager and context_manager.get_registered_components(): + component_messages = context_manager.build_system_prompt() + if component_messages: + final_prompt = "\n\n".join( + msg.get("content", "") for msg in component_messages if msg.get("role") == "system" + ) + else: + final_prompt = base_prompt + else: + final_prompt = base_prompt + + assert final_prompt == "Original system prompt" + + +class TestBackwardCompatibility: + """Tests for backward compatibility with existing agent creation.""" + + def test_agent_config_without_components_still_works(self): + config = AgentConfig( + name="legacy_agent", + description="Legacy agent", + model_name="test-model", + tools=[], + context_manager_config=ContextManagerConfig(token_threshold=1000), + ) + + components = getattr(config, 'context_components', None) + assert components is None + + def test_context_manager_config_without_strategy_defaults(self): + config = ContextManagerConfig(token_threshold=2000) + + assert config.strategy == STRATEGY_TOKEN_BUDGET + assert "system_prompt" in config.component_budgets \ No newline at end of file