From 6baa994b14f103cb4d6033e756760de9586c3894 Mon Sep 17 00:00:00 2001 From: suluyan Date: Fri, 6 Feb 2026 15:03:04 +0800 Subject: [PATCH 1/5] fix: video gen exclude edit_file --- projects/singularity_cinema/agent.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/singularity_cinema/agent.yaml b/projects/singularity_cinema/agent.yaml index d171fc7fc..dc1756486 100644 --- a/projects/singularity_cinema/agent.yaml +++ b/projects/singularity_cinema/agent.yaml @@ -279,6 +279,7 @@ tools: mcp: false allow_read_all_files: true exclude: + - edit_file - list_files - search_file_content - search_file_name From 45698aa83d7731a7899630f0af24cda77a1957d1 Mon Sep 17 00:00:00 2001 From: suluyan Date: Thu, 5 Mar 2026 16:47:16 +0800 Subject: [PATCH 2/5] feat: support multimodal model --- MULTIMODAL_SUPPORT.md | 306 ++++++++++++++++++ config/cfg_model_multimodal.yaml | 57 ++++ examples/agent/test_llm_agent_multimodal.py | 333 ++++++++++++++++++++ ms_agent/agent/llm_agent.py | 19 +- ms_agent/llm/openai_llm.py | 30 +- 5 files changed, 734 insertions(+), 11 deletions(-) create mode 100644 MULTIMODAL_SUPPORT.md create mode 100644 config/cfg_model_multimodal.yaml create mode 100644 examples/agent/test_llm_agent_multimodal.py diff --git a/MULTIMODAL_SUPPORT.md b/MULTIMODAL_SUPPORT.md new file mode 100644 index 000000000..fcc52352c --- /dev/null +++ b/MULTIMODAL_SUPPORT.md @@ -0,0 +1,306 @@ +# ms-agent 多模态支持指南 + +本文档介绍如何使用 ms-agent 进行多模态对话,包括图片理解和分析功能。 + +## 概述 + +ms-agent 已经支持多模态模型,如阿里云的 `qwen3.5-plus` 模型。多模态模型能够: +- 分析图片内容 +- 识别图片中的对象、场景和文字 +- 结合图片内容进行对话 + +## 前置要求 + +### 1. 安装依赖 + +确保已安装必要的依赖包: + +```bash +pip install openai +``` + +### 2. 配置 API Key + +(以qwen3.5-plus为例)获取 DashScope API Key 并设置环境变量: + +```bash +export DASHSCOPE_API_KEY='your-dashscope-api-key' +``` + +或者在配置文件中直接设置 `dashscope_api_key`。 + +## 配置多模态模型 + +### 使用配置文件 + +可以使用预定义的多模态配置文件 `config/cfg_model_multimodal.yaml`: + +```yaml +llm: + service: dashscope + model: qwen3.5-plus + dashscope_api_key: your-api-key # 或使用环境变量 + modelscope_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 + +generation_config: + temperature: 0.7 + top_k: 50 + top_p: 0.8 + max_tokens: 2048 + stream: true + extra_body: + enable_thinking: false +``` + +### 在代码中配置 + +```python +from ms_agent.config import Config +from ms_agent.llm import LLM + +config = Config.from_task('path/to/config') +config.llm.model = 'qwen3.5-plus' +config.llm.service = 'dashscope' +config.llm.dashscope_api_key = 'your-api-key' +config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + +llm = LLM.from_config(config) +``` + +## 使用 LLMAgent 进行多模态对话 + +推荐使用 `LLMAgent` 来进行多模态对话,它提供了更完整的功能,包括记忆管理、工具调用和回调支持。 + +### 基本用法 + +```python +import asyncio +import os +from ms_agent import LLMAgent +from ms_agent.config import Config +from ms_agent.llm.utils import Message + +async def multimodal_chat(): + # 创建配置 + config = Config.from_task('ms_agent/agent/agent.yaml') + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + + # 创建 LLMAgent + agent = LLMAgent(config=config) + + # 构建多模态消息 + multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} + ] + + # 调用 agent + response = await agent.run(messages=[Message(role="user", content=multimodal_content)]) + print(response[-1].content) + 
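+    # Usage accounting, mirroring the non-stream example later in this guide;
+    # assumes the provider populates these token fields on the final message.
+    print(f"[tokens] in={response[-1].prompt_tokens}, out={response[-1].completion_tokens}")
+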
+asyncio.run(multimodal_chat()) +``` + +### 非 Stream 模式 + +```python +# 配置中禁用 stream +config.generation_config.stream = False + +agent = LLMAgent(config=config) + +multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +# 非 stream 模式:直接返回完整响应 +response = await agent.run(messages=[Message(role="user", content=multimodal_content)]) +print(f"[回复] {response[-1].content}") +print(f"[Token使用] 输入: {response[-1].prompt_tokens}, 输出: {response[-1].completion_tokens}") +``` + +### Stream 模式 + +```python +# 配置中启用 stream +config.generation_config.stream = True + +agent = LLMAgent(config=config) + +multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +# stream 模式:返回生成器 +generator = await agent.run( + messages=[Message(role="user", content=multimodal_content)], + stream=True +) + +full_response = "" +async for response_chunk in generator: + if response_chunk and len(response_chunk) > 0: + last_msg = response_chunk[-1] + if last_msg.content: + # 流式输出新增内容 + print(last_msg.content[len(full_response):], end='', flush=True) + full_response = last_msg.content + +print(f"\n[完整回复] {full_response}") +``` + +### 多轮对话 + +LLMAgent 支持多轮对话,可以在对话中混合使用图片和文本: + +```python +agent = LLMAgent(config=config, tag="multimodal_conversation") + +# 第一轮:发送图片 +multimodal_content = [ + {"type": "text", "text": "这张图片里有几个人?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +messages = [Message(role="user", content=multimodal_content)] +response = await agent.run(messages=messages) +print(f"[第一轮回复] {response[-1].content}") + +# 第二轮:继续追问(纯文本,保留上下文) +messages = response # 使用上一轮的回复作为上下文 +messages.append(Message(role="user", content="他们在做什么?")) +response = await agent.run(messages=messages) +print(f"[第二轮回复] {response[-1].content}") +``` + +## 多模态消息格式 + +ms-agent 使用 OpenAI 兼容的多模态消息格式。图片可以通过以下三种方式提供: + +### 1. 图片 URL + +```python +from ms_agent.llm.utils import Message + +multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +messages = [ + Message(role="user", content=multimodal_content) +] + +response = llm.generate(messages=messages) +``` + +### 2. Base64 编码 + +```python +import base64 + +# 读取并编码图片 +with open('image.jpg', 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + +multimodal_content = [ + {"type": "text", "text": "这是什么?"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_data}" + } + } +] + +messages = [Message(role="user", content=multimodal_content)] +response = llm.generate(messages=messages) +``` + +### 3. 
本地文件路径 + +```python +import base64 +import os + +image_path = 'path/to/image.png' + +# 获取 MIME 类型 +ext = os.path.splitext(image_path)[1].lower() +mime_type = { + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.webp': 'image/webp' +}.get(ext, 'image/png') + +# 读取并编码 +with open(image_path, 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + +multimodal_content = [ + {"type": "text", "text": "描述这张图片。"}, + { + "type": "image_url", + "image_url": { + "url": f"data:{mime_type};base64,{image_data}" + } + } +] + +messages = [Message(role="user", content=multimodal_content)] +response = llm.generate(messages=messages) +``` + +## 运行示例 + +### 运行 Agent 示例 + +```bash +# 运行完整测试套件(包括 stream 和非 stream 模式) +python examples/agent/test_llm_agent_multimodal.py +``` + +## 常见问题 + +### Q: 图片大小有限制吗? + +A: 是的,不同模型有不同的限制: +- qwen3.5-plus: 推荐图片大小不超过 4MB +- 分辨率建议不超过 2048x2048 + +### Q: 支持哪些图片格式? + +A: 通常支持: +- JPEG / JPG +- PNG +- GIF +- WebP + +### Q: 可以一次发送多张图片吗? + +A: 是的,可以在消息中添加多个 `image_url` 块: + +```python +multimodal_content = [ + {"type": "text", "text": "比较这两张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/img1.jpg"}}, + {"type": "image_url", "image_url": {"url": "https://example.com/img2.jpg"}} +] +``` + +### Q: 流式输出支持吗? + +A: 是的,多模态对话支持流式输出。设置 `stream: true` 即可: + +```python +config.generation_config.stream = True +response = llm.generate(messages=messages, stream=True) +``` + diff --git a/config/cfg_model_multimodal.yaml b/config/cfg_model_multimodal.yaml new file mode 100644 index 000000000..f660e9e3c --- /dev/null +++ b/config/cfg_model_multimodal.yaml @@ -0,0 +1,57 @@ +# 多模态模型配置示例 +# 用于配置 qwen3.5-plus 等多模态模型 + +llm: + # 使用 dashscope 服务 + service: dashscope + + # 多模态模型名称 + model: qwen3.5-plus + + # DashScope API Key + # 也可以通过环境变量 DASHSCOPE_API_KEY 设置 + dashscope_api_key: + + # DashScope 兼容模式 endpoint + modelscope_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 + +# 生成配置 +generation_config: + temperature: 0.7 + top_k: 50 + top_p: 0.8 + max_tokens: 2048 + stream: true + + # 禁用思考模式(可选) + extra_body: + enable_thinking: false + +# 系统提示词 +prompt: + system: | + 你是一个多模态助手,能够理解和分析图片内容。 + + 当用户发送图片时,请仔细观察图片内容,并提供: + 1. 图片中的主要元素和对象 + 2. 场景描述(如适用) + 3. 任何文字信息(如图片中包含文字) + 4. 颜色、布局等视觉特征 + + 请用用户相同的语言回答。 + +# 最大对话轮次 +max_chat_round: 10 + +# 回调函数 +callbacks: + - input_callback + +# 工具配置(多模态对话通常不需要工具) +tools: + +help: | + 多模态模型配置示例 + 使用方法: + 1. 设置 DASHSCOPE_API_KEY 环境变量 + 2. 
使用此配置运行多模态对话 diff --git a/examples/agent/test_llm_agent_multimodal.py b/examples/agent/test_llm_agent_multimodal.py new file mode 100644 index 000000000..e91b3cfc8 --- /dev/null +++ b/examples/agent/test_llm_agent_multimodal.py @@ -0,0 +1,333 @@ +""" +LLMAgent 多模态对话测试 + +从 LLMAgent 层面测试多模态功能,覆盖 stream 和非 stream 两种模式。 +""" +import asyncio +import os +import sys +import uuid + +from ms_agent import LLMAgent +from ms_agent.config import Config +from ms_agent.llm.utils import Message + +# 获取脚本所在目录 +path = os.path.dirname(os.path.abspath(__file__)) +agent_config = os.path.join(path, '..', '..', 'ms_agent', 'agent', 'agent.yaml') + + +async def test_llm_agent_multimodal_non_stream(): + """ + 测试 LLMAgent 非 stream 模式的多模态对话 + """ + print("=" * 70) + print("测试 1: LLMAgent 非 stream 模式 - 多模态对话 (URL 图片)") + print("=" * 70) + + # 创建配置 + config = Config.from_task(agent_config) + + # 配置多模态模型 + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + + # 禁用 stream、load_cache 和 callbacks(避免交互式输入问题) + config.generation_config.stream = False + config.load_cache = False + config.callbacks = [] + + if not config.llm.dashscope_api_key: + print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + print("请先设置: export DASHSCOPE_API_KEY='your-api-key'") + return False + + # 创建 LLMAgent,使用唯一 tag 避免历史记录的干扰 + tag = f"multimodal_test_{uuid.uuid4().hex[:8]}" + agent = LLMAgent(config=config, tag=tag) + + # 测试图片 URL + test_image_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" + + # 构建多模态内容 + multimodal_content = [ + {"type": "text", "text": "请详细描述这张图片中的内容。"}, + {"type": "image_url", "image_url": {"url": test_image_url}} + ] + + try: + print(f"\n[发送] 请描述这张图片: {test_image_url}") + print("-" * 70) + + messages = [ + Message(role="system", content="你是一个多模态助手。"), + Message(role="user", content=multimodal_content) + ] + + response = await agent.run(messages=messages) + + print(f"\n[回复] {response[-1].content}") + print("-" * 70) + print(f"\n[Token使用] 输入: {response[-1].prompt_tokens}, 输出: {response[-1].completion_tokens}") + + return True + except Exception as e: + print(f"\n[错误] 非 stream 多模态对话失败: {e}") + import traceback + traceback.print_exc() + return False + + +async def test_llm_agent_multimodal_stream(): + """ + 测试 LLMAgent stream 模式的多模态对话 + """ + print("\n" + "=" * 70) + print("测试 2: LLMAgent stream 模式 - 多模态对话 (URL 图片)") + print("=" * 70) + + # 创建配置 + config = Config.from_task(agent_config) + + # 配置多模态模型 + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + + # 启用 stream,禁用 load_cache 和 callbacks + config.generation_config.stream = True + config.load_cache = False + config.callbacks = [] + + if not config.llm.dashscope_api_key: + print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + return False + + # 创建 LLMAgent,使用唯一 tag + tag = f"multimodal_stream_{uuid.uuid4().hex[:8]}" + agent = LLMAgent(config=config, tag=tag) + + # 测试图片 URL + test_image_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" + + # 构建多模态内容 + multimodal_content = [ + {"type": "text", "text": "请用中文描述这张图片中的内容。"}, + {"type": "image_url", "image_url": {"url": test_image_url}} + ] + + try: + print(f"\n[发送] 请描述这张图片: {test_image_url}") + print("-" * 70) + print("[回复开始]") + + messages = 
[ + Message(role="system", content="你是一个多模态助手。"), + Message(role="user", content=multimodal_content) + ] + + # stream 模式调用 + generator = await agent.run(messages=messages, stream=True) + + full_response = "" + async for response_chunk in generator: + if response_chunk and len(response_chunk) > 0: + last_msg = response_chunk[-1] + if last_msg.content and len(last_msg.content) > len(full_response): + # 流式输出新增内容 + sys.stdout.write(last_msg.content[len(full_response):]) + sys.stdout.flush() + full_response = last_msg.content + + print("\n" + "-" * 70) + print(f"\n[完整回复长度] {len(full_response)} 字符") + return True + except Exception as e: + print(f"\n[错误] stream 多模态对话失败: {e}") + import traceback + traceback.print_exc() + return False + + +async def test_llm_agent_multimodal_base64_non_stream(): + """ + 测试 LLMAgent 非 stream 模式 - Base64 编码图片 + """ + print("\n" + "=" * 70) + print("测试 3: LLMAgent 非 stream 模式 - Base64 编码图片") + print("=" * 70) + + import base64 + + # 创建配置 + config = Config.from_task(agent_config) + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + config.generation_config.stream = False + config.load_cache = False + config.callbacks = [] + + if not config.llm.dashscope_api_key: + print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + return False + + # 创建 LLMAgent,使用唯一 tag + tag = f"multimodal_base64_{uuid.uuid4().hex[:8]}" + agent = LLMAgent(config=config, tag=tag) + + # 一个简单的测试图片 base64 (1x1 像素) + test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FABJADq0/8ZEPAAAAAElFTkSuQmCC" + + multimodal_content = [ + {"type": "text", "text": "这是一个什么颜色的图片?请用中文简短回答。"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{test_image_base64}" + } + } + ] + + try: + print("\n[发送] 这是什么颜色的图片?(Base64 编码)") + print("-" * 70) + + messages = [ + Message(role="system", content="你是一个多模态助手。"), + Message(role="user", content=multimodal_content) + ] + + response = await agent.run(messages=messages) + + print(f"\n[回复] {response[-1].content}") + print("-" * 70) + return True + except Exception as e: + print(f"\n[错误] Base64 多模态对话失败: {e}") + import traceback + traceback.print_exc() + return False + + +async def test_llm_agent_multimodal_conversation(): + """ + 测试 LLMAgent 多轮对话中的多模态功能 + """ + print("\n" + "=" * 70) + print("测试 4: LLMAgent 多轮对话 - 多模态 + 文本混合") + print("=" * 70) + + # 创建配置 + config = Config.from_task(agent_config) + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + config.generation_config.stream = False + config.load_cache = False + config.callbacks = [] + + if not config.llm.dashscope_api_key: + print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + return False + + # 创建 LLMAgent,使用唯一 tag + tag = f"multimodal_conv_{uuid.uuid4().hex[:8]}" + agent = LLMAgent(config=config, tag=tag) + + # 测试图片 URL + test_image_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" + + try: + # 第一轮:发送图片 + print("\n[第一轮] 发送图片并询问") + print("-" * 70) + + multimodal_content = [ + {"type": "text", "text": "这张图片里有几个人?"}, + {"type": "image_url", "image_url": {"url": test_image_url}} + ] + + messages = [ + Message(role="system", content="你是一个多模态助手。"), + Message(role="user", content=multimodal_content) + 
] + response = await agent.run(messages=messages) + print(f"\n[第一轮回复] {response[-1].content[:200]}...") + + # 第二轮:继续追问(纯文本) + print("\n[第二轮] 继续追问") + print("-" * 70) + + # 保留历史记录,添加新的用户消息 + messages = response + messages.append(Message(role="user", content="图片中的场景是在室内还是室外?")) + response = await agent.run(messages=messages) + print(f"\n[第二轮回复] {response[-1].content[:200]}...") + + # 第三轮:再次追问(纯文本) + print("\n[第三轮] 再次追问") + print("-" * 70) + + messages = response + messages.append(Message(role="user", content="用一句话总结这张图片。")) + response = await agent.run(messages=messages) + print(f"\n[第三轮回复] {response[-1].content[:200]}...") + + print("-" * 70) + return True + except Exception as e: + print(f"\n[错误] 多轮对话失败: {e}") + import traceback + traceback.print_exc() + return False + + +async def main(): + """运行所有测试""" + print("\n" + "=" * 70) + print("LLMAgent 多模态对话测试套件") + print("=" * 70) + print("\n请确保已设置环境变量: export DASHSCOPE_API_KEY='your-api-key'\n") + + results = [] + + # 测试 1: 非 stream 模式 + result1 = await test_llm_agent_multimodal_non_stream() + results.append(("非 stream 模式 (URL图片)", result1)) + + # 测试 2: stream 模式 + result2 = await test_llm_agent_multimodal_stream() + results.append(("stream 模式 (URL图片)", result2)) + + # 测试 3: Base64 非 stream + result3 = await test_llm_agent_multimodal_base64_non_stream() + results.append(("非 stream 模式 (Base64)", result3)) + + # 测试 4: 多轮对话 + result4 = await test_llm_agent_multimodal_conversation() + results.append(("多轮对话", result4)) + + # 总结 + print("\n" + "=" * 70) + print("测试总结") + print("=" * 70) + for name, result in results: + status = "✓ 通过" if result else "✗ 失败" + print(f" {status} - {name}") + + passed = sum(1 for _, r in results if r) + total = len(results) + print(f"\n总计: {passed}/{total} 测试通过") + + return passed == total + + +if __name__ == '__main__': + success = asyncio.run(main()) + sys.exit(0 if success else 1) diff --git a/ms_agent/agent/llm_agent.py b/ms_agent/agent/llm_agent.py index 8602d1621..68aa5ce6c 100644 --- a/ms_agent/agent/llm_agent.py +++ b/ms_agent/agent/llm_agent.py @@ -725,8 +725,25 @@ def log_output(self, content: str): Log formatted output with a tag prefix. Args: - content (str): Content to log. + content (str): Content to log. Can be a string or a list (for multimodal content). 
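+                When content is a list, it follows the OpenAI-compatible
+                content-part shape, e.g.
+                [{'type': 'text', 'text': '...'},
+                 {'type': 'image_url', 'image_url': {'url': '...'}}].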
""" + # Handle multimodal content (list type) + if isinstance(content, list): + # Extract text from multimodal content + text_parts = [] + for item in content: + if isinstance(item, dict): + if item.get('type') == 'text': + text_parts.append(item.get('text', '')) + elif item.get('type') == 'image_url': + img_url = item.get('image_url', {}).get('url', '') + text_parts.append(f'[Image: {img_url[:50]}...]') + content = ' '.join(text_parts) + + # Ensure content is a string + if not isinstance(content, str): + content = str(content) + if len(content) > 1024: content = content[:512] + '\n...\n' + content[-512:] for line in content.split('\n'): diff --git a/ms_agent/llm/openai_llm.py b/ms_agent/llm/openai_llm.py index d9b9179ba..beee76388 100644 --- a/ms_agent/llm/openai_llm.py +++ b/ms_agent/llm/openai_llm.py @@ -572,6 +572,7 @@ def _format_input_message(self, openai_messages = [] for idx, message in enumerate(messages): if isinstance(message, Message): + # Only strip string content, keep list content as-is for multimodal if isinstance(message.content, str): message.content = message.content.strip() message = message.to_dict_clean() @@ -579,25 +580,34 @@ def _format_input_message(self, message = dict(message) content = message.get('content', '') + # Only strip string content, multimodal content (list) should be kept as-is if isinstance(content, str): content = content.strip() # Apply prefix cache structured content transformation + # Only for string content, multimodal content is already structured if cache_indice is not None and idx == cache_indice: content = self._to_structured_content( content, add_cache_control=True, provider=self._prefix_cache_provider) - message = { - key: value.strip() if isinstance(value, str) else value - for key, value in message.items() - if key in self.input_msg and value - } - if 'content' not in message: - message['content'] = '' - message['content'] = content if content else '' - - openai_messages.append(message) + # Build the message dict, handling both string and multimodal content + formatted_message = {} + for key, value in message.items(): + if key in self.input_msg: + # Only strip string values, keep other types as-is + if isinstance(value, str): + formatted_message[key] = value.strip() if value else '' + else: + formatted_message[key] = value + + # Ensure content field is set correctly + if 'content' not in formatted_message: + formatted_message['content'] = '' + elif not formatted_message['content']: + formatted_message['content'] = content + + openai_messages.append(formatted_message) return openai_messages From 6707f2c9b9fbd9745991010f8239aa18e6e8f81a Mon Sep 17 00:00:00 2001 From: suluyan Date: Thu, 5 Mar 2026 19:39:43 +0800 Subject: [PATCH 3/5] fix comment --- MULTIMODAL_SUPPORT.md | 2 +- examples/agent/test_llm_agent_multimodal.py | 102 ++++++++------------ ms_agent/agent/llm_agent.py | 4 +- ms_agent/llm/openai_llm.py | 8 +- 4 files changed, 45 insertions(+), 71 deletions(-) diff --git a/MULTIMODAL_SUPPORT.md b/MULTIMODAL_SUPPORT.md index fcc52352c..62caddf47 100644 --- a/MULTIMODAL_SUPPORT.md +++ b/MULTIMODAL_SUPPORT.md @@ -222,7 +222,7 @@ messages = [Message(role="user", content=multimodal_content)] response = llm.generate(messages=messages) ``` -### 3. 本地文件路径 +### 3. 
使用本地文件(Base64 编码) ```python import base64 diff --git a/examples/agent/test_llm_agent_multimodal.py b/examples/agent/test_llm_agent_multimodal.py index e91b3cfc8..1cf8bb915 100644 --- a/examples/agent/test_llm_agent_multimodal.py +++ b/examples/agent/test_llm_agent_multimodal.py @@ -16,16 +16,20 @@ path = os.path.dirname(os.path.abspath(__file__)) agent_config = os.path.join(path, '..', '..', 'ms_agent', 'agent', 'agent.yaml') +# 测试图片 URL +TEST_IMAGE_URL = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" -async def test_llm_agent_multimodal_non_stream(): - """ - 测试 LLMAgent 非 stream 模式的多模态对话 + +def _create_multimodal_config(stream: bool = False): """ - print("=" * 70) - print("测试 1: LLMAgent 非 stream 模式 - 多模态对话 (URL 图片)") - print("=" * 70) + 创建多模态配置 - # 创建配置 + Args: + stream: 是否启用流式输出 + + Returns: + Config: 配置好的 Config 对象,如果 API Key 未设置则返回 None + """ config = Config.from_task(agent_config) # 配置多模态模型 @@ -34,31 +38,43 @@ async def test_llm_agent_multimodal_non_stream(): config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' - # 禁用 stream、load_cache 和 callbacks(避免交互式输入问题) - config.generation_config.stream = False + # 禁用 load_cache 和 callbacks(避免交互式输入问题) + config.generation_config.stream = stream config.load_cache = False config.callbacks = [] if not config.llm.dashscope_api_key: print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") print("请先设置: export DASHSCOPE_API_KEY='your-api-key'") + return None + + return config + + +async def test_llm_agent_multimodal_non_stream(): + """ + 测试 LLMAgent 非 stream 模式的多模态对话 + """ + print("=" * 70) + print("测试 1: LLMAgent 非 stream 模式 - 多模态对话 (URL 图片)") + print("=" * 70) + + config = _create_multimodal_config(stream=False) + if not config: return False # 创建 LLMAgent,使用唯一 tag 避免历史记录的干扰 tag = f"multimodal_test_{uuid.uuid4().hex[:8]}" agent = LLMAgent(config=config, tag=tag) - # 测试图片 URL - test_image_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" - # 构建多模态内容 multimodal_content = [ {"type": "text", "text": "请详细描述这张图片中的内容。"}, - {"type": "image_url", "image_url": {"url": test_image_url}} + {"type": "image_url", "image_url": {"url": TEST_IMAGE_URL}} ] try: - print(f"\n[发送] 请描述这张图片: {test_image_url}") + print(f"\n[发送] 请描述这张图片: {TEST_IMAGE_URL}") print("-" * 70) messages = [ @@ -88,39 +104,22 @@ async def test_llm_agent_multimodal_stream(): print("测试 2: LLMAgent stream 模式 - 多模态对话 (URL 图片)") print("=" * 70) - # 创建配置 - config = Config.from_task(agent_config) - - # 配置多模态模型 - config.llm.model = 'qwen3.5-plus' - config.llm.service = 'dashscope' - config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') - config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' - - # 启用 stream,禁用 load_cache 和 callbacks - config.generation_config.stream = True - config.load_cache = False - config.callbacks = [] - - if not config.llm.dashscope_api_key: - print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + config = _create_multimodal_config(stream=True) + if not config: return False # 创建 LLMAgent,使用唯一 tag tag = f"multimodal_stream_{uuid.uuid4().hex[:8]}" agent = LLMAgent(config=config, tag=tag) - # 测试图片 URL - test_image_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" - # 构建多模态内容 multimodal_content = [ {"type": "text", "text": "请用中文描述这张图片中的内容。"}, - {"type": "image_url", "image_url": {"url": test_image_url}} + {"type": "image_url", "image_url": {"url": TEST_IMAGE_URL}} ] try: - print(f"\n[发送] 
请描述这张图片: {test_image_url}") + print(f"\n[发送] 请描述这张图片: {TEST_IMAGE_URL}") print("-" * 70) print("[回复开始]") @@ -162,18 +161,8 @@ async def test_llm_agent_multimodal_base64_non_stream(): import base64 - # 创建配置 - config = Config.from_task(agent_config) - config.llm.model = 'qwen3.5-plus' - config.llm.service = 'dashscope' - config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') - config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' - config.generation_config.stream = False - config.load_cache = False - config.callbacks = [] - - if not config.llm.dashscope_api_key: - print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + config = _create_multimodal_config(stream=False) + if not config: return False # 创建 LLMAgent,使用唯一 tag @@ -222,27 +211,14 @@ async def test_llm_agent_multimodal_conversation(): print("测试 4: LLMAgent 多轮对话 - 多模态 + 文本混合") print("=" * 70) - # 创建配置 - config = Config.from_task(agent_config) - config.llm.model = 'qwen3.5-plus' - config.llm.service = 'dashscope' - config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') - config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' - config.generation_config.stream = False - config.load_cache = False - config.callbacks = [] - - if not config.llm.dashscope_api_key: - print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + config = _create_multimodal_config(stream=False) + if not config: return False # 创建 LLMAgent,使用唯一 tag tag = f"multimodal_conv_{uuid.uuid4().hex[:8]}" agent = LLMAgent(config=config, tag=tag) - # 测试图片 URL - test_image_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" - try: # 第一轮:发送图片 print("\n[第一轮] 发送图片并询问") @@ -250,7 +226,7 @@ async def test_llm_agent_multimodal_conversation(): multimodal_content = [ {"type": "text", "text": "这张图片里有几个人?"}, - {"type": "image_url", "image_url": {"url": test_image_url}} + {"type": "image_url", "image_url": {"url": TEST_IMAGE_URL}} ] messages = [ diff --git a/ms_agent/agent/llm_agent.py b/ms_agent/agent/llm_agent.py index 68aa5ce6c..740eab690 100644 --- a/ms_agent/agent/llm_agent.py +++ b/ms_agent/agent/llm_agent.py @@ -720,12 +720,12 @@ async def condense_memory(self, messages: List[Message]) -> List[Message]: messages = await memory_tool.run(messages) return messages - def log_output(self, content: str): + def log_output(self, content: Union[str, list]): """ Log formatted output with a tag prefix. Args: - content (str): Content to log. Can be a string or a list (for multimodal content). + content (Union[str, list]): Content to log. Can be a string or a list (for multimodal content). 
""" # Handle multimodal content (list type) if isinstance(content, list): diff --git a/ms_agent/llm/openai_llm.py b/ms_agent/llm/openai_llm.py index beee76388..dadc1bf1c 100644 --- a/ms_agent/llm/openai_llm.py +++ b/ms_agent/llm/openai_llm.py @@ -602,11 +602,9 @@ def _format_input_message(self, else: formatted_message[key] = value - # Ensure content field is set correctly - if 'content' not in formatted_message: - formatted_message['content'] = '' - elif not formatted_message['content']: - formatted_message['content'] = content + # Always use the transformed content to support features like prefix caching + # The content variable has been processed by _to_structured_content() if needed + formatted_message['content'] = content openai_messages.append(formatted_message) From 6c1bab2b70ec73e2634214fd85dccfc4db0d1b0e Mon Sep 17 00:00:00 2001 From: suluyan Date: Thu, 5 Mar 2026 19:40:31 +0800 Subject: [PATCH 4/5] fix lint --- MULTIMODAL_SUPPORT.md | 1 - examples/agent/test_llm_agent_multimodal.py | 162 ++++++++++---------- 2 files changed, 81 insertions(+), 82 deletions(-) diff --git a/MULTIMODAL_SUPPORT.md b/MULTIMODAL_SUPPORT.md index 62caddf47..65ea71731 100644 --- a/MULTIMODAL_SUPPORT.md +++ b/MULTIMODAL_SUPPORT.md @@ -303,4 +303,3 @@ A: 是的,多模态对话支持流式输出。设置 `stream: true` 即可: config.generation_config.stream = True response = llm.generate(messages=messages, stream=True) ``` - diff --git a/examples/agent/test_llm_agent_multimodal.py b/examples/agent/test_llm_agent_multimodal.py index 1cf8bb915..3ae2eaa5e 100644 --- a/examples/agent/test_llm_agent_multimodal.py +++ b/examples/agent/test_llm_agent_multimodal.py @@ -17,7 +17,7 @@ agent_config = os.path.join(path, '..', '..', 'ms_agent', 'agent', 'agent.yaml') # 测试图片 URL -TEST_IMAGE_URL = "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg" +TEST_IMAGE_URL = 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg' def _create_multimodal_config(stream: bool = False): @@ -44,7 +44,7 @@ def _create_multimodal_config(stream: bool = False): config.callbacks = [] if not config.llm.dashscope_api_key: - print("[错误] 未设置 DASHSCOPE_API_KEY 环境变量") + print('[错误] 未设置 DASHSCOPE_API_KEY 环境变量') print("请先设置: export DASHSCOPE_API_KEY='your-api-key'") return None @@ -55,42 +55,42 @@ async def test_llm_agent_multimodal_non_stream(): """ 测试 LLMAgent 非 stream 模式的多模态对话 """ - print("=" * 70) - print("测试 1: LLMAgent 非 stream 模式 - 多模态对话 (URL 图片)") - print("=" * 70) + print('=' * 70) + print('测试 1: LLMAgent 非 stream 模式 - 多模态对话 (URL 图片)') + print('=' * 70) config = _create_multimodal_config(stream=False) if not config: return False # 创建 LLMAgent,使用唯一 tag 避免历史记录的干扰 - tag = f"multimodal_test_{uuid.uuid4().hex[:8]}" + tag = f'multimodal_test_{uuid.uuid4().hex[:8]}' agent = LLMAgent(config=config, tag=tag) # 构建多模态内容 multimodal_content = [ - {"type": "text", "text": "请详细描述这张图片中的内容。"}, - {"type": "image_url", "image_url": {"url": TEST_IMAGE_URL}} + {'type': 'text', 'text': '请详细描述这张图片中的内容。'}, + {'type': 'image_url', 'image_url': {'url': TEST_IMAGE_URL}} ] try: - print(f"\n[发送] 请描述这张图片: {TEST_IMAGE_URL}") - print("-" * 70) + print(f'\n[发送] 请描述这张图片: {TEST_IMAGE_URL}') + print('-' * 70) messages = [ - Message(role="system", content="你是一个多模态助手。"), - Message(role="user", content=multimodal_content) + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) ] response = await agent.run(messages=messages) - print(f"\n[回复] {response[-1].content}") - print("-" * 70) - print(f"\n[Token使用] 输入: 
{response[-1].prompt_tokens}, 输出: {response[-1].completion_tokens}") + print(f'\n[回复] {response[-1].content}') + print('-' * 70) + print(f'\n[Token使用] 输入: {response[-1].prompt_tokens}, 输出: {response[-1].completion_tokens}') return True except Exception as e: - print(f"\n[错误] 非 stream 多模态对话失败: {e}") + print(f'\n[错误] 非 stream 多模态对话失败: {e}') import traceback traceback.print_exc() return False @@ -100,38 +100,38 @@ async def test_llm_agent_multimodal_stream(): """ 测试 LLMAgent stream 模式的多模态对话 """ - print("\n" + "=" * 70) - print("测试 2: LLMAgent stream 模式 - 多模态对话 (URL 图片)") - print("=" * 70) + print('\n' + '=' * 70) + print('测试 2: LLMAgent stream 模式 - 多模态对话 (URL 图片)') + print('=' * 70) config = _create_multimodal_config(stream=True) if not config: return False # 创建 LLMAgent,使用唯一 tag - tag = f"multimodal_stream_{uuid.uuid4().hex[:8]}" + tag = f'multimodal_stream_{uuid.uuid4().hex[:8]}' agent = LLMAgent(config=config, tag=tag) # 构建多模态内容 multimodal_content = [ - {"type": "text", "text": "请用中文描述这张图片中的内容。"}, - {"type": "image_url", "image_url": {"url": TEST_IMAGE_URL}} + {'type': 'text', 'text': '请用中文描述这张图片中的内容。'}, + {'type': 'image_url', 'image_url': {'url': TEST_IMAGE_URL}} ] try: - print(f"\n[发送] 请描述这张图片: {TEST_IMAGE_URL}") - print("-" * 70) - print("[回复开始]") + print(f'\n[发送] 请描述这张图片: {TEST_IMAGE_URL}') + print('-' * 70) + print('[回复开始]') messages = [ - Message(role="system", content="你是一个多模态助手。"), - Message(role="user", content=multimodal_content) + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) ] # stream 模式调用 generator = await agent.run(messages=messages, stream=True) - full_response = "" + full_response = '' async for response_chunk in generator: if response_chunk and len(response_chunk) > 0: last_msg = response_chunk[-1] @@ -141,11 +141,11 @@ async def test_llm_agent_multimodal_stream(): sys.stdout.flush() full_response = last_msg.content - print("\n" + "-" * 70) - print(f"\n[完整回复长度] {len(full_response)} 字符") + print('\n' + '-' * 70) + print(f'\n[完整回复长度] {len(full_response)} 字符') return True except Exception as e: - print(f"\n[错误] stream 多模态对话失败: {e}") + print(f'\n[错误] stream 多模态对话失败: {e}') import traceback traceback.print_exc() return False @@ -155,9 +155,9 @@ async def test_llm_agent_multimodal_base64_non_stream(): """ 测试 LLMAgent 非 stream 模式 - Base64 编码图片 """ - print("\n" + "=" * 70) - print("测试 3: LLMAgent 非 stream 模式 - Base64 编码图片") - print("=" * 70) + print('\n' + '=' * 70) + print('测试 3: LLMAgent 非 stream 模式 - Base64 编码图片') + print('=' * 70) import base64 @@ -166,38 +166,38 @@ async def test_llm_agent_multimodal_base64_non_stream(): return False # 创建 LLMAgent,使用唯一 tag - tag = f"multimodal_base64_{uuid.uuid4().hex[:8]}" + tag = f'multimodal_base64_{uuid.uuid4().hex[:8]}' agent = LLMAgent(config=config, tag=tag) # 一个简单的测试图片 base64 (1x1 像素) - test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FABJADq0/8ZEPAAAAAElFTkSuQmCC" + test_image_base64 = 'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FABJADq0/8ZEPAAAAAElFTkSuQmCC' multimodal_content = [ - {"type": "text", "text": "这是一个什么颜色的图片?请用中文简短回答。"}, + {'type': 'text', 'text': '这是一个什么颜色的图片?请用中文简短回答。'}, { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{test_image_base64}" + 'type': 'image_url', + 'image_url': { + 'url': f'data:image/png;base64,{test_image_base64}' } } ] try: - print("\n[发送] 这是什么颜色的图片?(Base64 编码)") - print("-" * 70) + print('\n[发送] 这是什么颜色的图片?(Base64 编码)') + print('-' * 70) messages 
= [ - Message(role="system", content="你是一个多模态助手。"), - Message(role="user", content=multimodal_content) + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) ] response = await agent.run(messages=messages) - print(f"\n[回复] {response[-1].content}") - print("-" * 70) + print(f'\n[回复] {response[-1].content}') + print('-' * 70) return True except Exception as e: - print(f"\n[错误] Base64 多模态对话失败: {e}") + print(f'\n[错误] Base64 多模态对话失败: {e}') import traceback traceback.print_exc() return False @@ -207,58 +207,58 @@ async def test_llm_agent_multimodal_conversation(): """ 测试 LLMAgent 多轮对话中的多模态功能 """ - print("\n" + "=" * 70) - print("测试 4: LLMAgent 多轮对话 - 多模态 + 文本混合") - print("=" * 70) + print('\n' + '=' * 70) + print('测试 4: LLMAgent 多轮对话 - 多模态 + 文本混合') + print('=' * 70) config = _create_multimodal_config(stream=False) if not config: return False # 创建 LLMAgent,使用唯一 tag - tag = f"multimodal_conv_{uuid.uuid4().hex[:8]}" + tag = f'multimodal_conv_{uuid.uuid4().hex[:8]}' agent = LLMAgent(config=config, tag=tag) try: # 第一轮:发送图片 - print("\n[第一轮] 发送图片并询问") - print("-" * 70) + print('\n[第一轮] 发送图片并询问') + print('-' * 70) multimodal_content = [ - {"type": "text", "text": "这张图片里有几个人?"}, - {"type": "image_url", "image_url": {"url": TEST_IMAGE_URL}} + {'type': 'text', 'text': '这张图片里有几个人?'}, + {'type': 'image_url', 'image_url': {'url': TEST_IMAGE_URL}} ] messages = [ - Message(role="system", content="你是一个多模态助手。"), - Message(role="user", content=multimodal_content) + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) ] response = await agent.run(messages=messages) - print(f"\n[第一轮回复] {response[-1].content[:200]}...") + print(f'\n[第一轮回复] {response[-1].content[:200]}...') # 第二轮:继续追问(纯文本) - print("\n[第二轮] 继续追问") - print("-" * 70) + print('\n[第二轮] 继续追问') + print('-' * 70) # 保留历史记录,添加新的用户消息 messages = response - messages.append(Message(role="user", content="图片中的场景是在室内还是室外?")) + messages.append(Message(role='user', content='图片中的场景是在室内还是室外?')) response = await agent.run(messages=messages) - print(f"\n[第二轮回复] {response[-1].content[:200]}...") + print(f'\n[第二轮回复] {response[-1].content[:200]}...') # 第三轮:再次追问(纯文本) - print("\n[第三轮] 再次追问") - print("-" * 70) + print('\n[第三轮] 再次追问') + print('-' * 70) messages = response - messages.append(Message(role="user", content="用一句话总结这张图片。")) + messages.append(Message(role='user', content='用一句话总结这张图片。')) response = await agent.run(messages=messages) - print(f"\n[第三轮回复] {response[-1].content[:200]}...") + print(f'\n[第三轮回复] {response[-1].content[:200]}...') - print("-" * 70) + print('-' * 70) return True except Exception as e: - print(f"\n[错误] 多轮对话失败: {e}") + print(f'\n[错误] 多轮对话失败: {e}') import traceback traceback.print_exc() return False @@ -266,40 +266,40 @@ async def test_llm_agent_multimodal_conversation(): async def main(): """运行所有测试""" - print("\n" + "=" * 70) - print("LLMAgent 多模态对话测试套件") - print("=" * 70) + print('\n' + '=' * 70) + print('LLMAgent 多模态对话测试套件') + print('=' * 70) print("\n请确保已设置环境变量: export DASHSCOPE_API_KEY='your-api-key'\n") results = [] # 测试 1: 非 stream 模式 result1 = await test_llm_agent_multimodal_non_stream() - results.append(("非 stream 模式 (URL图片)", result1)) + results.append(('非 stream 模式 (URL图片)', result1)) # 测试 2: stream 模式 result2 = await test_llm_agent_multimodal_stream() - results.append(("stream 模式 (URL图片)", result2)) + results.append(('stream 模式 (URL图片)', result2)) # 测试 3: Base64 非 stream result3 = await test_llm_agent_multimodal_base64_non_stream() - results.append(("非 
stream 模式 (Base64)", result3)) + results.append(('非 stream 模式 (Base64)', result3)) # 测试 4: 多轮对话 result4 = await test_llm_agent_multimodal_conversation() - results.append(("多轮对话", result4)) + results.append(('多轮对话', result4)) # 总结 - print("\n" + "=" * 70) - print("测试总结") - print("=" * 70) + print('\n' + '=' * 70) + print('测试总结') + print('=' * 70) for name, result in results: - status = "✓ 通过" if result else "✗ 失败" - print(f" {status} - {name}") + status = '✓ 通过' if result else '✗ 失败' + print(f' {status} - {name}') passed = sum(1 for _, r in results if r) total = len(results) - print(f"\n总计: {passed}/{total} 测试通过") + print(f'\n总计: {passed}/{total} 测试通过') return passed == total From 76ed804477cda648a08ff090d5be7de968210b73 Mon Sep 17 00:00:00 2001 From: suluyan Date: Fri, 6 Mar 2026 10:28:37 +0800 Subject: [PATCH 5/5] fix comment --- config/cfg_model_multimodal.yaml | 57 ------------------- .../zh/Components/multimodal-support.md | 50 +++++++--------- docs/zh/Components/supported-models.md | 4 ++ 3 files changed, 26 insertions(+), 85 deletions(-) delete mode 100644 config/cfg_model_multimodal.yaml rename MULTIMODAL_SUPPORT.md => docs/zh/Components/multimodal-support.md (89%) diff --git a/config/cfg_model_multimodal.yaml b/config/cfg_model_multimodal.yaml deleted file mode 100644 index f660e9e3c..000000000 --- a/config/cfg_model_multimodal.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# 多模态模型配置示例 -# 用于配置 qwen3.5-plus 等多模态模型 - -llm: - # 使用 dashscope 服务 - service: dashscope - - # 多模态模型名称 - model: qwen3.5-plus - - # DashScope API Key - # 也可以通过环境变量 DASHSCOPE_API_KEY 设置 - dashscope_api_key: - - # DashScope 兼容模式 endpoint - modelscope_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 - -# 生成配置 -generation_config: - temperature: 0.7 - top_k: 50 - top_p: 0.8 - max_tokens: 2048 - stream: true - - # 禁用思考模式(可选) - extra_body: - enable_thinking: false - -# 系统提示词 -prompt: - system: | - 你是一个多模态助手,能够理解和分析图片内容。 - - 当用户发送图片时,请仔细观察图片内容,并提供: - 1. 图片中的主要元素和对象 - 2. 场景描述(如适用) - 3. 任何文字信息(如图片中包含文字) - 4. 颜色、布局等视觉特征 - - 请用用户相同的语言回答。 - -# 最大对话轮次 -max_chat_round: 10 - -# 回调函数 -callbacks: - - input_callback - -# 工具配置(多模态对话通常不需要工具) -tools: - -help: | - 多模态模型配置示例 - 使用方法: - 1. 设置 DASHSCOPE_API_KEY 环境变量 - 2. 使用此配置运行多模态对话 diff --git a/MULTIMODAL_SUPPORT.md b/docs/zh/Components/multimodal-support.md similarity index 89% rename from MULTIMODAL_SUPPORT.md rename to docs/zh/Components/multimodal-support.md index 65ea71731..513e27a6a 100644 --- a/MULTIMODAL_SUPPORT.md +++ b/docs/zh/Components/multimodal-support.md @@ -1,4 +1,10 @@ -# ms-agent 多模态支持指南 +--- +slug: multimodal-support +title: 多模态支持 +description: Ms-Agent 多模态对话使用指南:图片理解、分析功能配置与使用方法。 +--- + +# 多模态支持 本文档介绍如何使用 ms-agent 进行多模态对话,包括图片理解和分析功能。 @@ -21,7 +27,7 @@ pip install openai ### 2. 配置 API Key -(以qwen3.5-plus为例)获取 DashScope API Key 并设置环境变量: +(以 qwen3.5-plus 为例)获取 DashScope API Key 并设置环境变量: ```bash export DASHSCOPE_API_KEY='your-dashscope-api-key' @@ -31,40 +37,28 @@ export DASHSCOPE_API_KEY='your-dashscope-api-key' ## 配置多模态模型 -### 使用配置文件 - -可以使用预定义的多模态配置文件 `config/cfg_model_multimodal.yaml`: - -```yaml -llm: - service: dashscope - model: qwen3.5-plus - dashscope_api_key: your-api-key # 或使用环境变量 - modelscope_base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 - -generation_config: - temperature: 0.7 - top_k: 50 - top_p: 0.8 - max_tokens: 2048 - stream: true - extra_body: - enable_thinking: false -``` +多模态功能主要取决于两点: +1. **选择支持多模态的模型**(如 `qwen3.5-plus`) +2. 
**使用正确的消息格式**(包含 `image_url` 块) -### 在代码中配置 +你可以在现有配置基础上,通过代码动态修改模型配置: ```python from ms_agent.config import Config -from ms_agent.llm import LLM +from ms_agent import LLMAgent +import os + +# 使用现有配置文件(如 ms_agent/agent/agent.yaml) +config = Config.from_task('ms_agent/agent/agent.yaml') -config = Config.from_task('path/to/config') +# 覆盖配置为多模态模型 config.llm.model = 'qwen3.5-plus' config.llm.service = 'dashscope' -config.llm.dashscope_api_key = 'your-api-key' +config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' -llm = LLM.from_config(config) +# 创建 LLMAgent +agent = LLMAgent(config=config) ``` ## 使用 LLMAgent 进行多模态对话 @@ -222,7 +216,7 @@ messages = [Message(role="user", content=multimodal_content)] response = llm.generate(messages=messages) ``` -### 3. 使用本地文件(Base64 编码) +### 3. 本地文件路径 ```python import base64 diff --git a/docs/zh/Components/supported-models.md b/docs/zh/Components/supported-models.md index a75013f03..654607e9e 100644 --- a/docs/zh/Components/supported-models.md +++ b/docs/zh/Components/supported-models.md @@ -48,3 +48,7 @@ llm: ``` > 如果你有其他模型provider,请协助更新此文档。 + +## 多模态支持 + +关于如何使用多模态模型(如图片理解、分析功能),请参考 [多模态支持指南](./multimodal-support.md)。
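+
+As a quick reference, a minimal sketch of a multimodal call. This assumes
+`DASHSCOPE_API_KEY` is set; the model name, config fields, and message format
+follow the multimodal support guide linked above, and the image URL is a
+placeholder:
+
+```python
+import asyncio
+import os
+
+from ms_agent import LLMAgent
+from ms_agent.config import Config
+from ms_agent.llm.utils import Message
+
+# Reuse the stock agent config and switch it to a multimodal model.
+config = Config.from_task('ms_agent/agent/agent.yaml')
+config.llm.model = 'qwen3.5-plus'
+config.llm.service = 'dashscope'
+config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '')
+config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1'
+
+# OpenAI-compatible content parts: text plus an image_url block.
+content = [
+    {'type': 'text', 'text': 'Describe this image.'},
+    {'type': 'image_url', 'image_url': {'url': 'https://example.com/image.jpg'}},  # placeholder URL
+]
+
+
+async def main():
+    agent = LLMAgent(config=config)
+    response = await agent.run(messages=[Message(role='user', content=content)])
+    print(response[-1].content)
+
+
+asyncio.run(main())
+```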