diff --git a/docs/zh/Components/multimodal-support.md b/docs/zh/Components/multimodal-support.md new file mode 100644 index 000000000..513e27a6a --- /dev/null +++ b/docs/zh/Components/multimodal-support.md @@ -0,0 +1,299 @@ +--- +slug: multimodal-support +title: 多模态支持 +description: Ms-Agent 多模态对话使用指南:图片理解、分析功能配置与使用方法。 +--- + +# 多模态支持 + +本文档介绍如何使用 ms-agent 进行多模态对话,包括图片理解和分析功能。 + +## 概述 + +ms-agent 已经支持多模态模型,如阿里云的 `qwen3.5-plus` 模型。多模态模型能够: +- 分析图片内容 +- 识别图片中的对象、场景和文字 +- 结合图片内容进行对话 + +## 前置要求 + +### 1. 安装依赖 + +确保已安装必要的依赖包: + +```bash +pip install openai +``` + +### 2. 配置 API Key + +(以 qwen3.5-plus 为例)获取 DashScope API Key 并设置环境变量: + +```bash +export DASHSCOPE_API_KEY='your-dashscope-api-key' +``` + +或者在配置文件中直接设置 `dashscope_api_key`。 + +## 配置多模态模型 + +多模态功能主要取决于两点: +1. **选择支持多模态的模型**(如 `qwen3.5-plus`) +2. **使用正确的消息格式**(包含 `image_url` 块) + +你可以在现有配置基础上,通过代码动态修改模型配置: + +```python +from ms_agent.config import Config +from ms_agent import LLMAgent +import os + +# 使用现有配置文件(如 ms_agent/agent/agent.yaml) +config = Config.from_task('ms_agent/agent/agent.yaml') + +# 覆盖配置为多模态模型 +config.llm.model = 'qwen3.5-plus' +config.llm.service = 'dashscope' +config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') +config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + +# 创建 LLMAgent +agent = LLMAgent(config=config) +``` + +## 使用 LLMAgent 进行多模态对话 + +推荐使用 `LLMAgent` 来进行多模态对话,它提供了更完整的功能,包括记忆管理、工具调用和回调支持。 + +### 基本用法 + +```python +import asyncio +import os +from ms_agent import LLMAgent +from ms_agent.config import Config +from ms_agent.llm.utils import Message + +async def multimodal_chat(): + # 创建配置 + config = Config.from_task('ms_agent/agent/agent.yaml') + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + + # 创建 LLMAgent + agent = LLMAgent(config=config) + + # 构建多模态消息 + 
multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} + ] + + # 调用 agent + response = await agent.run(messages=[Message(role="user", content=multimodal_content)]) + print(response[-1].content) + +asyncio.run(multimodal_chat()) +``` + +### 非 Stream 模式 + +```python +# 配置中禁用 stream +config.generation_config.stream = False + +agent = LLMAgent(config=config) + +multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +# 非 stream 模式:直接返回完整响应 +response = await agent.run(messages=[Message(role="user", content=multimodal_content)]) +print(f"[回复] {response[-1].content}") +print(f"[Token使用] 输入: {response[-1].prompt_tokens}, 输出: {response[-1].completion_tokens}") +``` + +### Stream 模式 + +```python +# 配置中启用 stream +config.generation_config.stream = True + +agent = LLMAgent(config=config) + +multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +# stream 模式:返回生成器 +generator = await agent.run( + messages=[Message(role="user", content=multimodal_content)], + stream=True +) + +full_response = "" +async for response_chunk in generator: + if response_chunk and len(response_chunk) > 0: + last_msg = response_chunk[-1] + if last_msg.content: + # 流式输出新增内容 + print(last_msg.content[len(full_response):], end='', flush=True) + full_response = last_msg.content + +print(f"\n[完整回复] {full_response}") +``` + +### 多轮对话 + +LLMAgent 支持多轮对话,可以在对话中混合使用图片和文本: + +```python +agent = LLMAgent(config=config, tag="multimodal_conversation") + +# 第一轮:发送图片 +multimodal_content = [ + {"type": "text", "text": "这张图片里有几个人?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +messages = [Message(role="user", content=multimodal_content)] +response = await agent.run(messages=messages) +print(f"[第一轮回复] 
{response[-1].content}") + +# 第二轮:继续追问(纯文本,保留上下文) +messages = response # 使用上一轮的回复作为上下文 +messages.append(Message(role="user", content="他们在做什么?")) +response = await agent.run(messages=messages) +print(f"[第二轮回复] {response[-1].content}") +``` + +## 多模态消息格式 + +ms-agent 使用 OpenAI 兼容的多模态消息格式。图片可以通过以下三种方式提供: + +### 1. 图片 URL + +```python +from ms_agent.llm.utils import Message + +multimodal_content = [ + {"type": "text", "text": "请描述这张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +] + +messages = [ + Message(role="user", content=multimodal_content) +] + +response = llm.generate(messages=messages) +``` + +### 2. Base64 编码 + +```python +import base64 + +# 读取并编码图片 +with open('image.jpg', 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + +multimodal_content = [ + {"type": "text", "text": "这是什么?"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_data}" + } + } +] + +messages = [Message(role="user", content=multimodal_content)] +response = llm.generate(messages=messages) +``` + +### 3. 本地文件路径 + +```python +import base64 +import os + +image_path = 'path/to/image.png' + +# 获取 MIME 类型 +ext = os.path.splitext(image_path)[1].lower() +mime_type = { + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.webp': 'image/webp' +}.get(ext, 'image/png') + +# 读取并编码 +with open(image_path, 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + +multimodal_content = [ + {"type": "text", "text": "描述这张图片。"}, + { + "type": "image_url", + "image_url": { + "url": f"data:{mime_type};base64,{image_data}" + } + } +] + +messages = [Message(role="user", content=multimodal_content)] +response = llm.generate(messages=messages) +``` + +## 运行示例 + +### 运行 Agent 示例 + +```bash +# 运行完整测试套件(包括 stream 和非 stream 模式) +python examples/agent/test_llm_agent_multimodal.py +``` + +## 常见问题 + +### Q: 图片大小有限制吗? 
+ +A: 是的,不同模型有不同的限制: +- qwen3.5-plus: 推荐图片大小不超过 4MB +- 分辨率建议不超过 2048x2048 + +### Q: 支持哪些图片格式? + +A: 通常支持: +- JPEG / JPG +- PNG +- GIF +- WebP + +### Q: 可以一次发送多张图片吗? + +A: 是的,可以在消息中添加多个 `image_url` 块: + +```python +multimodal_content = [ + {"type": "text", "text": "比较这两张图片。"}, + {"type": "image_url", "image_url": {"url": "https://example.com/img1.jpg"}}, + {"type": "image_url", "image_url": {"url": "https://example.com/img2.jpg"}} +] +``` + +### Q: 流式输出支持吗? + +A: 是的,多模态对话支持流式输出。设置 `stream: true` 即可: + +```python +config.generation_config.stream = True +response = llm.generate(messages=messages, stream=True) +``` diff --git a/docs/zh/Components/supported-models.md b/docs/zh/Components/supported-models.md index a75013f03..654607e9e 100644 --- a/docs/zh/Components/supported-models.md +++ b/docs/zh/Components/supported-models.md @@ -48,3 +48,7 @@ llm: ``` > 如果你有其他模型provider,请协助更新此文档。 + +## 多模态支持 + +关于如何使用多模态模型(如图片理解、分析功能),请参考 [多模态支持指南](./multimodal-support.md)。 diff --git a/examples/agent/test_llm_agent_multimodal.py b/examples/agent/test_llm_agent_multimodal.py new file mode 100644 index 000000000..3ae2eaa5e --- /dev/null +++ b/examples/agent/test_llm_agent_multimodal.py @@ -0,0 +1,309 @@ +""" +LLMAgent 多模态对话测试 + +从 LLMAgent 层面测试多模态功能,覆盖 stream 和非 stream 两种模式。 +""" +import asyncio +import os +import sys +import uuid + +from ms_agent import LLMAgent +from ms_agent.config import Config +from ms_agent.llm.utils import Message + +# 获取脚本所在目录 +path = os.path.dirname(os.path.abspath(__file__)) +agent_config = os.path.join(path, '..', '..', 'ms_agent', 'agent', 'agent.yaml') + +# 测试图片 URL +TEST_IMAGE_URL = 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg' + + +def _create_multimodal_config(stream: bool = False): + """ + 创建多模态配置 + + Args: + stream: 是否启用流式输出 + + Returns: + Config: 配置好的 Config 对象,如果 API Key 未设置则返回 None + """ + config = Config.from_task(agent_config) + + # 配置多模态模型 + config.llm.model = 'qwen3.5-plus' + config.llm.service = 'dashscope' + 
config.llm.dashscope_api_key = os.environ.get('DASHSCOPE_API_KEY', '') + config.llm.modelscope_base_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1' + + # 禁用 load_cache 和 callbacks(避免交互式输入问题) + config.generation_config.stream = stream + config.load_cache = False + config.callbacks = [] + + if not config.llm.dashscope_api_key: + print('[错误] 未设置 DASHSCOPE_API_KEY 环境变量') + print("请先设置: export DASHSCOPE_API_KEY='your-api-key'") + return None + + return config + + +async def test_llm_agent_multimodal_non_stream(): + """ + 测试 LLMAgent 非 stream 模式的多模态对话 + """ + print('=' * 70) + print('测试 1: LLMAgent 非 stream 模式 - 多模态对话 (URL 图片)') + print('=' * 70) + + config = _create_multimodal_config(stream=False) + if not config: + return False + + # 创建 LLMAgent,使用唯一 tag 避免历史记录的干扰 + tag = f'multimodal_test_{uuid.uuid4().hex[:8]}' + agent = LLMAgent(config=config, tag=tag) + + # 构建多模态内容 + multimodal_content = [ + {'type': 'text', 'text': '请详细描述这张图片中的内容。'}, + {'type': 'image_url', 'image_url': {'url': TEST_IMAGE_URL}} + ] + + try: + print(f'\n[发送] 请描述这张图片: {TEST_IMAGE_URL}') + print('-' * 70) + + messages = [ + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) + ] + + response = await agent.run(messages=messages) + + print(f'\n[回复] {response[-1].content}') + print('-' * 70) + print(f'\n[Token使用] 输入: {response[-1].prompt_tokens}, 输出: {response[-1].completion_tokens}') + + return True + except Exception as e: + print(f'\n[错误] 非 stream 多模态对话失败: {e}') + import traceback + traceback.print_exc() + return False + + +async def test_llm_agent_multimodal_stream(): + """ + 测试 LLMAgent stream 模式的多模态对话 + """ + print('\n' + '=' * 70) + print('测试 2: LLMAgent stream 模式 - 多模态对话 (URL 图片)') + print('=' * 70) + + config = _create_multimodal_config(stream=True) + if not config: + return False + + # 创建 LLMAgent,使用唯一 tag + tag = f'multimodal_stream_{uuid.uuid4().hex[:8]}' + agent = LLMAgent(config=config, tag=tag) + + # 构建多模态内容 + multimodal_content = [ + 
{'type': 'text', 'text': '请用中文描述这张图片中的内容。'}, + {'type': 'image_url', 'image_url': {'url': TEST_IMAGE_URL}} + ] + + try: + print(f'\n[发送] 请描述这张图片: {TEST_IMAGE_URL}') + print('-' * 70) + print('[回复开始]') + + messages = [ + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) + ] + + # stream 模式调用 + generator = await agent.run(messages=messages, stream=True) + + full_response = '' + async for response_chunk in generator: + if response_chunk and len(response_chunk) > 0: + last_msg = response_chunk[-1] + if last_msg.content and len(last_msg.content) > len(full_response): + # 流式输出新增内容 + sys.stdout.write(last_msg.content[len(full_response):]) + sys.stdout.flush() + full_response = last_msg.content + + print('\n' + '-' * 70) + print(f'\n[完整回复长度] {len(full_response)} 字符') + return True + except Exception as e: + print(f'\n[错误] stream 多模态对话失败: {e}') + import traceback + traceback.print_exc() + return False + + +async def test_llm_agent_multimodal_base64_non_stream(): + """ + 测试 LLMAgent 非 stream 模式 - Base64 编码图片 + """ + print('\n' + '=' * 70) + print('测试 3: LLMAgent 非 stream 模式 - Base64 编码图片') + print('=' * 70) + + import base64 + + config = _create_multimodal_config(stream=False) + if not config: + return False + + # 创建 LLMAgent,使用唯一 tag + tag = f'multimodal_base64_{uuid.uuid4().hex[:8]}' + agent = LLMAgent(config=config, tag=tag) + + # 一个简单的测试图片 base64 (10x10 像素) + test_image_base64 = 'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FABJADq0/8ZEPAAAAAElFTkSuQmCC' + + multimodal_content = [ + {'type': 'text', 'text': '这是一个什么颜色的图片?请用中文简短回答。'}, + { + 'type': 'image_url', + 'image_url': { + 'url': f'data:image/png;base64,{test_image_base64}' + } + } + ] + + try: + print('\n[发送] 这是什么颜色的图片?(Base64 编码)') + print('-' * 70) + + messages = [ + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) + ] + + response = await agent.run(messages=messages) + + print(f'\n[回复] 
{response[-1].content}') + print('-' * 70) + return True + except Exception as e: + print(f'\n[错误] Base64 多模态对话失败: {e}') + import traceback + traceback.print_exc() + return False + + +async def test_llm_agent_multimodal_conversation(): + """ + 测试 LLMAgent 多轮对话中的多模态功能 + """ + print('\n' + '=' * 70) + print('测试 4: LLMAgent 多轮对话 - 多模态 + 文本混合') + print('=' * 70) + + config = _create_multimodal_config(stream=False) + if not config: + return False + + # 创建 LLMAgent,使用唯一 tag + tag = f'multimodal_conv_{uuid.uuid4().hex[:8]}' + agent = LLMAgent(config=config, tag=tag) + + try: + # 第一轮:发送图片 + print('\n[第一轮] 发送图片并询问') + print('-' * 70) + + multimodal_content = [ + {'type': 'text', 'text': '这张图片里有几个人?'}, + {'type': 'image_url', 'image_url': {'url': TEST_IMAGE_URL}} + ] + + messages = [ + Message(role='system', content='你是一个多模态助手。'), + Message(role='user', content=multimodal_content) + ] + response = await agent.run(messages=messages) + print(f'\n[第一轮回复] {response[-1].content[:200]}...') + + # 第二轮:继续追问(纯文本) + print('\n[第二轮] 继续追问') + print('-' * 70) + + # 保留历史记录,添加新的用户消息 + messages = response + messages.append(Message(role='user', content='图片中的场景是在室内还是室外?')) + response = await agent.run(messages=messages) + print(f'\n[第二轮回复] {response[-1].content[:200]}...') + + # 第三轮:再次追问(纯文本) + print('\n[第三轮] 再次追问') + print('-' * 70) + + messages = response + messages.append(Message(role='user', content='用一句话总结这张图片。')) + response = await agent.run(messages=messages) + print(f'\n[第三轮回复] {response[-1].content[:200]}...') + + print('-' * 70) + return True + except Exception as e: + print(f'\n[错误] 多轮对话失败: {e}') + import traceback + traceback.print_exc() + return False + + +async def main(): + """运行所有测试""" + print('\n' + '=' * 70) + print('LLMAgent 多模态对话测试套件') + print('=' * 70) + print("\n请确保已设置环境变量: export DASHSCOPE_API_KEY='your-api-key'\n") + + results = [] + + # 测试 1: 非 stream 模式 + result1 = await test_llm_agent_multimodal_non_stream() + results.append(('非 stream 模式 (URL图片)', result1)) + + # 测试 
2: stream 模式 + result2 = await test_llm_agent_multimodal_stream() + results.append(('stream 模式 (URL图片)', result2)) + + # 测试 3: Base64 非 stream + result3 = await test_llm_agent_multimodal_base64_non_stream() + results.append(('非 stream 模式 (Base64)', result3)) + + # 测试 4: 多轮对话 + result4 = await test_llm_agent_multimodal_conversation() + results.append(('多轮对话', result4)) + + # 总结 + print('\n' + '=' * 70) + print('测试总结') + print('=' * 70) + for name, result in results: + status = '✓ 通过' if result else '✗ 失败' + print(f' {status} - {name}') + + passed = sum(1 for _, r in results if r) + total = len(results) + print(f'\n总计: {passed}/{total} 测试通过') + + return passed == total + + +if __name__ == '__main__': + success = asyncio.run(main()) + sys.exit(0 if success else 1) diff --git a/ms_agent/agent/llm_agent.py b/ms_agent/agent/llm_agent.py index 8602d1621..740eab690 100644 --- a/ms_agent/agent/llm_agent.py +++ b/ms_agent/agent/llm_agent.py @@ -720,13 +720,30 @@ async def condense_memory(self, messages: List[Message]) -> List[Message]: messages = await memory_tool.run(messages) return messages - def log_output(self, content: str): + def log_output(self, content: Union[str, list]): """ Log formatted output with a tag prefix. Args: - content (str): Content to log. + content (Union[str, list]): Content to log. Can be a string or a list (for multimodal content). 
""" + # Handle multimodal content (list type) + if isinstance(content, list): + # Extract text from multimodal content + text_parts = [] + for item in content: + if isinstance(item, dict): + if item.get('type') == 'text': + text_parts.append(item.get('text', '')) + elif item.get('type') == 'image_url': + img_url = item.get('image_url', {}).get('url', '') + text_parts.append(f'[Image: {img_url[:50]}...]') + content = ' '.join(text_parts) + + # Ensure content is a string + if not isinstance(content, str): + content = str(content) + if len(content) > 1024: content = content[:512] + '\n...\n' + content[-512:] for line in content.split('\n'): diff --git a/ms_agent/llm/openai_llm.py b/ms_agent/llm/openai_llm.py index d9b9179ba..dadc1bf1c 100644 --- a/ms_agent/llm/openai_llm.py +++ b/ms_agent/llm/openai_llm.py @@ -572,6 +572,7 @@ def _format_input_message(self, openai_messages = [] for idx, message in enumerate(messages): if isinstance(message, Message): + # Only strip string content, keep list content as-is for multimodal if isinstance(message.content, str): message.content = message.content.strip() message = message.to_dict_clean() @@ -579,25 +580,32 @@ def _format_input_message(self, message = dict(message) content = message.get('content', '') + # Only strip string content, multimodal content (list) should be kept as-is if isinstance(content, str): content = content.strip() # Apply prefix cache structured content transformation + # Only for string content, multimodal content is already structured if cache_indice is not None and idx == cache_indice: content = self._to_structured_content( content, add_cache_control=True, provider=self._prefix_cache_provider) - message = { - key: value.strip() if isinstance(value, str) else value - for key, value in message.items() - if key in self.input_msg and value - } - if 'content' not in message: - message['content'] = '' - message['content'] = content if content else '' - - openai_messages.append(message) + # Build the message 
dict, handling both string and multimodal content + formatted_message = {} + for key, value in message.items(): + if key in self.input_msg: + # Only strip string values, keep other types as-is + if isinstance(value, str): + formatted_message[key] = value.strip() if value else '' + else: + formatted_message[key] = value + + # Always use the transformed content to support features like prefix caching + # The content variable has been processed by _to_structured_content() if needed + formatted_message['content'] = content + + openai_messages.append(formatted_message) return openai_messages