From 80ac81a6c2da832141871b95652d676d4897f10b Mon Sep 17 00:00:00 2001
From: "nap.liu" <liuxi@yeyecha.com>
Date: Wed, 8 Apr 2026 21:40:41 +0800
Subject: [PATCH] feat: rehydrate image context in multi-turn conversations for
 vision models

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/app/api/websocket.py          |   5 ++
 backend/app/services/image_context.py | 112 ++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100644 backend/app/services/image_context.py

diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py
index dd466ed2e..5a61cdfba 100644
--- a/backend/app/api/websocket.py
+++ b/backend/app/api/websocket.py
@@ -168,6 +168,11 @@ async def call_llm(
     # Load tools dynamically from DB
     tools_for_llm = await get_agent_tools_for_llm(agent_id) if agent_id else AGENT_TOOLS
 
+    # Re-hydrate image context from previous turns for vision-capable models
+    if supports_vision and agent_id:
+        from app.services.image_context import rehydrate_image_messages
+        messages = rehydrate_image_messages(messages, agent_id)
+
     # Convert messages to LLMMessage format
     api_messages = [LLMMessage(role="system", content=static_prompt, dynamic_content=dynamic_prompt)]
     for msg in messages:
diff --git a/backend/app/services/image_context.py b/backend/app/services/image_context.py
new file mode 100644
index 000000000..c4bf9eb64
--- /dev/null
+++ b/backend/app/services/image_context.py
@@ -0,0 +1,112 @@
+"""Re-hydrate image content from disk for LLM multi-turn context.
+
+Scans history messages for [file:xxx.jpg] patterns,
+reads the image file from agent workspace, and injects base64 data
+so the LLM can see images from previous turns.
+"""
+
+import base64
+import re
+from pathlib import Path
+from typing import Optional
+
+from loguru import logger
+from app.config import get_settings
+
+IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
+FILE_PATTERN = re.compile(r'\[file:([^\]]+)\]')
+IMAGE_DATA_PATTERN = re.compile(
+    r'\[image_data:data:image/[^;]+;base64,[A-Za-z0-9+/=]+\]'
+)
+MAX_IMAGE_BYTES = 5 * 1024 * 1024  # 5MB per image
+
+
+def rehydrate_image_messages(
+    messages: list[dict],
+    agent_id,
+    max_images: int = 3,
+) -> list[dict]:
+    """Scan history for [file:xxx.jpg] and inject base64 image data for LLM.
+
+    Only processes the most recent `max_images` user image messages
+    to limit context size and cost.
+
+    Args:
+        messages: List of {"role": ..., "content": ...} dicts
+        agent_id: Agent UUID for resolving file paths
+        max_images: Max number of historical images to re-hydrate
+
+    Returns:
+        New list with image messages enriched with base64 data.
+        Non-image messages and messages with existing image_data are unchanged.
+    """
+    settings = get_settings()
+    upload_dir = (
+        Path(settings.AGENT_DATA_DIR) / str(agent_id) / "workspace" / "uploads"
+    )
+
+    # Find user messages with [file:xxx.jpg] (newest first, skip current turn)
+    image_indices: list[tuple[int, str]] = []  # (index, filename)
+    for i in range(len(messages) - 1, -1, -1):
+        msg = messages[i]
+        if msg.get("role") != "user":
+            continue
+        content = msg.get("content", "")
+        if not isinstance(content, str):
+            continue
+        # Skip if already has image_data (current turn)
+        if "[image_data:" in content:
+            continue
+        match = FILE_PATTERN.search(content)
+        if not match:
+            continue
+        filename = match.group(1)
+        ext = Path(filename).suffix.lower()
+        if ext not in IMAGE_EXTENSIONS:
+            continue
+        image_indices.append((i, filename))
+        if len(image_indices) >= max_images:
+            break
+
+    if not image_indices:
+        return messages
+
+    # Re-hydrate in-place (working on a copy)
+    result = list(messages)
+    rehydrated = 0
+
+    for idx, filename in image_indices:
+        file_path = upload_dir / filename
+        if not file_path.exists():
+            logger.warning(f"[ImageContext] File not found: {file_path}")
+            continue
+        try:
+            img_bytes = file_path.read_bytes()
+            if len(img_bytes) > MAX_IMAGE_BYTES:
+                logger.info(
+                    f"[ImageContext] Skipping large image: "
+                    f"{filename} ({len(img_bytes)} bytes)"
+                )
+                continue
+
+            b64 = base64.b64encode(img_bytes).decode("ascii")
+            ext = file_path.suffix.lower().lstrip('.')
+            mime = f"image/{'jpeg' if ext == 'jpg' else ext}"
+            marker = f"[image_data:data:{mime};base64,{b64}]"
+
+            # Append image_data marker to existing content
+            old_content = result[idx]["content"]
+            result[idx] = {**result[idx], "content": f"{old_content}\n{marker}"}
+            rehydrated += 1
+            logger.debug(f"[ImageContext] Re-hydrated: {filename}")
+
+        except Exception as e:
+            logger.error(f"[ImageContext] Failed to read {filename}: {e}")
+
+    if rehydrated > 0:
+        logger.info(
+            f"[ImageContext] Re-hydrated {rehydrated} image(s) "
+            f"for agent {agent_id}"
+        )
+
+    return result