ModelEngine-Group · Dallas98 · May 28, 2026 · May 17, 2026 · May 17, 2026 · May 26, 2026
@@ -15,6 +15,11 @@
 
 logger = logging.getLogger("model_health_service")
 
+# Provider identifiers; compared against the lower-cased ``model_factory`` value.
+DASHSCOPE_MODEL_FACTORY = "dashscope"
+TOKENPONY_MODEL_FACTORY = "tokenpony"
+# Providers whose multimodal health check is answered by looking the model up
+# in the provider's model catalog (see _provider_catalog_connectivity_check).
+PROVIDER_CATALOG_HEALTHCHECK_FACTORIES = {DASHSCOPE_MODEL_FACTORY, TOKENPONY_MODEL_FACTORY}
+# Model types that are eligible for the catalog-based health check.
+PROVIDER_CATALOG_HEALTHCHECK_TYPES = {"vlm", "vlm2", "vlm3"}
+
 
 def _mask_secret(value: Optional[str]) -> str:
     """Mask a secret value, showing only first and last 4 characters."""
@@ -64,7 +69,32 @@
         raise ValueError(f"Unsupported model type: {model_type}")
 
 
+async def _provider_catalog_connectivity_check(
+    model_name: str,
+    model_type: str,
+    model_api_key: str,
+    model_factory: Optional[str],
+) -> bool:
+    """Check a provider-managed multimodal model against the provider catalog.
+
+    Args:
+        model_name: Model identifier; compared case-insensitively with each
+            catalog entry's ``id``.
+        model_type: Model type forwarded to the catalog query.
+        model_api_key: API key forwarded to the catalog query.
+        model_factory: Provider name; ``None`` is treated as an empty string.
+
+    Returns:
+        ``True`` only when the provider is one of
+        ``PROVIDER_CATALOG_HEALTHCHECK_FACTORIES``, the catalog is non-empty,
+        no entry carries a truthy ``_error`` value, and one entry's ``id``
+        equals ``model_name`` ignoring case. ``False`` otherwise.
+    """
+    factory_key = (model_factory or "").lower()
+    if factory_key not in PROVIDER_CATALOG_HEALTHCHECK_FACTORIES:
+        return False
+
+    # Kept as a function-scope import, exactly as before.
+    from services.model_provider_service import get_provider_models
+
+    catalog_query = {
+        "provider": factory_key,
+        "model_type": model_type,
+        "api_key": model_api_key,
+    }
+    catalog = await get_provider_models(catalog_query)
+    if not catalog:
+        return False
+
+    # A single error-marked entry makes the whole listing untrustworthy.
+    for entry in catalog:
+        if entry.get("_error"):
+            return False
+
+    wanted_id = model_name.lower()
+    for entry in catalog:
+        if str(entry.get("id", "")).lower() == wanted_id:
+            return True
+    return False
+
+
 async def _perform_connectivity_check(
     model_name: str,
     model_type: str,
    model_base_url: str,
@@ -135,6 +165,18 @@
         )
         connectivity = await rerank_model.connectivity_check()
     elif model_type in ("vlm", "vlm2", "vlm3"):
+        if (
+            model_type in PROVIDER_CATALOG_HEALTHCHECK_TYPES
+            and (model_factory or "").lower() in PROVIDER_CATALOG_HEALTHCHECK_FACTORIES
+        ):
+            connectivity = await _provider_catalog_connectivity_check(
+                model_name=model_name,
+                model_type=model_type,
+                model_api_key=model_api_key,
+                model_factory=model_factory,
+            )
+            return connectivity
+
         observer = MessageObserver()
         set_monitoring_operation("connectivity_check",
                                  display_name=display_name)

@@ -8,7 +8,6 @@
 from database.model_management_db import (
     create_model_record,
     delete_model_record,
-    get_model_by_display_name,
     get_model_by_name_factory,
     get_models_by_display_name,
     get_model_records,
@@ -32,6 +31,23 @@
 
 logger = logging.getLogger("model_management_service")
 
+INDEPENDENT_MULTIMODAL_MODEL_TYPES = {"vlm", "vlm2", "vlm3"}
+
+
+def _has_display_name_conflict(existing_models: List[Dict[str, Any]], model_type: Optional[str]) -> bool:
+    """Decide whether a display name already in use blocks a new model.
+
+    Args:
+        existing_models: Records that already use the display name.
+        model_type: Type of the model being created.
+
+    Returns:
+        ``False`` when there are no existing records. For a non-multimodal
+        ``model_type`` any existing record is a conflict. For ``vlm``/``vlm2``/
+        ``vlm3`` a conflict exists only if some record has the same type or a
+        type outside the multimodal set, so the three slots may share a name.
+    """
+    if not existing_models:
+        return False
+
+    if model_type not in INDEPENDENT_MULTIMODAL_MODEL_TYPES:
+        return True
+
+    for record in existing_models:
+        if record.get("model_type") == model_type:
+            return True
+        if record.get("model_type") not in INDEPENDENT_MULTIMODAL_MODEL_TYPES:
+            return True
+    return False
+
 
 async def create_model_for_tenant(user_id: str, tenant_id: str, model_data: Dict[str, Any]):
     """Create a single model record for the given tenant.
@@ -77,9 +93,9 @@ async def create_model_for_tenant(user_id: str, tenant_id: str, model_data: Dict
 
         # Check display name conflict scoped by tenant
         if model_data.get("display_name"):
-            existing_model_by_display = get_model_by_display_name(
+            existing_models_by_display = get_models_by_display_name(
                 model_data["display_name"], tenant_id)
-            if existing_model_by_display:
+            if _has_display_name_conflict(existing_models_by_display, model_data.get("model_type")):
                 logging.error(
                     f"Name {model_data['display_name']} is already in use, please choose another display name")
                 raise ValueError(

@@ -6,6 +6,75 @@
 from services.providers.base import AbstractModelProvider, _classify_provider_error
 
 
+# Substrings looked up in the model id to recognise image-generation models.
+DASHSCOPE_IMAGE_GENERATION_KEYWORDS = (
+    "image",
+    "wanx",
+    "aitryon",
+    "tryon",
+    "flux",
+    "stable-diffusion",
+    "sdxl",
+)
+# Substrings that mark an image-understanding model. Checked against the model
+# id alone and against "id + lower-cased description".
+# The "qwen*-vl" entries all contain "-vl", so they are redundant with it.
+# NOTE(review): "qwen3.6" / "qwen-3.6" match every id containing them, with or
+# without a "-vl" suffix — presumably intentional; confirm.
+DASHSCOPE_IMAGE_UNDERSTANDING_KEYWORDS = (
+    "qwen-vl",
+    "qwen2-vl",
+    "qwen2.5-vl",
+    "qwen3-vl",
+    "qwen3.5-vl",
+    "qwen3.6-vl",
+    "-vl",
+    "vl-",
+    "vision",
+    "visual",
+    "ocr",
+    "qwen3.6",
+    "qwen-3.6",
+)
+# Substrings that mark a video-understanding model; checked against
+# "id + lower-cased description".
+DASHSCOPE_VIDEO_UNDERSTANDING_KEYWORDS = ("omni", "video-understanding", "video-ocr")
+
+
+def _modality_set(value) -> set:
+    """Normalise a modality field into a set of lower-cased strings.
+
+    A falsy value yields an empty set, a single string yields a one-element
+    set, and any other iterable yields ``str(item).lower()`` for each item.
+    """
+    if isinstance(value, str):
+        return {value.lower()} if value else set()
+    if not value:
+        return set()
+    normalised = set()
+    for item in value:
+        normalised.add(str(item).lower())
+    return normalised
+
+
+def _has_keyword(text: str, keywords: tuple) -> bool:
+    """Return True if any entry of ``keywords`` is a substring of ``text``."""
+    for candidate in keywords:
+        if candidate in text:
+            return True
+    return False
+
+
+def _is_dashscope_explicit_image_understanding_model(model_id: str) -> bool:
+    """Return True if the model id contains an image-understanding keyword."""
+    for marker in DASHSCOPE_IMAGE_UNDERSTANDING_KEYWORDS:
+        if marker in model_id:
+            return True
+    return False
+
+
+def _is_dashscope_image_generation_model(model_id: str, desc: str, req_mods: set, res_mods: set) -> bool:
+    """Return True if the model looks like an image-generation model.
+
+    An id that carries an image-understanding keyword is never treated as a
+    generator. Otherwise the model qualifies when "image" is among its response
+    modalities or its id contains an image-generation keyword. ``desc`` and
+    ``req_mods`` are accepted but not consulted here.
+    """
+    if not _is_dashscope_explicit_image_understanding_model(model_id):
+        if "image" in res_mods:
+            return True
+        return _has_keyword(model_id, DASHSCOPE_IMAGE_GENERATION_KEYWORDS)
+    return False
+
+
+def _is_dashscope_video_understanding_model(model_id: str, desc: str, req_mods: set, res_mods: set) -> bool:
+    """Return True if the model looks like a video-understanding model.
+
+    Args:
+        model_id: Model identifier searched for video-understanding keywords.
+        desc: Model description; ``None`` is treated as an empty string.
+        req_mods: Lower-cased request modalities.
+        res_mods: Lower-cased response modalities.
+
+    Returns:
+        ``True`` when the model accepts "video" and produces "text", or when
+        the id or lower-cased description contains one of
+        ``DASHSCOPE_VIDEO_UNDERSTANDING_KEYWORDS``.
+    """
+    if "video" in req_mods and "text" in res_mods:
+        return True
+    # Built after the modality shortcut, with a None guard, so a missing
+    # description can no longer raise AttributeError.
+    searchable_text = f"{model_id} {(desc or '').lower()}"
+    return _has_keyword(searchable_text, DASHSCOPE_VIDEO_UNDERSTANDING_KEYWORDS)
+
+
+def _is_dashscope_image_understanding_model(model_id: str, desc: str, req_mods: set, res_mods: set) -> bool:
+    """Return True if the model looks like an image-understanding model.
+
+    Image-generation and video-understanding models are excluded first. A
+    remaining model qualifies when it accepts "image" or "video" and produces
+    "text", or when its id or lower-cased description contains one of
+    ``DASHSCOPE_IMAGE_UNDERSTANDING_KEYWORDS``.
+
+    NOTE(review): the check this replaces also treated a description containing
+    '视觉' as a vision model; no keyword here covers that — confirm the drop
+    is intended.
+    """
+    if _is_dashscope_image_generation_model(model_id, desc, req_mods, res_mods):
+        return False
+    if _is_dashscope_video_understanding_model(model_id, desc, req_mods, res_mods):
+        return False
+    if ("image" in req_mods or "video" in req_mods) and "text" in res_mods:
+        return True
+    # Treat a None description as empty when building the search text.
+    searchable_text = f"{model_id} {(desc or '').lower()}"
+    # The id is part of searchable_text, so one keyword scan covers both the
+    # id-only match and the description match.
+    return _has_keyword(searchable_text, DASHSCOPE_IMAGE_UNDERSTANDING_KEYWORDS)
+
+
 class DashScopeModelProvider(AbstractModelProvider):
     """Concrete implementation for DashScope (Aliyun) provider."""
 
@@ -57,6 +126,8 @@
             categorized_models = {
                 "chat": [],  # Maps to "llm"
                 "vlm": [],  # Maps to "vlm"
+                "vlm2": [],  # Maps to image generation models
+                "vlm3": [],  # Maps to video understanding models
                 "embedding": [],  # Maps to "embedding" / "multi_embedding"
                 "rerank": [],  # Maps to "rerank"
                 "tts": [],  # Maps to "tts"
@@ -71,6 +142,8 @@
                 metadata = model_obj.get('inference_metadata') or {}
                 req_mod = metadata.get('request_modality', [])
                 res_mod = metadata.get('response_modality', [])
+                req_mods = _modality_set(req_mod)
+                res_mods = _modality_set(res_mod)
                 model_obj.setdefault("object", model_obj.get("object", "model"))
                 model_obj.setdefault("owned_by", model_obj.get("owned_by", "dashscope"))
                 cleaned_model = {
@@ -107,8 +180,17 @@
                     continue
 
                 # 5. VLM
-                vision_mods = {'Image', 'Video'}
-                if (set(req_mod) & vision_mods) or (set(res_mod) & vision_mods) or '视觉' in desc:
+                if _is_dashscope_video_understanding_model(m_id, desc, req_mods, res_mods):
+                    cleaned_model.update({"model_tag": "chat", "model_type": "vlm3"})
+                    categorized_models['vlm3'].append(cleaned_model)
+                    continue
+
+                if _is_dashscope_image_generation_model(m_id, desc, req_mods, res_mods):
+                    cleaned_model.update({"model_tag": "chat", "model_type": "vlm2"})
+                    categorized_models['vlm2'].append(cleaned_model)
+                    continue
+
+                if _is_dashscope_image_understanding_model(m_id, desc, req_mods, res_mods):
                     cleaned_model.update({"model_tag": "chat", "model_type": "vlm"})
                     categorized_models['vlm'].append(cleaned_model)
                     continue
@@ -124,7 +206,10 @@
             elif target_model_type in ("embedding", "multi_embedding"):
                 return categorized_models["embedding"]
             elif target_model_type in categorized_models:
-                return categorized_models[target_model_type]
+                return [
+                    {**model, "model_type": target_model_type}
+                    for model in categorized_models[target_model_type]
+                ]
             else:
                 return []
         except (httpx.HTTPStatusError, httpx.ConnectTimeout, httpx.ConnectError, Exception) as e:

@@ -9,10 +9,68 @@
 from services.providers.base import AbstractModelProvider, _classify_provider_error
 
 
+# Substrings looked up in the model id to recognise image-understanding models.
+# The "qwen*-vl" entries all contain "-vl", so they are redundant with it.
+TOKENPONY_IMAGE_UNDERSTANDING_KEYWORDS = (
+    "qwen-vl",
+    "qwen2-vl",
+    "qwen2.5-vl",
+    "qwen3-vl",
+    "qwen3.5-vl",
+    "qwen3.6-vl",
+    "-vl",
+    "vl-",
+    "vision",
+    "visual",
+    "ocr",
+    "gpt-4o",
+    "qwen3.6",
+    "qwen-3.6",
+)
+# Substrings looked up in the model id to recognise image-generation models.
+TOKENPONY_IMAGE_GENERATION_KEYWORDS = (
+    "image",
+    "dall",
+    "flux",
+    "stable-diffusion",
+    "sdxl",
+    "midjourney",
+    "wanx",
+    "kolors",
+    "seedream",
+    "ideogram",
+    "recraft",
+)
+# Substrings looked up in the model id to recognise video-understanding models.
+# NOTE(review): the bare "video" entry matches any id containing "video",
+# which presumably includes video-generation models — confirm this is wanted.
+TOKENPONY_VIDEO_UNDERSTANDING_KEYWORDS = ("omni", "video")
+
+
+def _has_keyword(text: str, keywords: tuple) -> bool:
+    """Tell whether ``text`` contains at least one of ``keywords`` as a substring."""
+    every_keyword_absent = all(keyword not in text for keyword in keywords)
+    return not every_keyword_absent
+
+
+def _is_tokenpony_explicit_image_understanding_model(model_id: str) -> bool:
+    """Return True if the model id contains an image-understanding keyword."""
+    return any(marker in model_id for marker in TOKENPONY_IMAGE_UNDERSTANDING_KEYWORDS)
+
+
+def _is_tokenpony_image_generation_model(model_id: str) -> bool:
+    """Return True if the model id looks like an image-generation model.
+
+    An id carrying an image-understanding keyword is never a generator;
+    otherwise the id must contain an image-generation keyword.
+    """
+    understands_images = _is_tokenpony_explicit_image_understanding_model(model_id)
+    return (not understands_images) and _has_keyword(model_id, TOKENPONY_IMAGE_GENERATION_KEYWORDS)
+
+
+def _is_tokenpony_video_understanding_model(model_id: str) -> bool:
+    """Return True if the model id contains a video-understanding keyword."""
+    for marker in TOKENPONY_VIDEO_UNDERSTANDING_KEYWORDS:
+        if marker in model_id:
+            return True
+    return False
+
+
+def _is_tokenpony_image_understanding_model(model_id: str) -> bool:
+    """Return True if the model id looks like an image-understanding model.
+
+    Ids classified as image generation or video understanding are excluded;
+    what remains must contain an image-understanding keyword.
+    """
+    claimed_elsewhere = (
+        _is_tokenpony_image_generation_model(model_id)
+        or _is_tokenpony_video_understanding_model(model_id)
+    )
+    return (not claimed_elsewhere) and _is_tokenpony_explicit_image_understanding_model(model_id)
+
+
 class TokenPonyModelProvider(AbstractModelProvider):
     """Concrete implementation for TokenPony provider."""
 
    async def get_models(self, provider_config: Dict) -> List[Dict]:
        """
        Fetch models from TokenPony API, categorize them based on modality/ID,
        and return the requested model type.
@@ -46,6 +104,8 @@
             categorized_models = {
                 "chat": [],       # Maps to "llm"
                 "vlm": [],        # Maps to "vlm"
+                "vlm2": [],       # Maps to image generation models
+                "vlm3": [],       # Maps to video understanding models
                 "embedding": [],  # Maps to "embedding" / "multi_embedding"
                 "rerank": [],   # Maps to "rerank"
                 "tts": [],        # Maps to "tts"
@@ -86,9 +146,14 @@
                     cleaned_model.update({"model_tag": "tts", "model_type": "tts"})
                     categorized_models['tts'].append(cleaned_model)
 
-                # 5. VLM (Vision Language Model / Image & Video Generation)
-
-                elif any(keyword in m_id for keyword in ['-vl', 'vl-', 'ocr', 'vision']):
+                # 5. Multimodal models
+                elif _is_tokenpony_video_understanding_model(m_id):
+                    cleaned_model.update({"model_tag": "chat", "model_type": "vlm3"})
+                    categorized_models['vlm3'].append(cleaned_model)
+                elif _is_tokenpony_image_generation_model(m_id):
+                    cleaned_model.update({"model_tag": "chat", "model_type": "vlm2"})
+                    categorized_models['vlm2'].append(cleaned_model)
+                elif _is_tokenpony_image_understanding_model(m_id):
                     cleaned_model.update({"model_tag": "chat", "model_type": "vlm"})
                     categorized_models['vlm'].append(cleaned_model)
 
@@ -104,7 +169,10 @@
             elif target_model_type in ("embedding", "multi_embedding"):
                 return categorized_models["embedding"]
             elif target_model_type in categorized_models:
-                return categorized_models[target_model_type]
+                return [
+                    {**model, "model_type": target_model_type}
+                    for model in categorized_models[target_model_type]
+                ]
             else:
                 return []
 

@@ -113,11 +113,10 @@ export default function ToolManagement({
   // Use tool list hook for data management
   const { availableTools } = useToolList();
 
-  const { isVlmAvailable, isEmbeddingAvailable, isMultiEmbeddingAvailable } = useConfig();
-  const isEmbeddingOrMultiAvailable = isEmbeddingAvailable || isMultiEmbeddingAvailable;
   const {
     isImageUnderstandingAvailable,
     isVideoUnderstandingAvailable,
+    isEmbeddingAvailable,
   } = useConfig();
 
   // Prefetch knowledge bases for KB tools
@@ -383,10 +382,7 @@ export default function ToolManagement({
                             isImageUnderstandingAvailable,
                             isVideoUnderstandingAvailable
                           );
-                          const isDisabledDueToEmbedding = isToolDisabledDueToEmbedding(
-                            tool.name,
-                            isEmbeddingOrMultiAvailable
-                          );
+                          const isDisabledDueToEmbedding = isToolDisabledDueToEmbedding(tool.name, isEmbeddingAvailable);
                           const isDisabled = isDisabledDueToVlm || isDisabledDueToEmbedding || isReadOnly;
                           // Tooltip priority: permission > VLM > Embedding
                           const tooltipTitle = isReadOnly
@@ -495,10 +491,7 @@ export default function ToolManagement({
                   isImageUnderstandingAvailable,
                   isVideoUnderstandingAvailable
                 );
-                const isDisabledDueToEmbedding = isToolDisabledDueToEmbedding(
-                  tool.name,
-                  isEmbeddingOrMultiAvailable
-                );
+                const isDisabledDueToEmbedding = isToolDisabledDueToEmbedding(tool.name, isEmbeddingAvailable);
                 const isDisabled = isDisabledDueToVlm || isDisabledDueToEmbedding || isReadOnly;
                 // Tooltip priority: permission > VLM > Embedding
                 const tooltipTitle = isReadOnly