From a1d1a50bd8d584bc6de99445e27e79e0bfaf7ef1 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Tue, 2 Sep 2025 00:06:41 +0800 Subject: [PATCH 1/9] add protocols form vllm Signed-off-by: JaredforReal --- src/vllm_router/protocols.py | 71 +++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/src/vllm_router/protocols.py b/src/vllm_router/protocols.py index 449ed5214..02322811c 100644 --- a/src/vllm_router/protocols.py +++ b/src/vllm_router/protocols.py @@ -1,5 +1,5 @@ import time -from typing import List, Optional +from typing import List, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -54,3 +54,72 @@ class ModelCard(OpenAIBaseModel): class ModelList(OpenAIBaseModel): object: str = "list" data: List[ModelCard] = Field(default_factory=list) + + +# ===== Core Request Models ===== +# Based on vLLM official protocol.py definitions + +class ChatCompletionRequest(OpenAIBaseModel): + """ChatCompletion API request model based on OpenAI specification""" + + # Core required fields + messages: List[dict] # Simplified message type to avoid complex nested definitions + model: Optional[str] = None + + # Core sampling parameters + max_tokens: Optional[int] = None + max_completion_tokens: Optional[int] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + frequency_penalty: Optional[float] = 0.0 + presence_penalty: Optional[float] = 0.0 + + # Core control parameters + stream: Optional[bool] = False + stop: Optional[Union[str, List[str]]] = None + n: Optional[int] = 1 + + # Other common parameters + seed: Optional[int] = None + user: Optional[str] = None + + +class CompletionRequest(OpenAIBaseModel): + """Completion API request model based on OpenAI specification""" + + # Core required fields + prompt: Optional[Union[str, List[str], List[int], List[List[int]]]] = None + model: Optional[str] = None + + # Core sampling parameters + max_tokens: Optional[int] = 16 + temperature: Optional[float] = None + top_p: Optional[float] = None + frequency_penalty: Optional[float] = 0.0 + presence_penalty: Optional[float] = 0.0 + + # Core control parameters + stream: Optional[bool] = False + stop: Optional[Union[str, List[str]]] = None + n: int = 1 + echo: Optional[bool] = False + + # Other common parameters + seed: Optional[int] = None + user: Optional[str] = None + best_of: Optional[int] = None + logprobs: Optional[int] = None + suffix: Optional[str] = None + + +class EmbeddingRequest(OpenAIBaseModel): + """Embedding API request model based on OpenAI specification""" + + # Core required fields + input: Union[str, List[str], List[int], List[List[int]]] + model: Optional[str] = None + + # Core control parameters + encoding_format: Optional[str] = "float" + dimensions: Optional[int] = None + user: Optional[str] = None From 56d6c3687dada810d74cff69ebb198da940e5502 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Tue, 2 Sep 2025 00:31:40 +0800 Subject: [PATCH 2/9] integrate in Router layer && add unit test Signed-off-by: JaredforReal --- src/tests/test_swagger_integration.py | 275 ++++++++++++++++++ src/vllm_router/protocols.py | 6 +- src/vllm_router/routers/main_router.py | 42 ++- .../services/request_service/request.py | 9 +- 4 files changed, 319 insertions(+), 13 deletions(-) create mode 100644 src/tests/test_swagger_integration.py diff --git a/src/tests/test_swagger_integration.py b/src/tests/test_swagger_integration.py new file mode 100644 index 000000000..525f3630d --- /dev/null +++ b/src/tests/test_swagger_integration.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +Unit tests for Swagger UI integration with Pydantic models. +Tests the OpenAI-compatible API endpoints with automatic validation. +""" + +import sys +import os +import pytest +import json +from fastapi import FastAPI, Request, BackgroundTasks +from fastapi.testclient import TestClient + +# Add src to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +# Mock dependencies that might not be available in test environment +class MockServiceDiscovery: + def get_endpoint_info(self): + return [] + def get_health(self): + return True + +class MockEngineStatsScraper: + def get_health(self): + return True + +class MockDynamicConfigWatcher: + def get_current_config(self): + class MockConfig: + def to_json_str(self): + return '{"mock": true}' + return MockConfig() + +# Mock modules before importing our code +sys.modules['vllm_router.service_discovery'] = type('MockModule', (), { + 'get_service_discovery': lambda: MockServiceDiscovery() +})() + +sys.modules['vllm_router.stats.engine_stats'] = type('MockModule', (), { + 'get_engine_stats_scraper': lambda: MockEngineStatsScraper() +})() + +sys.modules['vllm_router.dynamic_config'] = type('MockModule', (), { + 'get_dynamic_config_watcher': lambda: MockDynamicConfigWatcher() +})() + +sys.modules['vllm_router.version'] = type('MockModule', (), { + '__version__': '1.0.0' +})() + +# Create mock for request service +class MockRequestModule: + @staticmethod + async def route_general_request(request, endpoint, background_tasks, request_body=None): + """Mock implementation that mimics the real route_general_request function""" + if request_body: + data = json.loads(request_body) + else: + data = await request.json() + + return { + "mock_response": True, + "endpoint": endpoint, + "model": data.get("model"), + "request_type": "pydantic" if request_body else "raw", + "data": data + } + + @staticmethod + def route_sleep_wakeup_request(r, e, b): + return {"sleep": True} + +sys.modules['vllm_router.services.request_service.request'] = MockRequestModule() + +# Now import our router after mocking +from vllm_router.routers.main_router import main_router + +# Create test app and client +app = FastAPI() +app.include_router(main_router) +client = TestClient(app) + + +class TestSwaggerIntegration: + """Test Swagger UI integration with Pydantic models""" + + def test_chat_completions_pydantic_model(self): + """Test /v1/chat/completions with Pydantic model validation""" + response = client.post("/v1/chat/completions", json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 100, + "temperature": 0.7 + }) + + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + data = response.json() + assert data["mock_response"] == True + assert data["endpoint"] == "/v1/chat/completions" + assert data["model"] == "gpt-3.5-turbo" + assert data["request_type"] == "pydantic" + + def test_completions_pydantic_model(self): + """Test /v1/completions with Pydantic model validation""" + response = client.post("/v1/completions", json={ + "model": "gpt-3.5-turbo", + "prompt": "Hello world", + "max_tokens": 50 + }) + + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + data = response.json() + assert data["mock_response"] == True + assert data["endpoint"] == "/v1/completions" + assert data["model"] == "gpt-3.5-turbo" + assert data["request_type"] == "pydantic" + + def test_embeddings_pydantic_model(self): + """Test /v1/embeddings with Pydantic model validation""" + response = client.post("/v1/embeddings", json={ + "model": "text-embedding-ada-002", + "input": "Hello world" + }) + + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + data = response.json() + assert data["mock_response"] == True + assert data["endpoint"] == "/v1/embeddings" + assert data["model"] == "text-embedding-ada-002" + assert data["request_type"] == "pydantic" + + def test_extra_fields_handling(self): + """Test that extra fields are logged but don't break the request""" + response = client.post("/v1/chat/completions", json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 100, + "unknown_field": "should be ignored" + }) + + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + data = response.json() + assert data["mock_response"] == True + assert data["request_type"] == "pydantic" + + +class TestSemanticCacheCompatibility: + """Test semantic cache compatibility with Pydantic models""" + + def test_semantic_cache_uses_raw_request(self): + """ + Test that semantic cache functionality still uses raw_request correctly. + This verifies that the check_semantic_cache function receives the proper + raw Request object, not the parsed Pydantic model. + """ + # Mock the semantic cache to track what request object it receives + original_available = hasattr(sys.modules.get('vllm_router.routers.main_router'), 'semantic_cache_available') + + # Create a test that will track the request type passed to semantic cache + received_request_type = None + + async def mock_check_semantic_cache(request): + nonlocal received_request_type + received_request_type = type(request).__name__ + # Return None to continue with normal processing + return None + + # Patch the semantic cache check function + import vllm_router.routers.main_router as router_module + if hasattr(router_module, 'check_semantic_cache'): + original_check = router_module.check_semantic_cache + router_module.check_semantic_cache = mock_check_semantic_cache + router_module.semantic_cache_available = True + + try: + # Make a request that would trigger semantic cache check + response = client.post("/v1/chat/completions", json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello cache test"}], + "max_tokens": 100 + }) + + # Verify the request succeeded + assert response.status_code == 200 + + # Verify that semantic cache received a Request object, not a Pydantic model + assert received_request_type == "Request", f"Expected 'Request', got '{received_request_type}'" + + finally: + # Restore original function + router_module.check_semantic_cache = original_check + if not original_available: + router_module.semantic_cache_available = False + else: + # If semantic cache is not available, just verify the request works + response = client.post("/v1/chat/completions", json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 100 + }) + assert response.status_code == 200 + + def test_semantic_cache_with_pydantic_request_body(self): + """ + Test that when semantic cache is bypassed, the Pydantic request body + is properly converted and passed to the backend service. + """ + response = client.post("/v1/chat/completions", json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Test message"}], + "max_tokens": 100, + "temperature": 0.7, + "cache_similarity_threshold": 0.9 # Semantic cache specific field + }) + + assert response.status_code == 200 + data = response.json() + assert data["mock_response"] == True + assert data["request_type"] == "pydantic" + + # Verify that semantic cache specific fields are preserved in the request + request_data = data["data"] + assert "cache_similarity_threshold" in request_data + assert request_data["cache_similarity_threshold"] == 0.9 + + +class TestBackwardCompatibility: + """Test backward compatibility with existing functionality""" + + def test_validation_errors(self): + """Test that Pydantic validation catches invalid requests""" + # Missing required field (model) + response = client.post("/v1/chat/completions", json={ + "messages": [{"role": "user", "content": "Hello"}] + }) + + assert response.status_code == 422 # FastAPI validation error + error_data = response.json() + assert "detail" in error_data + assert any("model" in str(error).lower() for error in error_data["detail"]) + + def test_invalid_json(self): + """Test handling of invalid JSON""" + response = client.post( + "/v1/chat/completions", + data="invalid json", + headers={"Content-Type": "application/json"} + ) + + assert response.status_code == 422 # FastAPI validation error + + def test_openapi_schema_generation(self): + """Test that OpenAPI schema is generated correctly""" + response = client.get("/openapi.json") + assert response.status_code == 200 + + schema = response.json() + assert "paths" in schema + + # Check that our endpoints are in the schema + paths = schema["paths"] + assert "/v1/chat/completions" in paths + assert "/v1/completions" in paths + assert "/v1/embeddings" in paths + + # Check that request models are properly defined + chat_completions = paths["/v1/chat/completions"]["post"] + assert "requestBody" in chat_completions + request_body = chat_completions["requestBody"]["content"]["application/json"] + assert "$ref" in request_body["schema"] + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/src/vllm_router/protocols.py b/src/vllm_router/protocols.py index 02322811c..fd17ab53b 100644 --- a/src/vllm_router/protocols.py +++ b/src/vllm_router/protocols.py @@ -64,7 +64,7 @@ class ChatCompletionRequest(OpenAIBaseModel): # Core required fields messages: List[dict] # Simplified message type to avoid complex nested definitions - model: Optional[str] = None + model: str # Required field according to OpenAI API spec # Core sampling parameters max_tokens: Optional[int] = None @@ -89,7 +89,7 @@ class CompletionRequest(OpenAIBaseModel): # Core required fields prompt: Optional[Union[str, List[str], List[int], List[List[int]]]] = None - model: Optional[str] = None + model: str # Required field according to OpenAI API spec # Core sampling parameters max_tokens: Optional[int] = 16 @@ -117,7 +117,7 @@ class EmbeddingRequest(OpenAIBaseModel): # Core required fields input: Union[str, List[str], List[int], List[List[int]]] - model: Optional[str] = None + model: str # Required field according to OpenAI API spec # Core control parameters encoding_format: Optional[str] = "float" diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index c13590440..6c9b05cc5 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -18,7 +18,13 @@ from vllm_router.dynamic_config import get_dynamic_config_watcher from vllm_router.log import init_logger -from vllm_router.protocols import ModelCard, ModelList +from vllm_router.protocols import ( + ChatCompletionRequest, + CompletionRequest, + EmbeddingRequest, + ModelCard, + ModelList, +) from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.request import ( route_general_request, @@ -43,30 +49,50 @@ @main_router.post("/v1/chat/completions") -async def route_chat_completion(request: Request, background_tasks: BackgroundTasks): +async def route_chat_completion( + request: ChatCompletionRequest, raw_request: Request, background_tasks: BackgroundTasks +): if semantic_cache_available: # Check if the request can be served from the semantic cache logger.debug("Received chat completion request, checking semantic cache") - cache_response = await check_semantic_cache(request=request) + cache_response = await check_semantic_cache(request=raw_request) if cache_response: logger.info("Serving response from semantic cache") return cache_response logger.debug("No cache hit, forwarding request to backend") + + # Convert Pydantic model to JSON bytes for existing service + request_body = request.model_dump_json().encode('utf-8') + return await route_general_request( - request, "/v1/chat/completions", background_tasks + raw_request, "/v1/chat/completions", background_tasks, request_body ) @main_router.post("/v1/completions") -async def route_completion(request: Request, background_tasks: BackgroundTasks): - return await route_general_request(request, "/v1/completions", background_tasks) +async def route_completion( + request: CompletionRequest, raw_request: Request, background_tasks: BackgroundTasks +): + # Convert Pydantic model to JSON bytes for existing service + request_body = request.model_dump_json().encode('utf-8') + + return await route_general_request( + raw_request, "/v1/completions", background_tasks, request_body + ) @main_router.post("/v1/embeddings") -async def route_embeddings(request: Request, background_tasks: BackgroundTasks): - return await route_general_request(request, "/v1/embeddings", background_tasks) +async def route_embeddings( + request: EmbeddingRequest, raw_request: Request, background_tasks: BackgroundTasks +): + # Convert Pydantic model to JSON bytes for existing service + request_body = request.model_dump_json().encode('utf-8') + + return await route_general_request( + raw_request, "/v1/embeddings", background_tasks, request_body + ) @main_router.post("/tokenize") diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index 83e647927..5be68722c 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -17,6 +17,7 @@ import os import time import uuid +from typing import Optional import aiohttp from fastapi import BackgroundTasks, HTTPException, Request @@ -137,7 +138,7 @@ async def process_request( async def route_general_request( - request: Request, endpoint: str, background_tasks: BackgroundTasks + request: Request, endpoint: str, background_tasks: BackgroundTasks, request_body: Optional[bytes] = None ): """ Route the incoming request to the backend server and stream the response back to the client. @@ -162,7 +163,11 @@ async def route_general_request( in_router_time = time.time() # Same as vllm, Get request_id from X-Request-Id header if available request_id = request.headers.get("X-Request-Id") or str(uuid.uuid4()) - request_body = await request.body() + + # Use pre-provided request_body if available, otherwise read from request + if request_body is None: + request_body = await request.body() + request_json = json.loads(request_body) if request.query_params: From a69916ae55412ab0551d3492eebb4c28b7e6dbea Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 3 Sep 2025 00:39:30 +0800 Subject: [PATCH 3/9] fix(router): update Content-Length after request rewrite and set proper media type Fix truncated JSON causing backend 400 responses by syncing Content-Length after request rewriting. Also return application/json for non-stream requests instead of always text/event-stream. Signed-off-by: JaredforReal --- src/vllm_router/services/request_service/request.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index 5be68722c..d28a68798 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -169,6 +169,8 @@ async def route_general_request( request_body = await request.body() request_json = json.loads(request_body) + # Determine if request expects streaming (OpenAI style) + is_streaming = bool(request_json.get("stream", False)) if request.query_params: request_endpoint = request.query_params.get("id") @@ -199,6 +201,11 @@ async def route_general_request( ) logger.info(f"Request for model {requested_model} was rewritten") request_body = rewritten_body + # IMPORTANT: after rewriting, update Content-Length so backend reads full JSON + try: + update_content_length(request, request_body) + except Exception as e: + logger.warning(f"Failed to update Content-Length after rewrite: {e}") # Update request_json if the body was rewritten try: request_json = json.loads(request_body) @@ -298,11 +305,13 @@ async def route_general_request( headers, status = await anext(stream_generator) headers_dict = {key: value for key, value in headers.items()} headers_dict["X-Request-Id"] = request_id + # Choose appropriate media type. If client didn't request streaming, return JSON. + media_type = "text/event-stream" if is_streaming else "application/json" return StreamingResponse( stream_generator, status_code=status, headers=headers_dict, - media_type="text/event-stream", + media_type=media_type, ) From 336be03978f11a3dcc5a5dd6135452dc28978738 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 3 Sep 2025 00:40:07 +0800 Subject: [PATCH 4/9] chore(dev-tools): add mock backend and Swagger smoke test tooling Add mock backend (examples/mock_backend), CLI swagger_smoke script, shared core, and optional E2E pytest smoke test (RUN_E2E_SWAGGER gated). Replaces ad-hoc root-level scripts; improves local & CI verification workflow. Signed-off-by: JaredforReal --- examples/mock_backend/main.py | 115 +++++++++++++++++++++++++++++ scripts/_swagger_smoke_core.py | 94 +++++++++++++++++++++++ scripts/swagger_smoke.py | 25 +++++++ tests/e2e/test_swagger_ui_smoke.py | 113 ++++++++++++++++++++++++++++ 4 files changed, 347 insertions(+) create mode 100644 examples/mock_backend/main.py create mode 100644 scripts/_swagger_smoke_core.py create mode 100644 scripts/swagger_smoke.py create mode 100644 tests/e2e/test_swagger_ui_smoke.py diff --git a/examples/mock_backend/main.py b/examples/mock_backend/main.py new file mode 100644 index 000000000..a2674ef3d --- /dev/null +++ b/examples/mock_backend/main.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +""" +Mock vLLM backend used for local Swagger UI and routing smoke tests. + +Features: + - Implements /v1/chat/completions, /v1/completions, /v1/embeddings + - Lightweight, deterministic style responses + - Adjustable port via --port (default 8000) + +This is NOT a production server; it's only for development / CI smoke tests. +""" + +# Copyright 2024-2025 The vLLM Production Stack Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import argparse +import time +import uuid +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse +import uvicorn + +app = FastAPI(title="Mock vLLM Backend", version="1.0.0") + + +@app.post("/v1/chat/completions") +async def mock_chat_completions(request: Request): # pragma: no cover - exercised in e2e + body = await request.json() + response = { + "id": f"chatcmpl-{uuid.uuid4().hex[:10]}", + "object": "chat.completion", + "created": int(time.time()), + "model": body.get("model", "mock-model"), + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Hello! This is a mock response from the Swagger UI integration test.", + }, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 10, "completion_tokens": 15, "total_tokens": 25}, + } + return JSONResponse(content=response) + + +@app.post("/v1/completions") +async def mock_completions(request: Request): # pragma: no cover + body = await request.json() + response = { + "id": f"cmpl-{uuid.uuid4().hex[:10]}", + "object": "text_completion", + "created": int(time.time()), + "model": body.get("model", "mock-model"), + "choices": [ + {"text": " This is a mock completion response.", "index": 0, "finish_reason": "stop"} + ], + "usage": {"prompt_tokens": 5, "completion_tokens": 8, "total_tokens": 13}, + } + return JSONResponse(content=response) + + +@app.post("/v1/embeddings") +async def mock_embeddings(request: Request): # pragma: no cover + body = await request.json() + response = { + "object": "list", + "data": [ + { + "object": "embedding", + "embedding": [0.1, 0.2, 0.3, 0.4, 0.5] * 100, # 500-dim + "index": 0, + } + ], + "model": body.get("model", "mock-embedding-model"), + "usage": {"prompt_tokens": 8, "total_tokens": 8}, + } + return JSONResponse(content=response) + + +@app.get("/health") +async def health(): # pragma: no cover + return {"status": "healthy"} + + +def parse_args(): # pragma: no cover + p = argparse.ArgumentParser() + p.add_argument("--port", type=int, default=8000) + p.add_argument("--host", type=str, default="0.0.0.0") + return p.parse_args() + + +def main(): # pragma: no cover + args = parse_args() + print(f"🚀 Starting Mock vLLM Backend on http://{args.host}:{args.port}") + uvicorn.run(app, host=args.host, port=args.port) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/scripts/_swagger_smoke_core.py b/scripts/_swagger_smoke_core.py new file mode 100644 index 000000000..18f8703fc --- /dev/null +++ b/scripts/_swagger_smoke_core.py @@ -0,0 +1,94 @@ +"""Reusable Swagger smoke test core logic for CLI + pytest. + +Keep this dependency-light. Do not import internal router modules here. +""" + +from __future__ import annotations + +import json +import time +from dataclasses import dataclass, field +from typing import Dict, List +import requests + + +@dataclass +class TestResult: + name: str + success: bool + detail: str = "" + extra: Dict = field(default_factory=dict) + + +class SwaggerUITester: + def __init__(self, base_url: str): + self.base_url = base_url.rstrip("/") + self.session = requests.Session() + self.results: List[TestResult] = [] + + def record(self, name: str, success: bool, detail: str = "", **extra): + self.results.append(TestResult(name, success, detail, extra)) + + def _url(self, path: str) -> str: + return f"{self.base_url}{path}" + + def test_docs(self): + try: + r = self.session.get(self._url("/docs"), timeout=5) + self.record("/docs", r.status_code == 200, f"status={r.status_code}") + except Exception as e: # pragma: no cover + self.record("/docs", False, str(e)) + + def test_openapi(self): + try: + r = self.session.get(self._url("/openapi.json"), timeout=5) + if r.status_code != 200: + self.record("openapi", False, f"status={r.status_code}") + return + schema = r.json() + paths = schema.get("paths", {}) + expected = ["/v1/chat/completions", "/v1/completions", "/v1/embeddings"] + missing = [p for p in expected if p not in paths] + self.record("openapi", not missing, "ok" if not missing else f"missing={missing}") + except Exception as e: # pragma: no cover + self.record("openapi", False, str(e)) + + def test_core_endpoints(self): + # chat valid + chat = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 4} + r = self.session.post(self._url("/v1/chat/completions"), json=chat, timeout=8) + self.record("chat_valid", r.status_code == 200, f"status={r.status_code}") + # chat 422 + r2 = self.session.post(self._url("/v1/chat/completions"), json={"messages": []}, timeout=5) + self.record("chat_422", r2.status_code == 422, f"status={r2.status_code}") + # completions + comp = {"model": "gpt-3.5-turbo", "prompt": "Hello", "max_tokens": 5} + r3 = self.session.post(self._url("/v1/completions"), json=comp, timeout=8) + self.record("completions_valid", r3.status_code == 200, f"status={r3.status_code}") + # embeddings + emb = {"model": "text-embedding-ada-002", "input": "hello"} + r4 = self.session.post(self._url("/v1/embeddings"), json=emb, timeout=8) + self.record("embeddings_valid", r4.status_code == 200, f"status={r4.status_code}") + + def run(self): + start = time.time() + self.test_docs() + self.test_openapi() + self.test_core_endpoints() + elapsed = time.time() - start + passed = sum(1 for r in self.results if r.success) + return passed == len(self.results), elapsed, self.results + + +def run_smoke(base_url: str) -> bool: + tester = SwaggerUITester(base_url) + ok, elapsed, results = tester.run() + print(f"Swagger smoke: {passed_count(results)}/{len(results)} passed in {elapsed:.2f}s") + for r in results: + icon = "✅" if r.success else "❌" + print(f" {icon} {r.name} - {r.detail}") + return ok + + +def passed_count(results: List[TestResult]) -> int: + return sum(1 for r in results if r.success) diff --git a/scripts/swagger_smoke.py b/scripts/swagger_smoke.py new file mode 100644 index 000000000..b3ba5c614 --- /dev/null +++ b/scripts/swagger_smoke.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +"""CLI Swagger smoke test script. + +Uses the same tester logic as the e2e pytest but runnable standalone. +Exit code 0 = all pass; non-zero otherwise. +""" + +# Copyright 2024-2025 The vLLM Production Stack Authors. +# Licensed under the Apache License, Version 2.0. + +from __future__ import annotations + +import os +import sys +from _swagger_smoke_core import run_smoke + + +def main(): # pragma: no cover + base = sys.argv[1] if len(sys.argv) > 1 else os.getenv("SWAGGER_BASE_URL", "http://localhost:8080") + ok = run_smoke(base) + sys.exit(0 if ok else 1) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/tests/e2e/test_swagger_ui_smoke.py b/tests/e2e/test_swagger_ui_smoke.py new file mode 100644 index 000000000..f68a2d6a3 --- /dev/null +++ b/tests/e2e/test_swagger_ui_smoke.py @@ -0,0 +1,113 @@ +"""End-to-end Swagger UI smoke test. + +Starts a mock backend + initializes router (subprocess) and validates core endpoints. +Skip unless RUN_E2E_SWAGGER=1 is set to avoid slowing default test runs. +""" + +# Copyright 2024-2025 The vLLM Production Stack Authors. +# Licensed under the Apache License, Version 2.0. + +from __future__ import annotations + +import os +import signal +import subprocess +import sys +import time +from pathlib import Path + +import pytest +import requests + +REPO_ROOT = Path(__file__).resolve().parents[2] +MOCK_PATH = REPO_ROOT / "examples" / "mock_backend" / "main.py" + + +pytestmark = pytest.mark.skipif( + os.getenv("RUN_E2E_SWAGGER") != "1", reason="Set RUN_E2E_SWAGGER=1 to run E2E Swagger smoke test" +) + + +def wait_http(url: str, timeout: float = 10.0): + end = time.time() + timeout + while time.time() < end: + try: + r = requests.get(url, timeout=1) + if r.status_code < 500: + return True + except Exception: + pass + time.sleep(0.2) + raise RuntimeError(f"Timeout waiting for {url}") + + +@pytest.fixture(scope="module") +def processes(): + env = os.environ.copy() + python = sys.executable + backend_port = 18080 + router_port = 18081 + backend = subprocess.Popen( + [python, str(MOCK_PATH), "--port", str(backend_port)], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + wait_http(f"http://localhost:{backend_port}/health") + + router = subprocess.Popen( + [ + python, + "-m", + "vllm_router.app", + "--service-discovery", + "static", + "--static-backends", + f"http://localhost:{backend_port}", + "--static-models", + "gpt-3.5-turbo", + "--static-aliases", + "text-embedding-ada-002=gpt-3.5-turbo", + "--routing-logic", + "roundrobin", + "--port", + str(router_port), + "--host", + "0.0.0.0", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=env, + ) + wait_http(f"http://localhost:{router_port}/docs") + yield {"backend": backend, "router": router, "router_port": router_port} + for p in (router, backend): + if p.poll() is None: + p.send_signal(signal.SIGINT) + try: + p.wait(timeout=5) + except subprocess.TimeoutExpired: + p.kill() + + +def test_swagger_smoke(processes): + base = f"http://localhost:{processes['router_port']}" + # Basic endpoint checks + r_docs = requests.get(f"{base}/docs") + assert r_docs.status_code == 200 + + chat_payload = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]} + r_chat = requests.post(f"{base}/v1/chat/completions", json=chat_payload) + assert r_chat.status_code == 200, r_chat.text + + bad = requests.post(f"{base}/v1/chat/completions", json={"messages": []}) + assert bad.status_code == 422 + + comp_payload = {"model": "gpt-3.5-turbo", "prompt": "Hello"} + r_comp = requests.post(f"{base}/v1/completions", json=comp_payload) + assert r_comp.status_code == 200, r_comp.text + + emb_payload = {"model": "text-embedding-ada-002", "input": "Hello"} + r_emb = requests.post(f"{base}/v1/embeddings", json=emb_payload) + assert r_emb.status_code == 200, r_emb.text From 84adf32e2d21bc997b2f26bd10de4932f8709e7b Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 3 Sep 2025 01:38:40 +0800 Subject: [PATCH 5/9] Get rid of E2E test to simplify Signed-off-by: JaredforReal --- tests/e2e/test_swagger_ui_smoke.py | 113 ----------------------------- 1 file changed, 113 deletions(-) delete mode 100644 tests/e2e/test_swagger_ui_smoke.py diff --git a/tests/e2e/test_swagger_ui_smoke.py b/tests/e2e/test_swagger_ui_smoke.py deleted file mode 100644 index f68a2d6a3..000000000 --- a/tests/e2e/test_swagger_ui_smoke.py +++ /dev/null @@ -1,113 +0,0 @@ -"""End-to-end Swagger UI smoke test. - -Starts a mock backend + initializes router (subprocess) and validates core endpoints. -Skip unless RUN_E2E_SWAGGER=1 is set to avoid slowing default test runs. -""" - -# Copyright 2024-2025 The vLLM Production Stack Authors. -# Licensed under the Apache License, Version 2.0. - -from __future__ import annotations - -import os -import signal -import subprocess -import sys -import time -from pathlib import Path - -import pytest -import requests - -REPO_ROOT = Path(__file__).resolve().parents[2] -MOCK_PATH = REPO_ROOT / "examples" / "mock_backend" / "main.py" - - -pytestmark = pytest.mark.skipif( - os.getenv("RUN_E2E_SWAGGER") != "1", reason="Set RUN_E2E_SWAGGER=1 to run E2E Swagger smoke test" -) - - -def wait_http(url: str, timeout: float = 10.0): - end = time.time() + timeout - while time.time() < end: - try: - r = requests.get(url, timeout=1) - if r.status_code < 500: - return True - except Exception: - pass - time.sleep(0.2) - raise RuntimeError(f"Timeout waiting for {url}") - - -@pytest.fixture(scope="module") -def processes(): - env = os.environ.copy() - python = sys.executable - backend_port = 18080 - router_port = 18081 - backend = subprocess.Popen( - [python, str(MOCK_PATH), "--port", str(backend_port)], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - wait_http(f"http://localhost:{backend_port}/health") - - router = subprocess.Popen( - [ - python, - "-m", - "vllm_router.app", - "--service-discovery", - "static", - "--static-backends", - f"http://localhost:{backend_port}", - "--static-models", - "gpt-3.5-turbo", - "--static-aliases", - "text-embedding-ada-002=gpt-3.5-turbo", - "--routing-logic", - "roundrobin", - "--port", - str(router_port), - "--host", - "0.0.0.0", - ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - env=env, - ) - wait_http(f"http://localhost:{router_port}/docs") - yield {"backend": backend, "router": router, "router_port": router_port} - for p in (router, backend): - if p.poll() is None: - p.send_signal(signal.SIGINT) - try: - p.wait(timeout=5) - except subprocess.TimeoutExpired: - p.kill() - - -def test_swagger_smoke(processes): - base = f"http://localhost:{processes['router_port']}" - # Basic endpoint checks - r_docs = requests.get(f"{base}/docs") - assert r_docs.status_code == 200 - - chat_payload = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]} - r_chat = requests.post(f"{base}/v1/chat/completions", json=chat_payload) - assert r_chat.status_code == 200, r_chat.text - - bad = requests.post(f"{base}/v1/chat/completions", json={"messages": []}) - assert bad.status_code == 422 - - comp_payload = {"model": "gpt-3.5-turbo", "prompt": "Hello"} - r_comp = requests.post(f"{base}/v1/completions", json=comp_payload) - assert r_comp.status_code == 200, r_comp.text - - emb_payload = {"model": "text-embedding-ada-002", "input": "Hello"} - r_emb = requests.post(f"{base}/v1/embeddings", json=emb_payload) - assert r_emb.status_code == 200, r_emb.text From e2b53dfc2c4ab9901bae6ca459b4aa5a80d2e8d7 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 3 Sep 2025 11:55:37 +0800 Subject: [PATCH 6/9] pass pre-commit locally Signed-off-by: JaredforReal --- examples/mock_backend/main.py | 13 ++++++--- scripts/_swagger_smoke_core.py | 27 ++++++++++++++----- scripts/swagger_smoke.py | 7 ++++- src/vllm_router/protocols.py | 21 ++++++++------- src/vllm_router/routers/main_router.py | 18 +++++++------ .../services/request_service/request.py | 9 ++++--- 6 files changed, 64 insertions(+), 31 deletions(-) diff --git a/examples/mock_backend/main.py b/examples/mock_backend/main.py index a2674ef3d..da5978ce4 100644 --- a/examples/mock_backend/main.py +++ b/examples/mock_backend/main.py @@ -29,15 +29,18 @@ import argparse import time import uuid + +import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse -import uvicorn app = FastAPI(title="Mock vLLM Backend", version="1.0.0") @app.post("/v1/chat/completions") -async def mock_chat_completions(request: Request): # pragma: no cover - exercised in e2e +async def mock_chat_completions( + request: Request, +): # pragma: no cover - exercised in e2e body = await request.json() response = { "id": f"chatcmpl-{uuid.uuid4().hex[:10]}", @@ -68,7 +71,11 @@ async def mock_completions(request: Request): # pragma: no cover "created": int(time.time()), "model": body.get("model", "mock-model"), "choices": [ - {"text": " This is a mock completion response.", "index": 0, "finish_reason": "stop"} + { + "text": " This is a mock completion response.", + "index": 0, + "finish_reason": "stop", + } ], "usage": {"prompt_tokens": 5, "completion_tokens": 8, "total_tokens": 13}, } diff --git a/scripts/_swagger_smoke_core.py b/scripts/_swagger_smoke_core.py index 18f8703fc..f73dd59a7 100644 --- a/scripts/_swagger_smoke_core.py +++ b/scripts/_swagger_smoke_core.py @@ -9,6 +9,7 @@ import time from dataclasses import dataclass, field from typing import Dict, List + import requests @@ -49,26 +50,38 @@ def test_openapi(self): paths = schema.get("paths", {}) expected = ["/v1/chat/completions", "/v1/completions", "/v1/embeddings"] missing = [p for p in expected if p not in paths] - self.record("openapi", not missing, "ok" if not missing else f"missing={missing}") + self.record( + "openapi", not missing, "ok" if not missing else f"missing={missing}" + ) except Exception as e: # pragma: no cover self.record("openapi", False, str(e)) def test_core_endpoints(self): # chat valid - chat = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 4} + chat = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "hi"}], + "max_tokens": 4, + } r = self.session.post(self._url("/v1/chat/completions"), json=chat, timeout=8) self.record("chat_valid", r.status_code == 200, f"status={r.status_code}") # chat 422 - r2 = self.session.post(self._url("/v1/chat/completions"), json={"messages": []}, timeout=5) + r2 = self.session.post( + self._url("/v1/chat/completions"), json={"messages": []}, timeout=5 + ) self.record("chat_422", r2.status_code == 422, f"status={r2.status_code}") # completions comp = {"model": "gpt-3.5-turbo", "prompt": "Hello", "max_tokens": 5} r3 = self.session.post(self._url("/v1/completions"), json=comp, timeout=8) - self.record("completions_valid", r3.status_code == 200, f"status={r3.status_code}") + self.record( + "completions_valid", r3.status_code == 200, f"status={r3.status_code}" + ) # embeddings emb = {"model": "text-embedding-ada-002", "input": "hello"} r4 = self.session.post(self._url("/v1/embeddings"), json=emb, timeout=8) - self.record("embeddings_valid", r4.status_code == 200, f"status={r4.status_code}") + self.record( + "embeddings_valid", r4.status_code == 200, f"status={r4.status_code}" + ) def run(self): start = time.time() @@ -83,7 +96,9 @@ def run(self): def run_smoke(base_url: str) -> bool: tester = SwaggerUITester(base_url) ok, elapsed, results = tester.run() - print(f"Swagger smoke: {passed_count(results)}/{len(results)} passed in {elapsed:.2f}s") + print( + f"Swagger smoke: {passed_count(results)}/{len(results)} passed in {elapsed:.2f}s" + ) for r in results: icon = "✅" if r.success else "❌" print(f" {icon} {r.name} - {r.detail}") diff --git a/scripts/swagger_smoke.py b/scripts/swagger_smoke.py index b3ba5c614..ec8c689a5 100644 --- a/scripts/swagger_smoke.py +++ b/scripts/swagger_smoke.py @@ -12,11 +12,16 @@ import os import sys + from _swagger_smoke_core import run_smoke def main(): # pragma: no cover - base = sys.argv[1] if len(sys.argv) > 1 else os.getenv("SWAGGER_BASE_URL", "http://localhost:8080") + base = ( + sys.argv[1] + if len(sys.argv) > 1 + else os.getenv("SWAGGER_BASE_URL", "http://localhost:8080") + ) ok = run_smoke(base) sys.exit(0 if ok else 1) diff --git a/src/vllm_router/protocols.py b/src/vllm_router/protocols.py index fd17ab53b..d3526b7f8 100644 --- a/src/vllm_router/protocols.py +++ b/src/vllm_router/protocols.py @@ -59,13 +59,14 @@ class ModelList(OpenAIBaseModel): # ===== Core Request Models ===== # Based on vLLM official protocol.py definitions + class ChatCompletionRequest(OpenAIBaseModel): """ChatCompletion API request model based on OpenAI specification""" - + # Core required fields messages: List[dict] # Simplified message type to avoid complex nested definitions model: str # Required field according to OpenAI API spec - + # Core sampling parameters max_tokens: Optional[int] = None max_completion_tokens: Optional[int] = None @@ -73,12 +74,12 @@ class ChatCompletionRequest(OpenAIBaseModel): top_p: Optional[float] = None frequency_penalty: Optional[float] = 0.0 presence_penalty: Optional[float] = 0.0 - + # Core control parameters stream: Optional[bool] = False stop: Optional[Union[str, List[str]]] = None n: Optional[int] = 1 - + # Other common parameters seed: Optional[int] = None user: Optional[str] = None @@ -86,24 +87,24 @@ class ChatCompletionRequest(OpenAIBaseModel): class CompletionRequest(OpenAIBaseModel): """Completion API request model based on OpenAI specification""" - + # Core required fields prompt: Optional[Union[str, List[str], List[int], List[List[int]]]] = None model: str # Required field according to OpenAI API spec - + # Core sampling parameters max_tokens: Optional[int] = 16 temperature: Optional[float] = None top_p: Optional[float] = None frequency_penalty: Optional[float] = 0.0 presence_penalty: Optional[float] = 0.0 - + # Core control parameters stream: Optional[bool] = False stop: Optional[Union[str, List[str]]] = None n: int = 1 echo: Optional[bool] = False - + # Other common parameters seed: Optional[int] = None user: Optional[str] = None @@ -114,11 +115,11 @@ class CompletionRequest(OpenAIBaseModel): class EmbeddingRequest(OpenAIBaseModel): """Embedding API request model based on OpenAI specification""" - + # Core required fields input: Union[str, List[str], List[int], List[List[int]]] model: str # Required field according to OpenAI API spec - + # Core control parameters encoding_format: Optional[str] = "float" dimensions: Optional[int] = None diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 22103e595..98b0a998b 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -55,7 +55,9 @@ @main_router.post("/v1/chat/completions") async def route_chat_completion( - request: ChatCompletionRequest, raw_request: Request, background_tasks: BackgroundTasks + request: ChatCompletionRequest, + raw_request: Request, + background_tasks: BackgroundTasks, ): if semantic_cache_available: # Check if the request can be served from the semantic cache @@ -67,10 +69,10 @@ async def route_chat_completion( return cache_response logger.debug("No cache hit, forwarding request to backend") - + # Convert Pydantic model to JSON bytes for existing service - request_body = request.model_dump_json().encode('utf-8') - + request_body = request.model_dump_json().encode("utf-8") + return await route_general_request( raw_request, "/v1/chat/completions", background_tasks, request_body ) @@ -81,8 +83,8 @@ async def route_completion( request: CompletionRequest, raw_request: Request, background_tasks: BackgroundTasks ): # Convert Pydantic model to JSON bytes for existing service - request_body = request.model_dump_json().encode('utf-8') - + request_body = request.model_dump_json().encode("utf-8") + return await route_general_request( raw_request, "/v1/completions", background_tasks, request_body ) @@ -93,8 +95,8 @@ async def route_embeddings( request: EmbeddingRequest, raw_request: Request, background_tasks: BackgroundTasks ): # Convert Pydantic model to JSON bytes for existing service - request_body = request.model_dump_json().encode('utf-8') - + request_body = request.model_dump_json().encode("utf-8") + return await route_general_request( raw_request, "/v1/embeddings", background_tasks, request_body ) diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index 10b5749df..f231b255e 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -138,7 +138,10 @@ async def process_request( async def route_general_request( - request: Request, endpoint: str, background_tasks: BackgroundTasks, request_body: Optional[bytes] = None + request: Request, + endpoint: str, + background_tasks: BackgroundTasks, + request_body: Optional[bytes] = None, ): """ Route the incoming request to the backend server and stream the response back to the client. @@ -163,11 +166,11 @@ async def route_general_request( in_router_time = time.time() # Same as vllm, Get request_id from X-Request-Id header if available request_id = request.headers.get("X-Request-Id") or str(uuid.uuid4()) - + # Use pre-provided request_body if available, otherwise read from request if request_body is None: request_body = await request.body() - + request_json = json.loads(request_body) # Determine if request expects streaming (OpenAI style) is_streaming = bool(request_json.get("stream", False)) From 3e7324809e9e37a6976d2bd4d7cf960110a31882 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 3 Sep 2025 12:02:26 +0800 Subject: [PATCH 7/9] fix: lint and formatting for swagger integration test Signed-off-by: JaredforReal --- src/tests/test_swagger_integration.py | 305 +++++++++++--------------- 1 file changed, 129 insertions(+), 176 deletions(-) diff --git a/src/tests/test_swagger_integration.py b/src/tests/test_swagger_integration.py index 525f3630d..4e753390f 100644 --- a/src/tests/test_swagger_integration.py +++ b/src/tests/test_swagger_integration.py @@ -1,275 +1,228 @@ #!/usr/bin/env python3 -""" -Unit tests for Swagger UI integration with Pydantic models. -Tests the OpenAI-compatible API endpoints with automatic validation. -""" +"""Unit tests for Swagger UI integration: request validation & OpenAPI generation.""" -import sys +import json import os +import sys + import pytest -import json -from fastapi import FastAPI, Request, BackgroundTasks +from fastapi import FastAPI from fastapi.testclient import TestClient -# Add src to path for imports -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + -# Mock dependencies that might not be available in test environment class MockServiceDiscovery: def get_endpoint_info(self): return [] + def get_health(self): return True + class MockEngineStatsScraper: def get_health(self): return True + class MockDynamicConfigWatcher: def get_current_config(self): class MockConfig: def to_json_str(self): return '{"mock": true}' - return MockConfig() -# Mock modules before importing our code -sys.modules['vllm_router.service_discovery'] = type('MockModule', (), { - 'get_service_discovery': lambda: MockServiceDiscovery() -})() + return MockConfig() -sys.modules['vllm_router.stats.engine_stats'] = type('MockModule', (), { - 'get_engine_stats_scraper': lambda: MockEngineStatsScraper() -})() -sys.modules['vllm_router.dynamic_config'] = type('MockModule', (), { - 'get_dynamic_config_watcher': lambda: MockDynamicConfigWatcher() -})() +sys.modules["vllm_router.service_discovery"] = type( + "MockModule", (), {"get_service_discovery": lambda: MockServiceDiscovery()} +)() +sys.modules["vllm_router.stats.engine_stats"] = type( + "MockModule", (), {"get_engine_stats_scraper": lambda: MockEngineStatsScraper()} +)() +sys.modules["vllm_router.dynamic_config"] = type( + "MockModule", (), {"get_dynamic_config_watcher": lambda: MockDynamicConfigWatcher()} +)() +sys.modules["vllm_router.version"] = type("MockModule", (), {"__version__": "1.0.0"})() -sys.modules['vllm_router.version'] = type('MockModule', (), { - '__version__': '1.0.0' -})() -# Create mock for request service class MockRequestModule: @staticmethod - async def route_general_request(request, endpoint, background_tasks, request_body=None): - """Mock implementation that mimics the real route_general_request function""" + async def route_general_request( + request, endpoint, background_tasks, request_body=None + ): if request_body: data = json.loads(request_body) else: data = await request.json() - return { "mock_response": True, "endpoint": endpoint, "model": data.get("model"), "request_type": "pydantic" if request_body else "raw", - "data": data + "data": data, } - + @staticmethod - def route_sleep_wakeup_request(r, e, b): + def route_sleep_wakeup_request(r, e, b): # pragma: no cover return {"sleep": True} -sys.modules['vllm_router.services.request_service.request'] = MockRequestModule() -# Now import our router after mocking -from vllm_router.routers.main_router import main_router +sys.modules["vllm_router.services.request_service.request"] = MockRequestModule() + +from vllm_router.routers.main_router import main_router # noqa: E402 -# Create test app and client app = FastAPI() app.include_router(main_router) client = TestClient(app) class TestSwaggerIntegration: - """Test Swagger UI integration with Pydantic models""" - def test_chat_completions_pydantic_model(self): - """Test /v1/chat/completions with Pydantic model validation""" - response = client.post("/v1/chat/completions", json={ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Hello"}], - "max_tokens": 100, - "temperature": 0.7 - }) - - assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" - data = response.json() - assert data["mock_response"] == True + resp = client.post( + "/v1/chat/completions", + json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 100, + "temperature": 0.7, + }, + ) + assert resp.status_code == 200, resp.text + data = resp.json() + assert data["mock_response"] assert data["endpoint"] == "/v1/chat/completions" assert data["model"] == "gpt-3.5-turbo" assert data["request_type"] == "pydantic" def test_completions_pydantic_model(self): - """Test /v1/completions with Pydantic model validation""" - response = client.post("/v1/completions", json={ - "model": "gpt-3.5-turbo", - "prompt": "Hello world", - "max_tokens": 50 - }) - - assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" - data = response.json() - assert data["mock_response"] == True + resp = client.post( + "/v1/completions", + json={"model": "gpt-3.5-turbo", "prompt": "Hello world", "max_tokens": 50}, + ) + assert resp.status_code == 200, resp.text + data = resp.json() + assert data["mock_response"] assert data["endpoint"] == "/v1/completions" assert data["model"] == "gpt-3.5-turbo" assert data["request_type"] == "pydantic" def test_embeddings_pydantic_model(self): - """Test /v1/embeddings with Pydantic model validation""" - response = client.post("/v1/embeddings", json={ - "model": "text-embedding-ada-002", - "input": "Hello world" - }) - - assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" - data = response.json() - assert data["mock_response"] == True + resp = client.post( + "/v1/embeddings", + json={"model": "text-embedding-ada-002", "input": "Hello world"}, + ) + assert resp.status_code == 200, resp.text + data = resp.json() + assert data["mock_response"] assert data["endpoint"] == "/v1/embeddings" assert data["model"] == "text-embedding-ada-002" assert data["request_type"] == "pydantic" def test_extra_fields_handling(self): - """Test that extra fields are logged but don't break the request""" - response = client.post("/v1/chat/completions", json={ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Hello"}], - "max_tokens": 100, - "unknown_field": "should be ignored" - }) - - assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" - data = response.json() - assert data["mock_response"] == True + resp = client.post( + "/v1/chat/completions", + json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 100, + "unknown_field": "ignored", + }, + ) + assert resp.status_code == 200, resp.text + data = resp.json() + assert data["mock_response"] assert data["request_type"] == "pydantic" class TestSemanticCacheCompatibility: - """Test semantic cache compatibility with Pydantic models""" - def test_semantic_cache_uses_raw_request(self): - """ - Test that semantic cache functionality still uses raw_request correctly. - This verifies that the check_semantic_cache function receives the proper - raw Request object, not the parsed Pydantic model. - """ - # Mock the semantic cache to track what request object it receives - original_available = hasattr(sys.modules.get('vllm_router.routers.main_router'), 'semantic_cache_available') - - # Create a test that will track the request type passed to semantic cache + import vllm_router.routers.main_router as router_module + received_request_type = None - + async def mock_check_semantic_cache(request): nonlocal received_request_type received_request_type = type(request).__name__ - # Return None to continue with normal processing return None - - # Patch the semantic cache check function - import vllm_router.routers.main_router as router_module - if hasattr(router_module, 'check_semantic_cache'): + + if hasattr(router_module, "check_semantic_cache"): original_check = router_module.check_semantic_cache + original_flag = getattr(router_module, "semantic_cache_available", False) router_module.check_semantic_cache = mock_check_semantic_cache router_module.semantic_cache_available = True - try: - # Make a request that would trigger semantic cache check - response = client.post("/v1/chat/completions", json={ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Hello cache test"}], - "max_tokens": 100 - }) - - # Verify the request succeeded - assert response.status_code == 200 - - # Verify that semantic cache received a Request object, not a Pydantic model - assert received_request_type == "Request", f"Expected 'Request', got '{received_request_type}'" - + resp = client.post( + "/v1/chat/completions", + json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Cache test"}], + }, + ) + assert resp.status_code == 200 + assert received_request_type == "Request" finally: - # Restore original function router_module.check_semantic_cache = original_check - if not original_available: - router_module.semantic_cache_available = False + router_module.semantic_cache_available = original_flag else: - # If semantic cache is not available, just verify the request works - response = client.post("/v1/chat/completions", json={ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Hello"}], - "max_tokens": 100 - }) - assert response.status_code == 200 - + resp = client.post( + "/v1/chat/completions", + json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + }, + ) + assert resp.status_code == 200 + def test_semantic_cache_with_pydantic_request_body(self): - """ - Test that when semantic cache is bypassed, the Pydantic request body - is properly converted and passed to the backend service. - """ - response = client.post("/v1/chat/completions", json={ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Test message"}], - "max_tokens": 100, - "temperature": 0.7, - "cache_similarity_threshold": 0.9 # Semantic cache specific field - }) - - assert response.status_code == 200 - data = response.json() - assert data["mock_response"] == True + resp = client.post( + "/v1/chat/completions", + json={ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Test message"}], + "cache_similarity_threshold": 0.9, + }, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["mock_response"] assert data["request_type"] == "pydantic" - - # Verify that semantic cache specific fields are preserved in the request - request_data = data["data"] - assert "cache_similarity_threshold" in request_data - assert request_data["cache_similarity_threshold"] == 0.9 + assert "cache_similarity_threshold" in data["data"] + assert data["data"]["cache_similarity_threshold"] == 0.9 class TestBackwardCompatibility: - """Test backward compatibility with existing functionality""" - def test_validation_errors(self): - """Test that Pydantic validation catches invalid requests""" - # Missing required field (model) - response = client.post("/v1/chat/completions", json={ - "messages": [{"role": "user", "content": "Hello"}] - }) - - assert response.status_code == 422 # FastAPI validation error - error_data = response.json() + resp = client.post( + "/v1/chat/completions", + json={"messages": [{"role": "user", "content": "Hello"}]}, + ) + assert resp.status_code == 422 + error_data = resp.json() assert "detail" in error_data - assert any("model" in str(error).lower() for error in error_data["detail"]) + assert any("model" in str(err).lower() for err in error_data["detail"]) def test_invalid_json(self): - """Test handling of invalid JSON""" - response = client.post( - "/v1/chat/completions", + resp = client.post( + "/v1/chat/completions", data="invalid json", - headers={"Content-Type": "application/json"} + headers={"Content-Type": "application/json"}, ) - - assert response.status_code == 422 # FastAPI validation error + assert resp.status_code == 422 def test_openapi_schema_generation(self): - """Test that OpenAPI schema is generated correctly""" - response = client.get("/openapi.json") - assert response.status_code == 200 - - schema = response.json() + resp = client.get("/openapi.json") + assert resp.status_code == 200 + schema = resp.json() assert "paths" in schema - - # Check that our endpoints are in the schema paths = schema["paths"] - assert "/v1/chat/completions" in paths - assert "/v1/completions" in paths - assert "/v1/embeddings" in paths - - # Check that request models are properly defined - chat_completions = paths["/v1/chat/completions"]["post"] - assert "requestBody" in chat_completions - request_body = chat_completions["requestBody"]["content"]["application/json"] - assert "$ref" in request_body["schema"] - - -if __name__ == "__main__": + for p in ["/v1/chat/completions", "/v1/completions", "/v1/embeddings"]: + assert p in paths + chat_post = paths["/v1/chat/completions"]["post"] + assert "requestBody" in chat_post + rb = chat_post["requestBody"]["content"]["application/json"] + assert "$ref" in rb["schema"] + + +if __name__ == "__main__": # pragma: no cover pytest.main([__file__, "-v"]) From 324843b8cf6722cc17cbb63dfb1f96d6bd125854 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 4 Sep 2025 14:21:16 +0800 Subject: [PATCH 8/9] [Fix] CI test error: import error and requirement error Signed-off-by: JaredforReal --- pyproject.toml | 3 ++- requirements-test.txt | 1 + src/vllm_router/routers/main_router.py | 28 +++++++++++++++++--------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dfc923313..749aa0188 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,5 +60,6 @@ lint = [ ] test = [ "pytest>=8.3.4", - "pytest-asyncio>=0.25.3" + "pytest-asyncio>=0.25.3", + "httpx==0.28.1" ] diff --git a/requirements-test.txt b/requirements-test.txt index f180cb7c9..16a5c7a8a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,6 +2,7 @@ faiss-cpu>=1.7.4 huggingface-hub==0.33.0 pytest pytest-asyncio +httpx==0.28.1 sentence-transformers>=2.2.2 transformers==4.51.1 vllm==0.9.2 diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 98b0a998b..79a837ef9 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -32,12 +32,21 @@ from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.request import ( route_general_request, - route_general_transcriptions, route_sleep_wakeup_request, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper from vllm_router.version import __version__ +try: + # Semantic cache integration + from vllm_router.services.request_service.request import ( + route_general_transcriptions + ) + + _route_general_transcriptions = True +except ImportError: + _route_general_transcriptions = False + try: # Semantic cache integration from vllm_router.experimental.semantic_cache_integration import ( @@ -264,11 +273,12 @@ async def health() -> Response: return JSONResponse(content={"status": "healthy"}, status_code=200) -@main_router.post("/v1/audio/transcriptions") -async def route_v1_audio_transcriptions( - request: Request, background_tasks: BackgroundTasks -): - """Handles audio transcription requests.""" - return await route_general_transcriptions( - request, "/v1/audio/transcriptions", background_tasks - ) +if _route_general_transcriptions: + @main_router.post("/v1/audio/transcriptions") + async def route_v1_audio_transcriptions( + request: Request, background_tasks: BackgroundTasks + ): + """Handles audio transcription requests.""" + return await route_general_transcriptions( + request, "/v1/audio/transcriptions", background_tasks + ) From 00765ec9d484e8632bf7c0c7ff42b02b0b886fcc Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 5 Sep 2025 12:03:49 +0800 Subject: [PATCH 9/9] [FIX] pre-commit error Signed-off-by: JaredforReal --- requirements-test.txt | 2 +- src/vllm_router/routers/main_router.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 16a5c7a8a..b59cac88d 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,8 +1,8 @@ faiss-cpu>=1.7.4 +httpx==0.28.1 huggingface-hub==0.33.0 pytest pytest-asyncio -httpx==0.28.1 sentence-transformers>=2.2.2 transformers==4.51.1 vllm==0.9.2 diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 79a837ef9..a052dffac 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -40,7 +40,7 @@ try: # Semantic cache integration from vllm_router.services.request_service.request import ( - route_general_transcriptions + route_general_transcriptions, ) _route_general_transcriptions = True @@ -274,6 +274,7 @@ async def health() -> Response: if _route_general_transcriptions: + @main_router.post("/v1/audio/transcriptions") async def route_v1_audio_transcriptions( request: Request, background_tasks: BackgroundTasks