diff --git a/community/ai-vws-sizing-advisor/CHANGELOG.md b/community/ai-vws-sizing-advisor/CHANGELOG.md index 2e665a686..9fae02c9a 100644 --- a/community/ai-vws-sizing-advisor/CHANGELOG.md +++ b/community/ai-vws-sizing-advisor/CHANGELOG.md @@ -3,6 +3,46 @@ All notable changes to this project will be documented in this file. The format is based on Keep a Changelog, and this project adheres to Semantic Versioning. +## [2.3] - 2026-01-08 + +This release focuses on improved sizing recommendations, enhanced Nemotron model integration, and comprehensive documentation updates. + +### Added +- **Demo Screenshots** — Added visual examples showcasing the Configuration Wizard, RAG-powered sizing recommendations, and Local Deployment verification +- **Official Documentation Link** — Added link to [NVIDIA vGPU Docs Hub](https://docs.nvidia.com/vgpu/toolkits/sizing-advisor/latest/intro.html) in README + +### Changed +- **README Overhaul** — Reorganized documentation to highlight NVIDIA Nemotron models + - Llama-3.3-Nemotron-Super-49B powers the RAG backend + - Nemotron-3 Nano 30B (FP8) as default for workload sizing + - New Demo section with screenshots demonstrating key features + +- **Sizing Recommendation Improvements** + - Enhanced 95% usable capacity rule for profile selection (5% reserved for system overhead) + - Improved profile selection logic: picks smallest profile where (profile × 0.95) >= workload + - Better handling of edge cases near profile boundaries + +- **GPU Passthrough Logic** + - Automatic passthrough recommendation when workload exceeds max single vGPU profile + - Clearer passthrough examples in RAG context (e.g., 92GB on BSE → 2× BSE GPU passthrough) + - Calculator now returns `vgpu_profile: null` with multi-GPU passthrough recommendation + +- **vLLM Local Deployment** + - Updated to vLLM v0.12.0 for proper NemotronH (hybrid Mamba-Transformer) architecture support + - Improved GPU memory utilization calculations for local testing + - Better max-model-len auto-detection (only set when explicitly specified) + +- **Chat Improvements** + - Enhanced conversational mode with vGPU configuration context + - Better model extraction from sizing responses for follow-up questions + - Improved context handling for RAG vs inference workload discussions + +### Improved +- **Nemotron Model Integration** + - Default model changed to Nemotron-3 Nano 30B FP8 in configuration wizard + - Nemotron thinking prompt support for enhanced reasoning + - Better model matching for Nemotron variants in calculator + ## [2.2] - 2025-11-04 ### Changed diff --git a/community/ai-vws-sizing-advisor/README.md b/community/ai-vws-sizing-advisor/README.md index d63dc9bfe..b12830350 100644 --- a/community/ai-vws-sizing-advisor/README.md +++ b/community/ai-vws-sizing-advisor/README.md @@ -1,18 +1,67 @@ # AI vWS Sizing Advisor +

+ AI vWS Sizing Advisor +

+ +

+ RAG-powered vGPU sizing recommendations for AI Virtual Workstations
+ Powered by NVIDIA NeMo™ and Nemotron models +

+ +

+ Official Documentation • + Demo • + Quick Start • + Changelog +

+ +--- + ## Overview AI vWS Sizing Advisor is a RAG-powered tool that helps you determine the optimal NVIDIA vGPU sizing configuration for AI workloads on NVIDIA AI Virtual Workstation (AI vWS). Using NVIDIA vGPU documentation and best practices, it provides tailored recommendations for optimal performance and resource efficiency. +### Powered by NVIDIA Nemotron + +This tool leverages **NVIDIA Nemotron models** for intelligent sizing recommendations: + +- **[Llama-3.3-Nemotron-Super-49B](https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1)** — Powers the RAG backend for intelligent conversational sizing guidance +- **[Nemotron-3 Nano 30B](https://build.nvidia.com/nvidia/nvidia-nemotron-3-nano-30b-a3b-fp8)** — Default model for workload sizing calculations (FP8 optimized) + +### Key Capabilities + Enter your workload requirements and receive validated recommendations including: -- **vGPU Profile** - Recommended profile (e.g., L40S-24Q) based on your workload -- **Resource Requirements** - vCPUs, GPU memory, system RAM needed -- **Performance Estimates** - Expected latency, throughput, and time to first token -- **Live Testing** - Instantly deploy and validate your configuration locally using vLLM containers +- **vGPU Profile** — Recommended profile (e.g., L40S-24Q) based on your workload +- **Resource Requirements** — vCPUs, GPU memory, system RAM needed +- **Performance Estimates** — Expected latency, throughput, and time to first token +- **Live Testing** — Instantly deploy and validate your configuration locally using vLLM containers The tool differentiates between RAG and inference workloads by accounting for embedding vectors and database overhead. It intelligently suggests GPU passthrough when jobs exceed standard vGPU profile limits. +--- + +## Demo + +### Configuration Wizard + +Configure your workload parameters including model selection, GPU type, quantization, and token sizes: + +

+ Configuration Wizard +
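For reference, the wizard distills these selections into a small set of workload parameters before generating the sizing query. The sketch below is illustrative only: the field names mirror the state in `WorkloadConfigWizard.tsx`, and the values are simply the wizard defaults rather than a recommendation.

```typescript
// Illustrative shape of the parameters collected by the Configuration Wizard.
// Values shown are the wizard defaults: Nemotron-3 Nano 30B (FP8) on one RTX Pro 6000 BSE.
interface WorkloadSelection {
  workloadType: "inference" | "rag";
  specificModel: string;                // e.g. "nemotron-30b-fp8"
  promptSize: number;                   // input tokens
  responseSize: number;                 // output tokens
  precision: "fp16" | "fp8" | "fp4";    // quantization / numeric precision
  gpuInventory: Record<string, number>; // GPU family -> count, e.g. { BSE: 1 }
  // RAG workloads additionally carry embeddingModel, vectorDimension and numberOfVectors.
}

const exampleSelection: WorkloadSelection = {
  workloadType: "inference",
  specificModel: "nemotron-30b-fp8",
  promptSize: 1024,
  responseSize: 256,
  precision: "fp8",
  gpuInventory: { BSE: 1 },
};

console.log(exampleSelection);
```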

+ +### Local Deployment Verification + +Validate your configuration by deploying a vLLM container locally and comparing actual GPU memory usage against estimates: + +

+ Local Deployment +
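The estimate side of this comparison follows the same 95% usable-capacity rule used for profile selection: 5% of VRAM is reserved for system overhead, and the advisor picks the smallest profile whose usable capacity still covers the workload, falling back to GPU passthrough when nothing fits. A minimal sketch of that rule is shown below; the profile names and capacities are illustrative examples, not an exhaustive list.

```typescript
// Minimal sketch of the 95% usable-capacity rule (5% reserved for system overhead).
// Profile names and capacities are illustrative examples only.
const PROFILE_CAPACITY_GB: Record<string, number> = {
  "L4-24Q": 24,
  "L40S-48Q": 48,
  "BSE-96Q": 96,
};

function smallestFittingProfile(requiredVramGb: number): string | null {
  const fitting = Object.entries(PROFILE_CAPACITY_GB)
    // keep profiles where 95% of the VRAM still covers the workload
    .filter(([, capacityGb]) => capacityGb * 0.95 >= requiredVramGb)
    // smallest qualifying capacity first
    .sort(([, a], [, b]) => a - b);
  return fitting.length > 0 ? fitting[0][0] : null; // null -> recommend GPU passthrough
}

console.log(smallestFittingProfile(20)); // "L4-24Q"  (24 * 0.95 = 22.8 GB usable)
console.log(smallestFittingProfile(92)); // null      (96 * 0.95 = 91.2 GB < 92 GB, so passthrough)
```

The local deployment step then reports the GPU memory the vLLM container actually consumes, so the estimated and observed numbers can be compared side by side.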

+ +--- + ## Prerequisites ### Hardware @@ -44,8 +93,10 @@ docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi > **Note:** Docker must be at `/usr/bin/docker` (verified in `deploy/compose/docker-compose-rag-server.yaml`). User must be in docker group or have socket permissions. ### API Keys -- **NVIDIA Build API Key** (Required) - [Get your key](https://build.nvidia.com/settings/api-keys) -- **HuggingFace Token** (Optional) - [Create token](https://huggingface.co/settings/tokens) for gated models +- **NVIDIA Build API Key** (Required) — [Get your key](https://build.nvidia.com/settings/api-keys) +- **HuggingFace Token** (Optional) — [Create token](https://huggingface.co/settings/tokens) for gated models + +--- ## Deployment @@ -74,28 +125,32 @@ npm install npm run dev ``` +--- + ## Usage -2. **Select Workload Type:** RAG or Inference +1. **Select Workload Type:** RAG or Inference -3. **Enter Parameters:** - - Model name (e.g., `meta-llama/Llama-2-7b-chat-hf`) +2. **Enter Parameters:** + - Model name (default: **Nemotron-3 Nano 30B FP8**) - GPU type - Prompt size (input tokens) - Response size (output tokens) - - Quantization (FP16, INT8, INT4) + - Quantization (FP16, FP8, INT8, INT4) - For RAG: Embedding model and vector dimensions -4. **View Recommendations:** +3. **View Recommendations:** - Recommended vGPU profiles - Resource requirements (vCPUs, RAM, GPU memory) - Performance estimates -5. **Test Locally** (optional): +4. **Test Locally** (optional): - Run local inference with a containerized vLLM server - View performance metrics - Compare actual results versus suggested profile configuration +--- + ## Management Commands ```bash @@ -120,6 +175,8 @@ The stop script automatically performs Docker cleanup operations: - Optionally removes dangling images (`--cleanup-images`) - Optionally removes all data volumes (`--volumes`) +--- + ## Adding Documents to RAG Context The tool includes NVIDIA vGPU documentation by default. To add your own: @@ -134,8 +191,7 @@ curl -X POST -F "file=@./vgpu_docs/your-document.pdf" http://localhost:8082/v1/i **Supported formats:** PDF, TXT, DOCX, HTML, PPTX - - +--- ## License @@ -145,6 +201,6 @@ Models governed by [NVIDIA AI Foundation Models Community License](https://docs. 
--- -**Version:** 2.2 (November 2025) - See [CHANGELOG.md](./CHANGELOG.md) +**Version:** 2.3 (January 2026) — See [CHANGELOG.md](./CHANGELOG.md) -**Support:** [GitHub Issues](https://github.com/NVIDIA/GenerativeAIExamples/issues) | [NVIDIA Forums](https://forums.developer.nvidia.com/) \ No newline at end of file +**Support:** [GitHub Issues](https://github.com/NVIDIA/GenerativeAIExamples/issues) | [NVIDIA Forums](https://forums.developer.nvidia.com/) | [Official Docs](https://docs.nvidia.com/vgpu/toolkits/sizing-advisor/latest/intro.html) \ No newline at end of file diff --git a/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml index 24eb910ba..a163c3680 100644 --- a/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml +++ b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-ingestor-server.yaml @@ -1,3 +1,11 @@ +# ============================================================================ +# CENTRALIZED MODEL CONFIGURATION +# Change these values to use different models throughout the application +# ============================================================================ +x-model-config: + # Embedding Model Configuration + embedding-model: &embedding-model "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1" + services: # Main ingestor server which is responsible for ingestion @@ -38,10 +46,14 @@ services: NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} ##===Embedding Model specific configurations=== + # Model name - pulls from centralized config at top of file (can be overridden by env var) + APP_EMBEDDINGS_MODELNAME: *embedding-model # url on which embedding model is hosted. If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-mistral-7b-v2} - APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-"nemoretriever-embedding-ms:8000"} + # Embedding dimensions - IMPORTANT: Must match your embedding model! 
+ # nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1: 4096 + # nvidia/nv-embedqa-mistral-7b-v2: 2048 + APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-4096} ##===NV-Ingest Connection Configurations======= APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} @@ -115,9 +127,10 @@ services: - AUDIO_INFER_PROTOCOL=grpc - CUDA_VISIBLE_DEVICES=0 - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} - - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-7b-v2}} + # Embedding model - uses APP_EMBEDDINGS_MODELNAME which pulls from centralized config + - EMBEDDING_NIM_MODEL_NAME=${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} + - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-http://nemoretriever-embedding-ms:8000/v1} - INGEST_LOG_LEVEL=DEFAULT - INGEST_EDGE_BUFFER_SIZE=64 # Message client for development diff --git a/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml index 7beba493d..69bfdd194 100644 --- a/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml +++ b/community/ai-vws-sizing-advisor/deploy/compose/docker-compose-rag-server.yaml @@ -1,3 +1,14 @@ +# ============================================================================ +# CENTRALIZED MODEL CONFIGURATION +# Change these values to use different models throughout the application +# ============================================================================ +x-model-config: + # Chat/LLM Model Configuration + llm-model: &llm-model "nvidia/llama-3.3-nemotron-super-49b-v1" + + # Embedding Model Configuration + embedding-model: &embedding-model "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1" + services: # Main orchestrator server which stiches together all calls to different services to fulfill the user request @@ -35,25 +46,16 @@ services: VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100} ##===LLM Model specific configurations=== - APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama-3.1-8b-instruct"} + # Model name - pulls from centralized config at top of file (can be overridden by env var) + APP_LLM_MODELNAME: *llm-model # url on which llm model is hosted. If "", Nvidia hosted API is used - APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-""} - - ##===Query Rewriter Model specific configurations=== - APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"} - # url on which query rewriter model is hosted. If "", Nvidia hosted API is used - APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b-ms:8000"} + APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-""} ##===Embedding Model specific configurations=== + # Model name - pulls from centralized config at top of file (can be overridden by env var) + APP_EMBEDDINGS_MODELNAME: *embedding-model # url on which embedding model is hosted. If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-""} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-mistral-7b-v2} - - ##===Reranking Model specific configurations=== - # url on which ranking model is hosted. 
If "", Nvidia hosted API is used - APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-""} - APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-nv-rerank-qa-mistral-4b:1} - ENABLE_RERANKER: ${ENABLE_RERANKER:-True} + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-""} NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} @@ -65,7 +67,7 @@ services: # enable multi-turn conversation in the rag chain - this controls conversation history usage # while doing query rewriting and in LLM prompt - ENABLE_MULTITURN: ${ENABLE_MULTITURN:-False} + ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} # enable query rewriting for multiturn conversation in the rag chain. # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call @@ -139,10 +141,10 @@ services: context: ../../frontend dockerfile: ./Dockerfile args: - # Model name for LLM - NEXT_PUBLIC_MODEL_NAME: ${APP_LLM_MODELNAME:-meta/llama-3.1-8b-instruct} - # Model name for embeddings - NEXT_PUBLIC_EMBEDDING_MODEL: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-mistral-7b-v2} + # Model name for LLM - pulls from centralized config at top of file + NEXT_PUBLIC_MODEL_NAME: *llm-model + # Model name for embeddings - pulls from centralized config at top of file + NEXT_PUBLIC_EMBEDDING_MODEL: *embedding-model # Model name for reranking NEXT_PUBLIC_RERANKER_MODEL: ${APP_RANKING_MODELNAME:-nv-rerank-qa-mistral-4b:1} # URL for rag server container diff --git a/community/ai-vws-sizing-advisor/deploy/compose/model_config.env b/community/ai-vws-sizing-advisor/deploy/compose/model_config.env new file mode 100644 index 000000000..fa46dc00a --- /dev/null +++ b/community/ai-vws-sizing-advisor/deploy/compose/model_config.env @@ -0,0 +1,82 @@ +# ============================================================================ +# CENTRALIZED MODEL CONFIGURATION +# ============================================================================ +# This file centralizes all model configurations for the RAG system. +# Source this file or set these environment variables to change models. 
+# +# Usage: +# source model_config.env +# docker compose -f docker-compose-rag-server.yaml up +# +# ============================================================================ + +# ---------------------------------------------------------------------------- +# CHAT/LLM MODEL CONFIGURATION +# ---------------------------------------------------------------------------- +# The main language model used for generating responses +# Default: nvidia/llama-3.3-nemotron-super-49b-v1 +# +# Other options: +# - meta/llama-3.1-405b-instruct +# - meta/llama-3.1-70b-instruct +# - meta/llama-3.1-8b-instruct +# - mistralai/mixtral-8x22b-instruct-v0.1 +# +export APP_LLM_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1" + +# LLM Server URL (leave empty "" to use NVIDIA hosted API) +export APP_LLM_SERVERURL="" + +# ---------------------------------------------------------------------------- +# EMBEDDING MODEL CONFIGURATION +# ---------------------------------------------------------------------------- +# The embedding model used for vectorizing documents and queries +# Default: nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1 +# +# Other options: +# - nvidia/nv-embedqa-mistral-7b-v2 +# - nvidia/nv-embed-v2 +# - nvidia/llama-3.2-nv-embedqa-1b-v2 +# +export APP_EMBEDDINGS_MODELNAME="nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1" + +# Embedding Server URL (leave empty "" to use NVIDIA hosted API, or set to self-hosted) +# Example for self-hosted: "nemoretriever-embedding-ms:8000" +export APP_EMBEDDINGS_SERVERURL="" + +# Embedding dimensions (adjust based on your embedding model) +# IMPORTANT: This MUST match your chosen embedding model! +# - nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1: 4096 (current default) +# - nvidia/nv-embedqa-mistral-7b-v2: 2048 +# - nvidia/nv-embed-v2: 4096 +export APP_EMBEDDINGS_DIMENSIONS="4096" + +# ---------------------------------------------------------------------------- +# REFLECTION MODEL CONFIGURATION (for response quality checking) +# ---------------------------------------------------------------------------- +# Model used for reflection/self-checking if ENABLE_REFLECTION=true +export REFLECTION_LLM="mistralai/mixtral-8x22b-instruct-v0.1" +export REFLECTION_LLM_SERVERURL="nim-llm-mixtral-8x22b:8000" + +# ---------------------------------------------------------------------------- +# CAPTION MODEL CONFIGURATION (for image/chart understanding) +# ---------------------------------------------------------------------------- +# Model used for generating captions for images, charts, and tables +export APP_NVINGEST_CAPTIONMODELNAME="meta/llama-3.2-11b-vision-instruct" +export APP_NVINGEST_CAPTIONENDPOINTURL="http://vlm-ms:8000/v1/chat/completions" +export VLM_CAPTION_MODEL_NAME="meta/llama-3.2-11b-vision-instruct" +export VLM_CAPTION_ENDPOINT="http://vlm-ms:8000/v1/chat/completions" + +# ---------------------------------------------------------------------------- +# ADDITIONAL NOTES +# ---------------------------------------------------------------------------- +# 1. After changing models, you may need to rebuild containers: +# docker compose -f docker-compose-rag-server.yaml build --no-cache rag-playground +# +# 2. For self-hosted models, make sure the corresponding NIM services are running +# +# 3. The embedding dimensions must match your chosen embedding model +# +# 4. 
When switching between hosted and self-hosted, update both the model name +# and the server URL accordingly + diff --git a/community/ai-vws-sizing-advisor/deployment_examples/configuration_wizard.png b/community/ai-vws-sizing-advisor/deployment_examples/configuration_wizard.png new file mode 100644 index 000000000..8734e4abe Binary files /dev/null and b/community/ai-vws-sizing-advisor/deployment_examples/configuration_wizard.png differ diff --git a/community/ai-vws-sizing-advisor/deployment_examples/example_rag_config.png b/community/ai-vws-sizing-advisor/deployment_examples/example_rag_config.png new file mode 100644 index 000000000..0625c77bf Binary files /dev/null and b/community/ai-vws-sizing-advisor/deployment_examples/example_rag_config.png differ diff --git a/community/ai-vws-sizing-advisor/deployment_examples/local_deployment.png b/community/ai-vws-sizing-advisor/deployment_examples/local_deployment.png new file mode 100644 index 000000000..d54b7eff6 Binary files /dev/null and b/community/ai-vws-sizing-advisor/deployment_examples/local_deployment.png differ diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx index 8dc3e59df..927e5e40d 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx +++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/Chat.tsx @@ -20,6 +20,7 @@ import RightSidebar from "../RightSidebar/RightSidebar"; import VGPUConfigCard from "./VGPUConfigCard"; import WorkloadConfigWizard from "./WorkloadConfigWizard"; import ApplyConfigurationForm from "./ApplyConfigurationForm"; +import ChatPanel from "../RightSidebar/ChatPanel"; import { v4 as uuidv4 } from "uuid"; import { API_CONFIG } from "@/app/config/api"; import { marked } from "marked"; @@ -32,10 +33,17 @@ export default function Chat() { const { activePanel, toggleSidebar, setActiveCitations } = useSidebar(); const [messages, setMessages] = useState([]); const [isWizardOpen, setIsWizardOpen] = useState(false); - const [expandedConfigId, setExpandedConfigId] = useState(null); const [isApplyFormOpen, setIsApplyFormOpen] = useState(false); const [applyFormConfig, setApplyFormConfig] = useState(null); const [showPassthroughError, setShowPassthroughError] = useState(false); + const [lastVGPUConfig, setLastVGPUConfig] = useState(null); // Track last vGPU config for context + const [showChatPanel, setShowChatPanel] = useState(false); // Show inline chat panel + const [chatPanelHistory, setChatPanelHistory] = useState; + }>>([]); + const [isChatPanelLoading, setIsChatPanelLoading] = useState(false); const { streamState, processStream, startStream, resetStream, stopStream } = useChatStream(); @@ -86,6 +94,32 @@ export default function Chat() { } }, [messages, activePanel, setActiveCitations]); + // Separate effect to extract vGPU config (only depends on messages, not activePanel) + useEffect(() => { + const lastMessage = messages[messages.length - 1]; + if (lastMessage && lastMessage.role === "assistant" && lastMessage.content) { + try { + const parsed = JSON.parse(lastMessage.content.trim()); + if (parsed.title === "generate_vgpu_config" && parsed.parameters) { + // Only reset chat history if this is a NEW config (different from last one) + setLastVGPUConfig((prevConfig: any) => { + const prevProfileId = prevConfig?.parameters?.vgpu_profile || prevConfig?.parameters?.vGPU_profile; + const newProfileId = parsed.parameters?.vgpu_profile || 
parsed.parameters?.vGPU_profile; + + // Only reset chat history if this is actually a new config + if (prevProfileId !== newProfileId || !prevConfig) { + setChatPanelHistory([]); + } + + return parsed; + }); + } + } catch { + // Not a JSON config, ignore + } + } + }, [messages]); + const handleSubmit = async (message: string) => { if (!message.trim()) return; @@ -134,9 +168,11 @@ export default function Chat() { const renderMessageContent = (content: string, isTyping: boolean, messageId: string) => { if (isTyping) { return ( -
-
- Generating configuration... +
+
+
+ Generating configuration... +
); } @@ -145,86 +181,135 @@ export default function Chat() { if (isVGPUConfig(content)) { try { const vgpuConfig = JSON.parse(content.trim()); - const configId = messageId; - const isExpanded = expandedConfigId === configId; - // Return a preview card with inline expandable details + // Return a preview card with inline details AND chat panel (always expanded) return ( -
-
- - - -

vGPU Configuration Ready

-
- -

- {vgpuConfig.description.split(/(Inference|RAG|inference|rag)/gi).map((part: string, i: number) => - /^(Inference|RAG|inference|rag)$/i.test(part) ? ( - {part} - ) : part - )} -

- - {(vgpuConfig.parameters.vgpu_profile || vgpuConfig.parameters.vGPU_profile) && ( -
- Profile: - {vgpuConfig.parameters.vgpu_profile || vgpuConfig.parameters.vGPU_profile} - {vgpuConfig.parameters.gpu_memory_size && ( - <> - - Memory: - {vgpuConfig.parameters.gpu_memory_size} GB - - )} -
- )} - - {/* Configuration Details Toggle Button */} - - - {/* Inline Configuration Details */} - {isExpanded && ( -
- +
+
+
+
+ + + +

vGPU Configuration Suggestion

- )} - {/* Verify Configuration Button */} -
- + {/* Advanced Details - Full width below both panels */} +
+ +
+
+ + {/* Divider Line */} +
+ + {/* Action Buttons - Side by Side */} +
+ + + {/* Size Another Configuration Button */} + +
+
); } catch (error) { console.error("Error parsing vGPU config:", error); @@ -260,6 +345,203 @@ export default function Chat() { timestamp: new Date().toISOString(), }); + const handleChatPanelMessage = async (message: string) => { + if (!lastVGPUConfig) return; + + setIsChatPanelLoading(true); + setChatPanelHistory((prev) => [...prev, { role: "user", content: message }]); + + // Extract configuration details for context + const profileId = lastVGPUConfig.parameters?.vgpu_profile || lastVGPUConfig.parameters?.vGPU_profile || 'GPU Passthrough'; + const gpuMemory = lastVGPUConfig.parameters?.gpu_memory_size || 'N/A'; + const vcpuCount = lastVGPUConfig.parameters?.vcpu_count || lastVGPUConfig.parameters?.vCPU_count || 'N/A'; + const systemRAM = lastVGPUConfig.parameters?.system_RAM || lastVGPUConfig.parameters?.RAM || 'N/A'; + const precision = lastVGPUConfig.parameters?.precision || 'FP8'; + + // RAG-specific fields + const ragBreakdown = lastVGPUConfig.parameters?.rag_breakdown || {}; + const ragConfig = lastVGPUConfig.parameters?.rag_config || {}; + const embeddingModel = lastVGPUConfig.parameters?.embedding_model + || ragConfig.embedding_model + || ragBreakdown.embedding_model + || ''; + const vectorDbVectors = lastVGPUConfig.parameters?.vector_db_vectors + || ragConfig.total_vectors + || ragBreakdown.vector_db_vectors + || ''; + const vectorDbDimension = lastVGPUConfig.parameters?.vector_db_dimension + || ragConfig.vector_dimension + || ragBreakdown.vector_db_dimension + || ''; + const embeddingMemory = ragBreakdown.embedding_memory || ''; + const vectorDbMemory = ragBreakdown.vector_db_memory || ''; + const isRagWorkload = lastVGPUConfig.description?.toLowerCase().includes('rag') || !!embeddingModel; + + // Get model tag + let modelTag = lastVGPUConfig.parameters?.model_tag || lastVGPUConfig.parameters?.model_name || ''; + if (!modelTag && lastVGPUConfig.description) { + const patterns = [/inference of ([^\s(]+)/i, /for RAG \(([^)]+)\)/i, /(Nemotron[^\s(]+)/i, /(Llama[^\s(]+)/i]; + for (const p of patterns) { + const m = lastVGPUConfig.description.match(p); + if (m) { modelTag = m[1].trim(); break; } + } + } + modelTag = modelTag || 'N/A'; + + // Get model parameter count + const getModelParams = (tag: string): string => { + const t = tag.toLowerCase(); + if (t.includes('30b')) return '30 billion'; + if (t.includes('70b')) return '70 billion'; + if (t.includes('8b')) return '8 billion'; + if (t.includes('7b')) return '7 billion'; + if (t.includes('3b')) return '3 billion'; + if (t.includes('1b')) return '1 billion'; + if (t.includes('49b')) return '49 billion (Mixture of Experts)'; + return 'unknown'; + }; + const modelParams = getModelParams(modelTag); + + // Build workload context + let workloadContext = ''; + if (isRagWorkload) { + workloadContext = ` +This is a RAG (Retrieval-Augmented Generation) workload: +- LLM Model: ${modelTag} (${modelParams} parameters, ${precision} precision) +- Embedding Model: ${embeddingModel}${embeddingMemory ? ` (requires ${embeddingMemory})` : ''} +${vectorDbVectors ? `- Vector Database: ${vectorDbVectors} vectors` : ''} +${vectorDbDimension ? `- Vector Dimension: ${vectorDbDimension}D` : ''} +${vectorDbMemory ? `- Vector DB Memory: ${vectorDbMemory}` : ''}`; + } else { + workloadContext = ` +This is an Inference workload: +- Model: ${modelTag} (${modelParams} parameters, ${precision} precision)`; + } + + // Create context message for RAG server + const contextMessage = `You are a helpful AI assistant. 
Answer the user's question directly and conversationally. + +Context - The user is asking about this vGPU configuration: +- Profile: ${profileId} | GPU Memory: ${gpuMemory}GB | vCPUs: ${vcpuCount} | RAM: ${systemRAM}GB +${workloadContext} + +CRITICAL INSTRUCTIONS: +- Answer in plain text ONLY. NO JSON. NO structured output. +- Use your general knowledge about LLMs, GPUs, and AI to answer questions +- If asked about the profile: Explain vGPU naming (e.g., BSE-24Q = BSE GPU with 24GB VRAM, Q suffix = time-sliced vGPU) +- For RAG questions: Explain how the embedding model and LLM work together +- Use retrieved documentation to support your answers when relevant +- Be concise and helpful`; + + try { + const chatTemperature = Math.min(temperature + 0.1, 1.0); + + const requestBody: GenerateRequest = { + messages: [ + { role: "system", content: contextMessage }, + ...chatPanelHistory.map(msg => ({ role: msg.role, content: msg.content })), + { role: "user", content: message } + ], + collection_name: "vgpu_knowledge_base", + temperature: chatTemperature, + top_p: topP, + reranker_top_k: rerankerTopK, + vdb_top_k: vdbTopK, + confidence_threshold: confidenceScoreThreshold, + use_knowledge_base: true, + enable_citations: true, + enable_query_rewriting: true, + enable_reranker: true, + enable_guardrails: useGuardrails, + conversational_mode: true, + }; + + if (process.env.NEXT_PUBLIC_MODEL_NAME) { + requestBody.model = process.env.NEXT_PUBLIC_MODEL_NAME; + } + if (process.env.NEXT_PUBLIC_EMBEDDING_MODEL) { + requestBody.embedding_model = process.env.NEXT_PUBLIC_EMBEDDING_MODEL; + } + if (process.env.NEXT_PUBLIC_RERANKER_MODEL) { + requestBody.reranker_model = process.env.NEXT_PUBLIC_RERANKER_MODEL; + } + + const response = await fetch("/api/generate", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(requestBody), + }); + + if (!response.ok) throw new Error("RAG server error"); + + const reader = response.body?.getReader(); + if (!reader) throw new Error("No response body"); + + let assistantMsg = ""; + let citations: Array<{ text: string; source: string; document_type: string }> = []; + const decoder = new TextDecoder(); + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + const chunk = decoder.decode(value); + const lines = chunk.split("\n"); + for (const line of lines) { + if (line.startsWith("data: ")) { + try { + const data = JSON.parse(line.slice(6)); + if (data.choices?.[0]?.delta?.content) { + assistantMsg += data.choices[0].delta.content; + } + if (data.citations && Array.isArray(data.citations)) { + citations = data.citations; + } + } catch (e) {} + } + } + } + + // Process response - handle JSON structured output and plain text + let finalMessage = assistantMsg || "No response"; + try { + const trimmed = assistantMsg.trim(); + if (trimmed.startsWith('{') || trimmed.startsWith('[')) { + const parsed = JSON.parse(trimmed); + if (parsed.title && parsed.parameters) { + if (parsed.description && !parsed.description.includes('generate_vgpu_config')) { + let desc = parsed.description; + if (/^(BSE|L40S?|A40|L4)\s+with\s+vGPU\s+profile/i.test(desc)) { + finalMessage = `The ${profileId} profile provides ${gpuMemory}GB of GPU memory. This configuration is sized for running ${modelTag}. 
Is there something specific you'd like to know about this setup?`; + } else { + finalMessage = desc; + } + } else { + finalMessage = `Based on your configuration (${profileId} with ${gpuMemory}GB), I can help answer questions about the profile, model requirements, or performance expectations. What would you like to know?`; + } + } else if (parsed.description) { + finalMessage = parsed.description; + } + } + } catch (e) { + // Not JSON - use as is (this is the expected case for chat responses) + } + + setChatPanelHistory((prev) => [...prev, { + role: "assistant", + content: finalMessage, + citations: citations.length > 0 ? citations : undefined + }]); + } catch (error) { + console.error("Chat panel error:", error); + setChatPanelHistory((prev) => [...prev, { + role: "assistant", + content: "Error from rag-server. Please check rag-server logs for more details." + }]); + } finally { + setIsChatPanelLoading(false); + } + }; + const createRequestBody = (userMessage: ChatMessage) => { // Create base request body - always use the vGPU knowledge base const requestBody: GenerateRequest = { @@ -376,59 +658,66 @@ export default function Chat() { }; return ( -
+
- -
-
-
- {messages.map((msg) => ( -
+
+
+ {/* Show centered button when no messages */} + {messages.length === 0 ? ( +
+ +
+ ) : ( +
+
+ {messages.map((msg) => ( +
+
+
+ {msg.content + ? renderMessageContent(msg.content, false, msg.id) + : msg.role === "assistant" && streamState.isTyping + ? renderMessageContent("", true, msg.id) + : ""} +
+
-
+ ))} +
- ))} -
-
-
- -
-
- -
+
+ )}
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx index 54889e8b8..940ce5571 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx +++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx @@ -15,7 +15,32 @@ "use client"; -import { useState, ReactNode, useRef } from "react"; +import { useState, ReactNode } from "react"; + +// Tooltip trigger component - displays content in card's bottom banner +const TooltipTrigger = ({ + content, + children, + onShow, + onHide +}: { + content: string; + children: ReactNode; + onShow: (content: string) => void; + onHide: () => void; +}) => { + return ( +
+
onShow(content)} + onMouseLeave={onHide} + className="cursor-help" + > + {children} +
+
+ ); +}; interface VGPUConfig { title: string; @@ -25,6 +50,7 @@ interface VGPUConfig { gpu_model?: string | null; vcpu_count?: number | null; gpu_memory_size?: number | null; + gpu_count?: number | null; system_RAM?: number | null; concurrent_users?: number | null; rag_breakdown?: { @@ -37,6 +63,9 @@ interface VGPUConfig { vector_db_dimension?: number; reranker_model?: string; reranker_memory?: string; + // Token configuration + prompt_size?: number; + response_size?: number; }; // Legacy fields for backward compatibility (to be removed) vGPU_profile?: string | null; @@ -61,33 +90,10 @@ interface VGPUConfig { interface VGPUConfigCardProps { config: VGPUConfig; + hideAdvancedDetails?: boolean; + showOnlyAdvancedDetails?: boolean; } -// Tooltip trigger component - displays content in card's bottom banner -const TooltipTrigger = ({ - content, - children, - onShow, - onHide -}: { - content: string; - children: ReactNode; - onShow: (content: string) => void; - onHide: () => void; -}) => { - return ( -
-
onShow(content)} - onMouseLeave={onHide} - className="cursor-help" - > - {children} -
-
- ); -}; - // Parameter definitions for tooltips - detailed explanations for users const parameterDefinitions: { [key: string]: string } = { vgpu_profile: "The specific NVIDIA vGPU profile (e.g., L40S-24Q, BSE-48Q) that partitions the physical GPU. The number indicates VRAM in GB, and 'Q' means it's optimized for compute workloads.", @@ -102,6 +108,7 @@ const parameterDefinitions: { [key: string]: string } = { time_to_first_token: "Time from request start until the first output token is generated (TTFT). Critical for streaming responses and perceived responsiveness. Heavily influenced by prompt length.", throughput: "Number of tokens the system can generate per second across all concurrent requests. Higher throughput means better overall capacity and efficiency.", model_tag: "The specific LLM model identifier (e.g., meta-llama/Llama-3-8b-instruct). Used to determine model size, architecture, and memory requirements.", + precision: "Numerical precision for model inference. FP16 (16-bit) offers high accuracy with moderate memory. FP8 (8-bit) reduces memory by ~50% with minimal accuracy loss. FP4 (4-bit) offers maximum memory savings for inference-only workloads.", vector_db_vectors: "Total number of document embeddings stored in the vector database. More vectors = larger knowledge base but requires more memory for the vector index.", vector_db_dimension: "Dimensionality of each embedding vector (determined by the embedding model). Common dimensions: 384, 768, 1024, 1536. Higher dimensions capture more semantic information but require more memory.", // Legacy fields (kept for backward compatibility) @@ -167,6 +174,19 @@ const ParameterIcon = ({ type, className = "w-4 h-4" }: { type: string; classNam } }; +// Helper to darken a color +const darkenColor = (color: string, amount: number = 0.4): string => { + // Handle hex colors + if (color.startsWith('#')) { + const hex = color.slice(1); + const r = Math.max(0, Math.floor(parseInt(hex.slice(0, 2), 16) * (1 - amount))); + const g = Math.max(0, Math.floor(parseInt(hex.slice(2, 4), 16) * (1 - amount))); + const b = Math.max(0, Math.floor(parseInt(hex.slice(4, 6), 16) * (1 - amount))); + return `#${r.toString(16).padStart(2, '0')}${g.toString(16).padStart(2, '0')}${b.toString(16).padStart(2, '0')}`; + } + return color; +}; + // Circular Progress Chart Component const VRAMUsageChart = ({ usedVRAM, @@ -179,19 +199,31 @@ const VRAMUsageChart = ({ numGPUs: number; gpuModel?: string; }) => { - const percentage = Math.min((usedVRAM / totalVRAM) * 100, 100); + // Calculate percentages - usable is 95% of total (5% reserved for system overhead) + const usableVRAM = totalVRAM * 0.95; + const reservedVRAM = totalVRAM * 0.05; + const percentage = Math.min((usedVRAM / usableVRAM) * 100, 100); + const reservedPercentage = 5; // Fixed 5% reserved for system overhead + // Cap the used percentage at 95% so overhead segment is always visible + // The dial shows: 0-95% for actual usage, 95-100% for reserved overhead + const usedPercentageOfTotal = Math.min((usedVRAM / totalVRAM) * 100, 95); + const radius = 80; const strokeWidth = 12; const normalizedRadius = radius - strokeWidth * 2; const circumference = normalizedRadius * 2 * Math.PI; - const strokeDashoffset = circumference - (percentage / 100) * circumference; - // Determine fit category and color - const getFitCategory = (pct: number): { label: string; color: string; bgColor: string; textColor: string } => { + // Calculate stroke offsets - overhead segment starts at 95% position + const usedStrokeDashoffset = 
circumference - (usedPercentageOfTotal / 100) * circumference; + const reservedStrokeDashoffset = circumference - (95 / 100) * circumference; + + // Determine fit category and color based on usable percentage + const getFitCategory = (pct: number): { label: string; color: string; darkColor: string; bgColor: string; textColor: string } => { if (pct >= 90) { return { label: "TIGHT", color: "#ef4444", // red-500 + darkColor: darkenColor("#ef4444", 0.5), // darker red for overhead bgColor: "rgba(239, 68, 68, 0.1)", // red with opacity textColor: "#fca5a5" // red-300 }; @@ -199,6 +231,7 @@ const VRAMUsageChart = ({ return { label: "MODERATE", color: "#76b900", // NVIDIA green + darkColor: darkenColor("#76b900", 0.5), // darker green for overhead bgColor: "rgba(118, 185, 0, 0.1)", // green with opacity textColor: "#a3e635" // lime-400 }; @@ -206,6 +239,7 @@ const VRAMUsageChart = ({ return { label: "COMFORTABLE", color: "#10b981", // emerald-500 + darkColor: darkenColor("#10b981", 0.5), // darker emerald for overhead bgColor: "rgba(16, 185, 129, 0.1)", // emerald with opacity textColor: "#6ee7b7" // emerald-300 }; @@ -240,19 +274,43 @@ const VRAMUsageChart = ({ cx={radius} cy={radius} /> - {/* Progress circle */} + {/* Used VRAM circle */} + {/* Reserved/Overhead segment (5% - striped pattern to stand out) */} + + {/* Overhead indicator line (subtle marker at 95% position) */} + {/* Center text */}
@@ -279,14 +337,14 @@ const VRAMUsageChart = ({ {/* Usage details */}
- {usedVRAM.toFixed(1)} GB + {usedVRAM.toFixed(1)} GB + {reservedVRAM.toFixed(1)} overhead
of {totalVRAM.toFixed(0)} GB VRAM
{numGPUs > 1 && (
- ({numGPUs}× {gpuModel || 'GPU'} GPUs with {(totalVRAM / numGPUs).toFixed(0)}GB each) + ({numGPUs}× {gpuModel || 'GPU'} with {(totalVRAM / numGPUs).toFixed(0)}GB each)
)}
@@ -320,7 +378,7 @@ const getIconType = (key: string): string => { } }; -export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { +export default function VGPUConfigCard({ config, hideAdvancedDetails = false, showOnlyAdvancedDetails = false }: VGPUConfigCardProps) { const [isExpanded, setIsExpanded] = useState(true); const [showAdvancedDetails, setShowAdvancedDetails] = useState(false); const [showRawJSON, setShowRawJSON] = useState(false); @@ -451,6 +509,8 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { return 'Performance Tier'; case 'concurrent_users': return 'Concurrent Users'; + case 'precision': + return 'Precision'; default: return key.replace(/_/g, ' ').replace(/^./, str => str.toUpperCase()); } @@ -458,8 +518,8 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { const isRelevantConfig = Object.values(config.parameters).some(value => value !== null && value !== undefined); - // Fields to exclude from display - const excludedFields = ['total_CPU_count', 'total_cpu_count', 'rag_breakdown']; + // Fields to exclude from display (RAG-specific fields are shown in RAG Components section) + const excludedFields = ['total_CPU_count', 'total_cpu_count', 'rag_breakdown', 'rag_config', 'gpu_count', 'gpu_model', 'embedding_model', 'vector_db_vectors', 'vector_db_dimension']; // Separate key and advanced parameters, excluding unwanted fields const keyParams = Object.entries(config.parameters).filter(([key]) => @@ -538,7 +598,9 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { // For passthrough, use 95% usable capacity (reserve 5% for driver/OS overhead) const usablePerGpu = gpuCapacity * 0.95; - const numGPUs = Math.ceil(estimatedVRAM / usablePerGpu); + // Use gpu_count from backend if available (backend has already calculated this correctly) + const backendGpuCount = config.parameters.gpu_count; + const numGPUs = backendGpuCount && backendGpuCount >= 1 ? backendGpuCount : Math.ceil(estimatedVRAM / usablePerGpu); return { used: estimatedVRAM, @@ -553,8 +615,65 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { const singleGPUCapacity = getGPUCapacityFromProfile(profile); if (!singleGPUCapacity) return null; - // Calculate number of GPUs needed (ceiling) - const numGPUs = Math.ceil(estimatedVRAM / singleGPUCapacity); + // Use gpu_count from backend if available AND valid + // Validate that backend gpu_count provides enough capacity for required VRAM + const backendGpuCount = config.parameters.gpu_count; + let numGPUs: number; + let needsLargerProfile = false; + + // Small epsilon for floating point comparison (0.1 GB tolerance) + const EPSILON = 0.1; + + if (backendGpuCount && backendGpuCount >= 1) { + const backendCapacity = backendGpuCount * singleGPUCapacity; + // Check if backend calculation provides enough capacity (with 5% headroom) + const usableBackendCapacity = backendCapacity * 0.95; + // Use epsilon for floating point comparison (24 * 0.95 = 22.799999... not 22.8) + if (usableBackendCapacity + EPSILON >= estimatedVRAM) { + numGPUs = backendGpuCount; + } else { + // Backend config is invalid - recalculate based on usable capacity per GPU + const usablePerGpu = singleGPUCapacity * 0.95; + numGPUs = Math.ceil(estimatedVRAM / (usablePerGpu + EPSILON)); + console.warn(`Backend gpu_count (${backendGpuCount}) insufficient for ${estimatedVRAM}GB. 
Recalculated to ${numGPUs}.`); + } + } else { + // No backend count - calculate ourselves with 5% headroom per GPU + const usablePerGpu = singleGPUCapacity * 0.95; + numGPUs = Math.ceil(estimatedVRAM / (usablePerGpu + EPSILON)); + } + + // Check if there's a larger vGPU profile available that could reduce GPU count + // Only show warning if: 1) numGPUs > 1, 2) it's a vGPU profile, AND 3) a larger profile exists + if (numGPUs > 1 && profile) { + // Get max profile capacity for this GPU family + const gpuFamily = profile.split('-')[0]; + const maxProfiles: { [key: string]: number } = { + 'BSE': 96, + 'L40S': 48, + 'L40': 48, + 'A40': 48, + 'L4': 24 + }; + const maxProfileCapacity = maxProfiles[gpuFamily] || singleGPUCapacity; + + // Only flag as needing larger profile if: + // 1. Current profile is NOT the largest available for this GPU family, AND + // 2. A larger profile would actually reduce the number of GPUs needed + if (singleGPUCapacity < maxProfileCapacity) { + const usableMaxProfile = maxProfileCapacity * 0.95; + const gpusNeededWithMaxProfile = Math.ceil(estimatedVRAM / usableMaxProfile); + + // Only show warning if upgrading to max profile would reduce GPU count + if (gpusNeededWithMaxProfile < numGPUs) { + needsLargerProfile = true; + console.warn(`Consider ${gpuFamily}-${maxProfileCapacity}Q profile which would only need ${gpusNeededWithMaxProfile} GPU(s) instead of ${numGPUs}.`); + } + } + // Note: If already using largest profile, multi-GPU vGPU is the correct solution + // No warning needed - this is expected behavior for large workloads + } + // Calculate total capacity across all GPUs const totalCapacity = numGPUs * singleGPUCapacity; @@ -563,20 +682,85 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { total: totalCapacity, numGPUs: numGPUs, singleGPUCapacity: singleGPUCapacity, - isPassthrough: false + isPassthrough: false, + needsLargerProfile: needsLargerProfile }; }; const vramUsage = getVRAMUsageData(); + // If showing only advanced details, render just that section + if (showOnlyAdvancedDetails) { + return ( +
+ {advancedParams.length > 0 && ( +
+ + + {showAdvancedDetails && ( +
+
+
+ {advancedParams.map(([key, value], index) => ( +
+
+
+ +
+
+
+ + {getParameterLabel(key)} + + {parameterDefinitions[key] && ( +
+ + + +
+

+ {parameterDefinitions[key]} +

+
+
+ )} +
+ + {formatParameterValue(key, value)} + +
+
+
+ ))} +
+
+
+ )} +
+ )} +
+ ); + } + return ( -
+
{/* Content */} {isExpanded && ( -
+
{/* Host Capabilities Context */} {config.host_capabilities && ( -
+
@@ -608,12 +792,12 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
)} -
+
{/* VRAM Usage Chart / JSON View */} {vramUsage && ( -
+
{/* Header - Always visible */} -
+

@@ -627,7 +811,7 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { -
+

{config.rationale || "This configuration balances performance and resource efficiency for your specific AI workload, ensuring optimal GPU utilization while maintaining cost-effectiveness."}

@@ -698,16 +882,20 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { gpuModel={vramUsage.gpuModel} />
-
-
-
Configuration Summary
-
+
+
+
Configuration Summary
+
- Required VRAM: + Required VRAM: {vramUsage.used.toFixed(1)} GB
- GPU Profile: + 5% Reserved Overhead: + {(vramUsage.total * 0.05).toFixed(1)} GB +
+
+ GPU Profile: {config.parameters.vgpu_profile || config.parameters.vGPU_profile || GPU Passthrough Required @@ -715,11 +903,16 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
- GPUs Required: + GPUs Required: {vramUsage.numGPUs}
+ {vramUsage.needsLargerProfile && ( +
+ ⚠️ Consider a larger vGPU profile or GPU passthrough. Multi-GPU vGPU profiles typically require separate VMs. +
+ )}
- Total Capacity: + Total Capacity: {vramUsage.total.toFixed(0)} GB
@@ -727,174 +920,154 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {

Utilization Guidelines

    -
  • - +
  • + Comfortable (0-60%): Ideal for production with room for growth
  • -
  • - +
  • + Moderate (60-90%): Efficient utilization with performance buffer
  • -
  • - +
  • + Tight (90-100%): Consider larger GPU profile or additional units
- - )} -
- )} - - {/* Key Parameters Section */} - {keyParams.length > 0 && ( -
-

Key Parameters

-
-
-
- {keyParams.map(([key, value], index) => ( -
-
-
- -
-
- - {getParameterLabel(key)} + + {/* Key Parameters Section - Inside VRAM Analysis */} + {keyParams.length > 0 && ( +
+

Key Parameters

+
+
+
+ {keyParams.map(([key, value], index) => ( +
+
+
+ +
+
+ + {getParameterLabel(key)} + + {parameterDefinitions[key] && ( +
+ + + +
+

+ {parameterDefinitions[key]} +

+
+
+ )} +
+
+ + {formatParameterValue(key, value)} - {parameterDefinitions[key] && ( - setKeyParamsTooltip(null)} - > - - - - - )}
-
- - {formatParameterValue(key, value)} - + ))}
- ))} -
-
- - {/* Tooltip Banner for Key Parameters */} - {keyParamsTooltip && ( -
-
- - - -
{keyParamsTooltip}
- )} -
+
+ )} + + )}
)} - {/* RAG Components Breakdown - Only show for RAG workloads */} + {/* RAG Components - Only show for RAG workloads */} {config.parameters.rag_breakdown && config.parameters.rag_breakdown.workload_type === 'rag' && (config.parameters.rag_breakdown.embedding_model || config.parameters.rag_breakdown.vector_db_memory) && ( -
-

+
+

- RAG Components Memory + RAG Components

-
-
+
+ {/* Embedding Model and Vector Database side by side - compact */} +
{/* Embedding Model */} {config.parameters.rag_breakdown.embedding_model && ( -
-
-
Embedding Model
-
- {config.parameters.rag_breakdown.embedding_model} +
+
+
Embedding Model
+
+ {config.parameters.rag_breakdown.embedding_model.split('/').pop()}
{config.parameters.rag_breakdown.vector_db_dimension && ( -
- Output: {config.parameters.rag_breakdown.vector_db_dimension}D vectors +
+ {config.parameters.rag_breakdown.vector_db_dimension}D output
)}
-
-
- {config.parameters.rag_breakdown.embedding_memory} -
+
+ {config.parameters.rag_breakdown.embedding_memory}
)} {/* Vector Database */} {config.parameters.rag_breakdown.vector_db_memory && ( -
-
-
Vector Database Index
-
- {config.parameters.rag_breakdown.vector_db_vectors && - config.parameters.rag_breakdown.vector_db_dimension && ( - <> -
- {config.parameters.rag_breakdown.vector_db_vectors >= 10000000 ? 'Extra Large' : - config.parameters.rag_breakdown.vector_db_vectors >= 1000000 ? 'Large' : - config.parameters.rag_breakdown.vector_db_vectors >= 100000 ? 'Medium' : 'Small'} -
-
- {config.parameters.rag_breakdown.vector_db_vectors.toLocaleString()} vectors × {config.parameters.rag_breakdown.vector_db_dimension}D -
- - )} - {(!config.parameters.rag_breakdown.vector_db_vectors || - !config.parameters.rag_breakdown.vector_db_dimension) && ( - Index memory - )} -
+
+
+
Vector Database
+ {config.parameters.rag_breakdown.vector_db_vectors && + config.parameters.rag_breakdown.vector_db_dimension ? ( + <> +
+ {config.parameters.rag_breakdown.vector_db_vectors >= 10000000 ? 'Extra Large' : + config.parameters.rag_breakdown.vector_db_vectors >= 1000000 ? 'Large' : + config.parameters.rag_breakdown.vector_db_vectors >= 100000 ? 'Medium' : 'Small'} +
+
+ {config.parameters.rag_breakdown.vector_db_vectors.toLocaleString()} × {config.parameters.rag_breakdown.vector_db_dimension}D +
+ + ) : ( + Index memory + )}
-
-
- {config.parameters.rag_breakdown.vector_db_memory} -
+
+ {config.parameters.rag_breakdown.vector_db_memory}
)} +
- {/* Reranker Model */} - {config.parameters.rag_breakdown.reranker_model && ( -
-
-
Reranker Model
-
- {config.parameters.rag_breakdown.reranker_model} -
-
-
-
- {config.parameters.rag_breakdown.reranker_memory} -
+ {/* Reranker Model - full width below if present */} + {config.parameters.rag_breakdown.reranker_model && ( +
+
+
Reranker
+
+ {config.parameters.rag_breakdown.reranker_model}
- )} -
+
+ {config.parameters.rag_breakdown.reranker_memory} +
+
+ )}
)} {/* Advanced Details - Collapsible */} - {advancedParams.length > 0 && ( + {!hideAdvancedDetails && advancedParams.length > 0 && (
))} - - {/* Add RAG-specific vector DB details */} - {config.parameters.rag_breakdown?.vector_db_vectors && ( -
-
-
- -
-
-
- - Vector DB Vectors - - {parameterDefinitions['vector_db_vectors'] && ( - setAdvancedTooltip(null)} - > - - - - - )} -
- - {config.parameters.rag_breakdown.vector_db_vectors.toLocaleString()} - -
-
-
- )} - - {config.parameters.rag_breakdown?.vector_db_dimension && ( -
-
-
- -
-
-
- - Vector Dimension - - {parameterDefinitions['vector_db_dimension'] && ( - setAdvancedTooltip(null)} - > - - - - - )} -
- - {config.parameters.rag_breakdown.vector_db_dimension}D - -
-
-
- )}
@@ -1025,7 +1135,7 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { {/* Notes/Recommendations */} {config.notes && config.notes.length > 0 && ( -
+
@@ -1044,7 +1154,7 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) { {/* No config warning */} {!isRelevantConfig && ( -
+
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx index d776eef14..8c87f9e4f 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx +++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx @@ -52,14 +52,14 @@ export default function WorkloadConfigWizard({ }: WorkloadConfigWizardProps) { const [config, setConfig] = useState({ workloadType: "", - specificModel: "", + specificModel: "nemotron-30b-fp8", modelSize: "", batchSize: "", promptSize: "1024", responseSize: "256", embeddingModel: "nvidia/nvolveqa-embed-large-1B", - gpuInventory: { "DC": 1 }, - precision: "fp16", + gpuInventory: { "BSE": 1 }, + precision: "fp8", vectorDimension: "1024", // Default to 1024 (matches default embedding model) numberOfVectors: "10000", // Default to 10,000 advancedConfig: { @@ -89,23 +89,34 @@ export default function WorkloadConfigWizard({ const data = await response.json(); if (data.models && data.models.length > 0) { // Use modelTag as value to ensure uniqueness (full model ID like "org/model-name") - const formattedModels = data.models.map((model: any) => ({ - value: model.modelTag.toLowerCase().replace(/\//g, '-').replace(/\./g, '-'), - label: model.label, - modelTag: model.modelTag - })); - setDynamicModels(formattedModels); - console.log(`✓ Successfully loaded ${formattedModels.length} models from HuggingFace`); + const formattedModels = data.models + .filter((model: any) => model && model.modelTag) // Filter out invalid models + .map((model: any) => ({ + value: (model.modelTag || '').toLowerCase().replace(/\//g, '-').replace(/\./g, '-'), + label: model.label || model.modelTag || 'Unknown Model', + modelTag: model.modelTag + })); + // Always prepend Nemotron as the first/default option + const nemotronModel = { + value: "nemotron-30b-fp8", + label: "NVIDIA Nemotron-3 Nano 30B", + modelTag: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8" + }; + setDynamicModels([nemotronModel, ...formattedModels]); + console.log(`✓ Successfully loaded ${formattedModels.length + 1} models (including Nemotron)`); } else { - console.warn('No models returned from API'); + console.warn('No models returned from API, using fallback'); + setIsLoadingModels(false); } } else { console.warn('API returned non-OK status:', response.status); + setIsLoadingModels(false); } } catch (error) { console.error('Failed to fetch dynamic models:', error); console.log('Using fallback model list'); // Fallback to hardcoded models will be used + setIsLoadingModels(false); } finally { setIsLoadingModels(false); } @@ -146,15 +157,16 @@ export default function WorkloadConfigWizard({ ]; const availableGPUInventory = [ - { value: "DC", label: "NVIDIA RTX Pro 6000 BSE", desc: "96GB GDDR7 with ECC, Blackwell, passive‑cooled dual‑slot PCIe Gen5 – Enterprise AI/graphics, scientific computing & virtual workstations" }, - { value: "l40s", label: "NVIDIA L40S", desc: "48GB GDDR6 with ECC, Ada Lovelace, 350W - ML training & inference + virtual workstations" }, - { value: "l40", label: "NVIDIA L40", desc: "48GB GDDR6 with ECC, Ada Lovelace - Virtual workstations & compute workloads" }, - { value: "l4", label: "NVIDIA L4", desc: "24GB GDDR6 with ECC, Ada Lovelace, 72W - AI inference, small model training & 3D graphics" }, - { value: "a40", label: "NVIDIA A40", desc: "48GB GDDR6 with ECC, Ampere, 300W - 3D design & mixed virtual 
workstation workloads" }, + { value: "BSE", label: "NVIDIA RTX Pro 6000 BSE", desc: "96GB GDDR7 with ECC, Blackwell, passive‑cooled dual‑slot PCIe Gen5 – Enterprise AI/graphics, scientific computing & virtual workstations" }, + { value: "L40S", label: "NVIDIA L40S", desc: "48GB GDDR6 with ECC, Ada Lovelace, 350W - ML training & inference + virtual workstations" }, + { value: "L40", label: "NVIDIA L40", desc: "48GB GDDR6 with ECC, Ada Lovelace - Virtual workstations & compute workloads" }, + { value: "L4", label: "NVIDIA L4", desc: "24GB GDDR6 with ECC, Ada Lovelace, 72W - AI inference, small model training & 3D graphics" }, + { value: "A40", label: "NVIDIA A40", desc: "48GB GDDR6 with ECC, Ampere, 300W - 3D design & mixed virtual workstation workloads" }, ]; // Fallback hardcoded models in case dynamic fetch fails const fallbackModels = [ + { value: "nemotron-30b-fp8", label: "NVIDIA Nemotron-3 Nano 30B", modelTag: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8" }, { value: "llama-3-8b", label: "Llama-3-8B", modelTag: "meta-llama/Meta-Llama-3-8B-Instruct" }, { value: "llama-3-70b", label: "Llama-3-70B", modelTag: "meta-llama/Meta-Llama-3-70B-Instruct" }, { value: "llama-3.1-8b", label: "Llama-3.1-8B", modelTag: "meta-llama/Llama-3.1-8B-Instruct" }, @@ -170,8 +182,8 @@ export default function WorkloadConfigWizard({ const specificModels = dynamicModels.length > 0 ? dynamicModels : fallbackModels; const precisionOptions = [ - { value: "fp16", label: "FP16", desc: "Half precision - Recommended balance of performance and accuracy" }, - { value: "fp8", label: "FP8", desc: "8-bit floating point - Higher performance with good accuracy" }, + { value: "fp8", label: "FP8", desc: "8-bit floating point - Recommended for best performance with good accuracy" }, + { value: "fp16", label: "FP16", desc: "Half precision - Higher accuracy, more memory usage" }, { value: "fp4", label: "FP4", desc: "4-bit floating point - Maximum performance, lower accuracy" }, ]; @@ -358,7 +370,7 @@ export default function WorkloadConfigWizard({ parts.push(`with ${precisionLabel} precision`); } else { // Recommended precision - parts.push(`with FP16 precision`); + parts.push(`with FP8 precision`); } // Add retrieval configuration for RAG workloads @@ -382,7 +394,20 @@ export default function WorkloadConfigWizard({ // Determine the model tag to use let modelTagToUse = null; if (config.specificModel && config.specificModel !== 'unknown') { + // First try to find in dynamic/fallback models modelTagToUse = specificModels.find(m => m.value === config.specificModel)?.modelTag || null; + + // Hardcoded fallback for common models if lookup fails + if (!modelTagToUse) { + const modelTagFallbacks: Record = { + 'nemotron-30b-fp8': 'nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8', + 'llama-3-8b': 'meta-llama/Meta-Llama-3-8B-Instruct', + 'llama-3-70b': 'meta-llama/Meta-Llama-3-70B-Instruct', + 'llama-3.1-8b': 'meta-llama/Llama-3.1-8B-Instruct', + 'llama-3.1-70b': 'meta-llama/Llama-3.3-70B-Instruct', + }; + modelTagToUse = modelTagFallbacks[config.specificModel] || config.specificModel; + } } else if (config.specificModel === 'unknown' && config.modelSize) { // Use default model for the size category modelTagToUse = getDefaultModelForSize(config.modelSize).modelTag; @@ -398,14 +423,14 @@ export default function WorkloadConfigWizard({ responseSize: config.responseSize ? parseInt(config.responseSize) : 256, embeddingModel: config.workloadType === 'rag' ? 
(config.embeddingModel || getRecommendedEmbeddingModel()) : null, gpuInventory: config.gpuInventory, - precision: config.precision || 'fp16', + precision: config.precision || 'fp8', // Add retrieval config for RAG ...(config.workloadType === 'rag' && { vectorDimension: config.vectorDimension ? parseInt(config.vectorDimension) : null, numberOfVectors: config.numberOfVectors ? parseInt(config.numberOfVectors) : null, }), // Add computed values for easier backend processing - selectedGPU: Object.keys(config.gpuInventory)[0] || 'DC', + selectedGPU: Object.keys(config.gpuInventory)[0] || 'BSE', gpuCount: Object.values(config.gpuInventory)[0] as number || 1, // Include advanced configuration advancedConfig: config.advancedConfig, @@ -419,17 +444,17 @@ export default function WorkloadConfigWizard({ const query = generateQuery(); onSubmit(query); onClose(); - // Reset form + // Reset form - use same defaults as initial state setConfig({ workloadType: "", - specificModel: "", + specificModel: "nemotron-30b-fp8", // Default to Nemotron model modelSize: "", batchSize: "", promptSize: "1024", responseSize: "256", embeddingModel: "nvidia/nvolveqa-embed-large-1B", - gpuInventory: { "DC": 1 }, - precision: "fp16", + gpuInventory: { "BSE": 1 }, + precision: "fp8", vectorDimension: "1024", // Default to 1024 (matches default embedding model) numberOfVectors: "10000", // Default to 10,000 advancedConfig: { @@ -548,13 +573,10 @@ export default function WorkloadConfigWizard({ className="w-full p-3 rounded-lg bg-neutral-800 border border-neutral-600 text-white mb-4" disabled={isLoadingModels} > - - {specificModels.map((model) => ( ))} + {!isLoadingModels && dynamicModels.length > 0 && (

+ ✓ {dynamicModels.length} models loaded from HuggingFace

@@ -705,12 +727,12 @@ export default function WorkloadConfigWizard({

GPU Selection

setInputMessage(e.target.value)} + placeholder="Ask a question about your configuration..." + disabled={isLoading} + className="flex-1 min-w-0 rounded-lg bg-neutral-800/50 border border-neutral-700 px-4 py-2 text-sm text-white placeholder-gray-500 focus:outline-none focus:border-[#76b900] disabled:opacity-50" + /> + + +
+
+ ); +} + diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx index e057db93b..8ac8ac64b 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx +++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx @@ -17,9 +17,22 @@ import { useEffect, useState } from "react"; import Citations from "./Citations"; +import ChatPanel from "./ChatPanel"; import { useSidebar } from "../../context/SidebarContext"; -export default function RightSidebar() { +interface RightSidebarProps { + vgpuConfig?: any; + onSendChatMessage?: (message: string) => void; + chatHistory?: Array<{ role: "user" | "assistant"; content: string }>; + isChatLoading?: boolean; +} + +export default function RightSidebar({ + vgpuConfig, + onSendChatMessage, + chatHistory = [], + isChatLoading = false, +}: RightSidebarProps) { const { activePanel, closeSidebar, activeCitations } = useSidebar(); const [displayPanel, setDisplayPanel] = useState(activePanel); @@ -34,27 +47,56 @@ export default function RightSidebar() { } }, [activePanel]); + const getPanelTitle = () => { + if (displayPanel === "citations") return "Citations"; + if (displayPanel === "chat") return "Configuration Chat"; + return ""; + }; + return (
-
-

- Citations -

- -
-
- + {displayPanel !== "chat" && ( +
+

+ {getPanelTitle()} +

+ +
+ )} +
+ {displayPanel === "citations" && ( +
+ +
+ )} + {displayPanel === "chat" && onSendChatMessage && ( +
+ + +
+ )}
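The sidebar changes above add a third "chat" panel that forwards follow-up questions about the current configuration to the backend; the `conversational_mode` flag it relies on is added to `GenerateRequest` and handled in `rag_chain_with_multiturn` later in this diff. As a rough sketch only (the function names and payload handling below are illustrative, not the project's actual server code), the branching works like this: structured sizing requests keep the JSON pipeline, while chat-panel questions stream plain text.

```python
# Hypothetical routing sketch; names and payload shape are illustrative only.
from typing import Any, Dict, Iterator


def handle_structured_sizing(payload: Dict[str, Any]) -> Iterator[str]:
    """Structured path: stream the JSON vGPU recommendation (title/description/parameters)."""
    yield '{"title": "generate_vgpu_config", "parameters": {"vgpu_profile": "BSE-48Q"}}'


def handle_conversational(payload: Dict[str, Any]) -> Iterator[str]:
    """Conversational path: stream plain text grounded in the user's current configuration."""
    yield "Your BSE-48Q profile keeps roughly 5% of GPU memory free for system overhead."


def route_generate(payload: Dict[str, Any]) -> Iterator[str]:
    # Chat-panel follow-ups set conversational_mode=True; wizard submissions leave it unset.
    if payload.get("conversational_mode"):
        return handle_conversational(payload)
    return handle_structured_sizing(payload)
```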
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx index ee083cd89..e4f07dacc 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx +++ b/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx @@ -19,9 +19,9 @@ import { createContext, useContext, useState, ReactNode } from "react"; import { Citation } from "@/types/chat"; interface SidebarContextType { - activePanel: "citations" | "settings" | null; + activePanel: "citations" | "settings" | "chat" | null; activeCitations: Citation[]; - toggleSidebar: (panel: "citations" | "settings") => void; + toggleSidebar: (panel: "citations" | "settings" | "chat") => void; closeSidebar: () => void; setActiveCitations: (citations: Citation[]) => void; } @@ -30,11 +30,11 @@ const SidebarContext = createContext(undefined); export function SidebarProvider({ children }: { children: ReactNode }) { const [activePanel, setActivePanel] = useState< - "citations" | "settings" | null + "citations" | "settings" | "chat" | null >(null); const [activeCitations, setActiveCitations] = useState([]); - const toggleSidebar = (panel: "citations" | "settings") => { + const toggleSidebar = (panel: "citations" | "settings" | "chat") => { setActivePanel(activePanel === panel ? null : panel); }; diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/globals.css b/community/ai-vws-sizing-advisor/frontend/src/app/globals.css index c099bf4b3..1c3a98c54 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/app/globals.css +++ b/community/ai-vws-sizing-advisor/frontend/src/app/globals.css @@ -20,6 +20,30 @@ body { } } +/* Green scrollbar only for chat messages area */ +.chat-scrollbar::-webkit-scrollbar { + width: 10px; +} + +.chat-scrollbar::-webkit-scrollbar-track { + background: #252525; + border-radius: 5px; +} + +.chat-scrollbar::-webkit-scrollbar-thumb { + background: #76b900; + border-radius: 5px; +} + +.chat-scrollbar::-webkit-scrollbar-thumb:hover { + background: #5a8c00; +} + +.chat-scrollbar { + scrollbar-width: thin; + scrollbar-color: #76b900 #252525; +} + @keyframes typing { 0% { content: ""; diff --git a/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts b/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts index ec60a8207..d2eff0e43 100644 --- a/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts +++ b/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts @@ -62,4 +62,5 @@ export interface GenerateRequest { reranker_model?: string; reranker_endpoint?: string; stop?: string[]; + conversational_mode?: boolean; } diff --git a/community/ai-vws-sizing-advisor/scripts/start_app.sh b/community/ai-vws-sizing-advisor/scripts/start_app.sh index 4d6624fe2..c3c79cbc0 100755 --- a/community/ai-vws-sizing-advisor/scripts/start_app.sh +++ b/community/ai-vws-sizing-advisor/scripts/start_app.sh @@ -90,6 +90,14 @@ docker_login() { setup_environment() { print_info "Setting up environment..." 
+ # Source centralized model configuration first (highest priority) + if [ -f "$COMPOSE_DIR/model_config.env" ]; then + set -a + source "$COMPOSE_DIR/model_config.env" + set +a + print_status "Loaded centralized model configuration" + fi + # Source .env file if [ -f "$COMPOSE_DIR/.env" ]; then set -a @@ -253,6 +261,11 @@ show_status() { echo " • Ingestor API: http://localhost:8082" echo " • Milvus: http://localhost:9011" echo "" + echo -e "${BLUE}🤖 AI Models:${NC}" + echo " • Chat/LLM: ${APP_LLM_MODELNAME:-nvidia/llama-3.3-nemotron-super-49b-v1}" + echo " • Embedding: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}" + echo " • Config File: deploy/compose/model_config.env" + echo "" echo -e "${BLUE}📚 Knowledge Base:${NC}" echo " • Collection: vgpu_knowledge_base" echo " • Location: ./vgpu_docs" @@ -274,6 +287,7 @@ show_status() { echo " • Stop Backend: ./scripts/stop_app.sh" echo " • Restart App: ./scripts/restart_app.sh" echo " • Logs: docker logs -f rag-server" + echo " • Change Models: Edit deploy/compose/model_config.env" echo "" } diff --git a/community/ai-vws-sizing-advisor/src/apply_configuration.py b/community/ai-vws-sizing-advisor/src/apply_configuration.py index 497baaea2..3874a17bf 100644 --- a/community/ai-vws-sizing-advisor/src/apply_configuration.py +++ b/community/ai-vws-sizing-advisor/src/apply_configuration.py @@ -112,7 +112,11 @@ def calculate_gpu_memory_utilization( return 0.9 # Use recommended workload size if provided, otherwise extract from profile + # IMPORTANT: Add KV cache to workload - the calculator provides model memory only workload_memory_gb = recommended_workload_gb + if workload_memory_gb and kv_cache_gb: + workload_memory_gb = recommended_workload_gb + kv_cache_gb + logger.info(f" Total workload = {recommended_workload_gb}GB (model) + {kv_cache_gb:.2f}GB (KV cache) = {workload_memory_gb:.2f}GB") if not workload_memory_gb: # Extract profile memory size from vGPU profile name (e.g., "DC-12Q" → 12) @@ -753,6 +757,7 @@ def run_command(cmd: str, shell: bool = True) -> tuple: # Build docker command - only include max-model-len if specified # Note: gpu_util may exceed 0.90 intentionally - vLLM will adapt KV cache to available memory + # Use vLLM v0.12.0+ for proper NemotronH (hybrid Mamba-Transformer) architecture support docker_cmd_parts = [ "docker run -d --runtime nvidia --gpus all", f"--name {container_name}", @@ -760,9 +765,10 @@ def run_command(cmd: str, shell: bool = True) -> tuple: f'-e "HUGGING_FACE_HUB_TOKEN={hf_token}"', "-p 8000:8000", "--ipc=host", - "vllm/vllm-openai:latest", + "vllm/vllm-openai:v0.12.0", f"--model {model}", - f"--gpu-memory-utilization {gpu_util:.2f}" + f"--gpu-memory-utilization {gpu_util:.2f}", + "--trust-remote-code" ] # Only add max-model-len if explicitly specified (let vLLM auto-detect otherwise) diff --git a/community/ai-vws-sizing-advisor/src/calculator.py b/community/ai-vws-sizing-advisor/src/calculator.py index 901010aaf..fcc0c6683 100644 --- a/community/ai-vws-sizing-advisor/src/calculator.py +++ b/community/ai-vws-sizing-advisor/src/calculator.py @@ -378,6 +378,8 @@ def _initialize_model_specs(self) -> List[ModelSpec]: ModelSpec(name="Falcon-40B", params_billion=40, d_model=8192, n_layers=60), ModelSpec(name="Falcon-180B", params_billion=180, d_model=14848, n_layers=80), ModelSpec(name="Qwen-14B", params_billion=14, d_model=5120, n_layers=40), + # NVIDIA Nemotron model - 30B parameters + ModelSpec(name="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", params_billion=30, d_model=8192, n_layers=48), ] 
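The `apply_configuration.py` hunk above folds the KV-cache estimate into the workload before deriving `--gpu-memory-utilization` for the vLLM container. A minimal sketch of that arithmetic, assuming the denominator is the vGPU profile size and using illustrative clamp values (the real helper handles additional edge cases):

```python
def estimate_gpu_memory_utilization(model_memory_gb: float,
                                    kv_cache_gb: float,
                                    profile_memory_gb: float) -> float:
    """Fraction of the vGPU profile that vLLM should claim via --gpu-memory-utilization."""
    total_workload_gb = model_memory_gb + kv_cache_gb  # calculator reports model memory only
    utilization = total_workload_gb / profile_memory_gb
    # Clamp to an illustrative range: keep a floor so small models still get a usable
    # KV cache, and allow values near 0.90 or above -- vLLM adapts to what is free.
    return round(min(max(utilization, 0.30), 0.95), 2)


# Example: ~30 GB of FP8 weights plus ~7.5 GB of KV cache on a 48Q profile
print(estimate_gpu_memory_utilization(30.0, 7.5, 48.0))  # -> 0.78
```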
# Try to dynamically fetch popular models from HuggingFace @@ -566,11 +568,25 @@ def get_available_gpus(self) -> List[str]: def _find_model(self, model_name: str) -> Optional[ModelSpec]: """Find model specification by name, fetching from HuggingFace if not found""" - # First, try to find in existing specs + # First, try exact match in existing specs for model in self.model_specs: if model.name == model_name or model.name.lower() == model_name.lower(): return model + # Try partial match for common patterns + lower_name = model_name.lower() + for model in self.model_specs: + lower_model_name = model.name.lower() + # Check if model name appears in query OR query appears in model name + # e.g., "nemotron-30b-fp8" in "nvidia/nvidia-nemotron-3-nano-30b-a3b-fp8" OR vice versa + if lower_model_name in lower_name or lower_name in lower_model_name: + logging.info(f"Partial match found: '{model.name}' matches '{model_name}'") + return model + # Check for Nemotron patterns specifically (handles "nemotron-30b-fp8" -> "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8") + if 'nemotron' in lower_name and 'nemotron' in lower_model_name: + logging.info(f"Nemotron match found: '{model.name}' for '{model_name}'") + return model + # If not found, try to create it dynamically from HuggingFace logging.info(f"Model '{model_name}' not in cache, attempting to fetch from HuggingFace") @@ -594,7 +610,32 @@ def _find_model(self, model_name: str) -> Optional[ModelSpec]: except Exception as e: logging.warning(f"Could not extract model from '{model_name}': {e}") - return None + # Final fallback: Try to create a model spec from the name using parameter extraction + # This handles cases like "Llama-3-70B-Custom" where we can extract "70B" + params_billion = extract_model_params_from_name(model_name) + if params_billion: + logging.info(f"Creating dynamic model spec for '{model_name}' with {params_billion}B params") + estimated = estimate_model_spec_from_params(params_billion, model_name) + fallback_spec = ModelSpec( + name=model_name, + params_billion=params_billion, + n_layers=estimated.get('n_layers', 32), + d_model=estimated.get('d_model', 4096), + max_context_length=32768 + ) + self.model_specs.append(fallback_spec) + return fallback_spec + + # Absolute last resort: Use a default 8B model spec + logging.warning(f"Could not determine model specs for '{model_name}', using default 8B model") + default_spec = ModelSpec( + name=model_name, + params_billion=8.0, + n_layers=32, + d_model=4096, + max_context_length=32768 + ) + return default_spec def _find_gpu(self, gpu_name: str) -> Optional[GPUSpec]: """Find GPU specification by name""" @@ -649,65 +690,57 @@ def _get_available_profiles(self, gpu_family: str) -> List[int]: def _recommend_vgpu_profile(self, total_memory_needed: float, gpu_family: str, safety_buffer_gb: float = 0.0) -> Dict[str, Any]: """ - Recommend vGPU profile based on total memory needed with 5% headroom. + Recommend vGPU profile based on total memory needed with 5% headroom reserve. - CRITICAL RULE: Pick the SMALLEST profile where (profile × 0.95) >= total_memory_needed - This reserves 5% headroom to avoid running at 100% capacity. + CRITICAL RULE: Use vGPU profiles ONLY when workload fits in a SINGLE profile. + If workload exceeds max single profile capacity, use GPU passthrough. Logic: - - If total > (max_profile × 0.95): recommend passthrough - - Otherwise: find smallest profile where (profile × 0.95) >= total_memory_needed + 1. 
If workload fits in single profile (workload ≤ profile × 0.95): use smallest fitting profile + 2. If workload > max_profile × 0.95: recommend passthrough with N GPUs """ + import math physical_memory = self._get_physical_gpu_memory(gpu_family) available_profiles = self._get_available_profiles(gpu_family) # Get max profile for this GPU family max_profile = max(available_profiles) if available_profiles else physical_memory - max_profile_usable = max_profile * 0.95 # 95% usable capacity - - # Check if we need passthrough (exceeds max vGPU profile with 5% headroom) - if total_memory_needed > max_profile_usable: - # Calculate GPUs needed for passthrough - # Even with passthrough, reserve ~5% for driver/OS overhead to avoid running at 100% - import math - usable_per_gpu = physical_memory * 0.95 # 95% usable capacity per GPU - num_gpus_needed = math.ceil(total_memory_needed / usable_per_gpu) - return { - "type": "passthrough", - "profile": None, - "gpu_count": num_gpus_needed, - "profile_memory_gb": physical_memory, - "total_memory_available": physical_memory * num_gpus_needed, - "recommendation": f"{num_gpus_needed}x {gpu_family} passthrough (no vGPU profile)", - "reason": f"Workload requires {total_memory_needed:.1f}GB but max vGPU profile usable capacity is {max_profile_usable:.1f}GB ({max_profile}GB × 0.95). GPU passthrough provides ~95% usable capacity ({usable_per_gpu:.1f}GB per {physical_memory}GB GPU)." - } + max_profile_usable = max_profile * 0.95 # 95% usable capacity (5% reserved) + usable_physical = physical_memory * 0.95 # 95% usable physical GPU memory - # Find smallest profile where (profile × 0.95) >= total_memory_needed + # Try to find smallest single profile that fits recommended_profile = None - for profile_size in sorted(available_profiles): - usable_capacity = profile_size * 0.95 # 5% headroom + usable_capacity = profile_size * 0.95 # 5% headroom reserved if usable_capacity >= total_memory_needed: recommended_profile = profile_size break - # If no profile found (shouldn't happen), use largest - if recommended_profile is None: - recommended_profile = max(available_profiles) - warning = f"Warning: No profile with enough capacity for {total_memory_needed:.1f}GB, using largest ({recommended_profile}GB)" - else: - warning = None + # If single profile found, use it + if recommended_profile is not None: + usable_capacity = recommended_profile * 0.95 + return { + "type": "vgpu", + "profile": f"{gpu_family}-{recommended_profile}Q", + "gpu_count": 1, + "profile_memory_gb": recommended_profile, + "total_memory_available": recommended_profile, + "recommendation": f"1x {gpu_family}-{recommended_profile}Q vGPU profile", + "reason": f"Workload needs {total_memory_needed:.1f}GB, selected profile: {recommended_profile}GB (usable: {usable_capacity:.1f}GB with 5% reserved for system overhead)", + "warning": None + } - usable_capacity = recommended_profile * 0.95 + # No single profile fits - use GPU passthrough + # Calculate GPUs needed based on physical GPU memory + num_gpus_needed = math.ceil(total_memory_needed / usable_physical) return { - "type": "vgpu", - "profile": f"{gpu_family}-{recommended_profile}Q", - "gpu_count": 1, - "profile_memory_gb": recommended_profile, - "total_memory_available": recommended_profile, - "recommendation": f"1x {gpu_family}-{recommended_profile}Q vGPU profile", - "reason": f"Workload needs {total_memory_needed:.1f}GB, selected profile: {recommended_profile}GB (usable: {usable_capacity:.1f}GB with 5% headroom)", - "warning": warning + "type": "passthrough", + 
"profile": None, + "gpu_count": num_gpus_needed, + "profile_memory_gb": physical_memory, + "total_memory_available": physical_memory * num_gpus_needed, + "recommendation": f"{num_gpus_needed}x {gpu_family} GPU passthrough", + "reason": f"Workload requires {total_memory_needed:.1f}GB which exceeds max vGPU profile capacity ({max_profile_usable:.1f}GB usable from {max_profile}GB profile). GPU passthrough with {num_gpus_needed} GPUs provides {physical_memory * num_gpus_needed}GB total capacity." } @@ -892,9 +925,11 @@ def calculate(self, request: VGPURequest) -> VGPUResult: config = request.advanced_config if request.advanced_config else AdvancedCalculatorConfig() # Find model + logging.info(f"Looking up model: '{request.model_name}'") model = self._find_model(request.model_name) if not model: raise ValueError(f"Model '{request.model_name}' not found. Available: {self.get_available_models()}") + logging.info(f"Found model: '{model.name}' with {model.params_billion}B params") # Get GPU family from vgpu_profile gpu = self._find_gpu(request.vgpu_profile) diff --git a/community/ai-vws-sizing-advisor/src/chains.py b/community/ai-vws-sizing-advisor/src/chains.py index 559c76837..afc2737b0 100644 --- a/community/ai-vws-sizing-advisor/src/chains.py +++ b/community/ai-vws-sizing-advisor/src/chains.py @@ -47,7 +47,7 @@ from .utils import streaming_filter_think, get_streaming_filter_think_parser from .reflection import ReflectionCounter, check_context_relevance, check_response_groundedness from .utils import normalize_relevance_scores -from .apply_configuration import model_extractor, GENERAL_FALLBACK_MODEL +from .apply_configuration import model_extractor # Import enhanced components try: @@ -62,7 +62,7 @@ "Llama-3-8B", "Llama-3-70B", "Llama-3.1-8B", "Llama-3.1-70B", "Mistral-7B", "Falcon-7B", "Falcon-40B", "Falcon-180B", "Qwen-14B" ] -VALID_PRECISIONS = ["fp16", "int8", "INT-8", "int-8", "FP16", "FP-16"] +VALID_PRECISIONS = ["fp16", "fp8", "int8", "INT-8", "int-8", "FP16", "FP-16", "FP8", "FP-8", "fp4", "FP4"] def extract_embedded_config(query: str) -> dict: """Extract structured config from HTML comment in query (from WorkloadConfigWizard).""" @@ -139,15 +139,18 @@ def parse_vgpu_query(query: str) -> dict: if user_match: result["Concurrent Users"] = int(user_match.group(1)) - # 4) Precision - prec_match = re.search(r"\b(fp16|int8|INT-8)\b", query, re.IGNORECASE) + # 4) Precision - support FP8, FP16, FP4, INT8 + prec_match = re.search(r"\b(fp16|fp8|fp4|int8|INT-8|FP-8)\b", query, re.IGNORECASE) if prec_match: - precision = prec_match.group(1).lower() - if precision in VALID_PRECISIONS: - if precision == "int-8" or precision == "INT-8": - result["Precision"] = "int8" - elif precision == "fp16" or precision == "FP16" or precision == "FP-16": - result["Precision"] = "fp16" + precision = prec_match.group(1).lower().replace("-", "") + if precision in ["fp8", "fp-8"]: + result["Precision"] = "fp8" + elif precision in ["fp16", "fp-16"]: + result["Precision"] = "fp16" + elif precision in ["fp4", "fp-4"]: + result["Precision"] = "fp4" + elif precision in ["int8", "int-8"]: + result["Precision"] = "int8" else: result["Precision"] = None @@ -167,7 +170,7 @@ def parse_vgpu_query(query: str) -> dict: # 5) Default precision if not specified if not result["Precision"]: - result["Precision"] = "fp16" + result["Precision"] = "fp8" # Default to FP8 for modern inference if not result["Model"]: result["Model"] = "Llama-3-8B" if not result["Concurrent Users"]: @@ -188,7 +191,7 @@ class StructuredResponse(BaseModel): 
description="Function title for vGPU configuration generation" ) description: str = Field( - description="Brief summary of the recommended configuration (1-2 sentences max)" + description="Brief summary including: GPU family, vGPU profile, workload type (RAG/Inference), model name, AND precision (FP8/FP16/FP4). Example: 'BSE with vGPU profile BSE-48Q for RAG (Nemotron-30B) with FP8 precision'" ) parameters: Dict[str, Any] = Field( description="vGPU configuration parameters" @@ -446,11 +449,20 @@ def stream_structured_response(): try: # Extract embedded config if present (from WorkloadConfigWizard) embedded_config = extract_embedded_config(query) + logger.info(f"[LLM_CHAIN DEBUG] Extracted embedded_config: {embedded_config}") + logger.info(f"[LLM_CHAIN DEBUG] modelTag from config: {embedded_config.get('modelTag') if embedded_config else 'NO CONFIG'}") - # Try to get model name from various sources - model_name = (corrected_params.get("model_tag") or - (embedded_config.get('modelTag') if embedded_config else None) or - (embedded_config.get('specificModel') if embedded_config else None)) + # PRIORITY: Get model from embedded_config FIRST (wizard selection) + # This ensures we use Nemotron when user selects it, not the LLM's guess + model_name = None + if embedded_config: + model_name = embedded_config.get('modelTag') or embedded_config.get('specificModel') + if model_name: + logger.info(f"Using model from embedded config in llm_chain: {model_name}") + + # Only fall back to LLM params if embedded config didn't have it + if not model_name: + model_name = corrected_params.get("model_tag") # ALWAYS call calculator if we have a model name (regardless of vgpu_profile) # The calculator will determine the correct profile/passthrough based on workload @@ -459,7 +471,7 @@ def stream_structured_response(): # Get configuration parameters batch_size = int(embedded_config.get('batchSize', 1)) if embedded_config else 1 - precision = (embedded_config.get('precision', 'fp16') if embedded_config else 'fp16').lower() + precision = (embedded_config.get('precision', 'fp8') if embedded_config else 'fp8').lower() prompt_size = int(embedded_config.get('promptSize', 1024)) if embedded_config else 1024 response_size = int(embedded_config.get('responseSize', 256)) if embedded_config else 256 @@ -478,9 +490,9 @@ def stream_structured_response(): if not gpu_model and corrected_params.get("vgpu_profile") and corrected_params["vgpu_profile"] not in [None, "null", ""]: gpu_model = corrected_params["vgpu_profile"].split('-')[0] - # Final fallback + # Final fallback (BSE is the wizard default) if not gpu_model: - gpu_model = "L40S" + gpu_model = "BSE" logger.info(f"Using GPU model: {gpu_model} for calculator") @@ -531,12 +543,125 @@ def stream_structured_response(): logger.info("Enhanced LLM response with calculator results: %s", corrected_params) except Exception as e: + import math logger.warning("Calculator enhancement failed in llm_chain: %s", e) + # Fallback: Calculate profile based on gpu_memory_size + # Use vGPU only if single profile fits, otherwise passthrough + gpu_memory_size = corrected_params.get("gpu_memory_size", 24) + if not gpu_model: + gpu_model = "BSE" # Default + available_profiles = { + 'BSE': [8, 12, 24, 48, 96], + 'L40S': [8, 12, 24, 48], + 'L40': [8, 12, 24, 48], + 'A40': [8, 12, 24, 48], + 'L4': [4, 8, 12, 24] + } + profiles = available_profiles.get(gpu_model, [8, 12, 24, 48]) + physical_memory = {'BSE': 96, 'L40S': 48, 'L40': 48, 'A40': 48, 'L4': 24}.get(gpu_model, 48) + + # Find smallest single 
profile that fits + selected_profile = None + for profile in sorted(profiles): + if profile * 0.95 >= gpu_memory_size: + selected_profile = profile + break + + if selected_profile: + corrected_params["vgpu_profile"] = f"{gpu_model}-{selected_profile}Q" + corrected_params["gpu_count"] = 1 + else: + # No single profile fits - use passthrough + corrected_params["vgpu_profile"] = None + corrected_params["gpu_count"] = math.ceil(gpu_memory_size / (physical_memory * 0.95)) + corrected_params["gpu_model"] = f"{gpu_model} (passthrough)" + logger.info(f"Fallback profile: {corrected_params['vgpu_profile']} x{corrected_params.get('gpu_count', 1)}") + + # Ensure embedded_config is available for final processing + if 'embedded_config' not in dir() or embedded_config is None: + embedded_config = extract_embedded_config(query) + + # Add rag_breakdown fallback for RAG workloads if not already present + workload_type = embedded_config.get('workloadType', 'inference') if embedded_config else 'inference' + if workload_type == 'rag' and "rag_breakdown" not in corrected_params and embedded_config: + rag_breakdown = {"workload_type": "rag"} + + embedding_model = embedded_config.get('embeddingModel') + vector_db_vectors = embedded_config.get('numberOfVectors') + vector_db_dimension = embedded_config.get('vectorDimension') + + if vector_db_vectors: + vector_db_vectors = int(vector_db_vectors) if isinstance(vector_db_vectors, str) else vector_db_vectors + if vector_db_dimension: + vector_db_dimension = int(vector_db_dimension) if isinstance(vector_db_dimension, str) else vector_db_dimension + + if embedding_model: + rag_breakdown["embedding_model"] = embedding_model + embedding_model_lower = embedding_model.lower() + if 'large' in embedding_model_lower or '1b' in embedding_model_lower: + embedding_mem = 2.0 + elif 'base' in embedding_model_lower or '110m' in embedding_model_lower: + embedding_mem = 0.5 + elif 'small' in embedding_model_lower: + embedding_mem = 0.25 + else: + embedding_mem = 1.0 + rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB" + + if vector_db_vectors and vector_db_dimension: + rag_breakdown["vector_db_vectors"] = vector_db_vectors + rag_breakdown["vector_db_dimension"] = vector_db_dimension + vector_mem_bytes = vector_db_vectors * vector_db_dimension * 4 * 1.5 + vector_mem_gb = vector_mem_bytes / (1024**3) + if vector_mem_gb < 0.1: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB" + else: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB" + + corrected_params["rag_breakdown"] = rag_breakdown + logger.info("Added rag_breakdown fallback in llm_chain: %s", rag_breakdown) + + # CRITICAL: Use modelTag from embedded_config for the final response + final_model_tag = None + if embedded_config and embedded_config.get('modelTag'): + final_model_tag = embedded_config.get('modelTag') + logger.info(f"Using modelTag from embedded config for llm_chain final: {final_model_tag}") + if not final_model_tag: + # FALLBACK: Extract model from query text (e.g. 
"running nvidia/model-name") + import re + query_model_match = re.search(r'running\s+([\w\-/\.]+/[\w\-\.]+)', query, re.IGNORECASE) + if query_model_match: + final_model_tag = query_model_match.group(1) + logger.info(f"Extracted model from query text for llm_chain: {final_model_tag}") + else: + final_model_tag = corrected_params.get("model_tag") or "Unknown" + + # Update corrected_params to ensure JSON has the correct model_tag + corrected_params["model_tag"] = final_model_tag + + # Add precision from embedded config (default to FP8 - wizard default) + if embedded_config and embedded_config.get('precision'): + corrected_params["precision"] = embedded_config.get('precision').upper() + else: + corrected_params["precision"] = "FP8" # Default if not specified (matches wizard default) + + # Get GPU model from embedded config or profile (default to BSE - wizard default) + final_gpu_model = "BSE" + if embedded_config and embedded_config.get('selectedGPU'): + final_gpu_model = embedded_config.get('selectedGPU') + elif corrected_params.get("vgpu_profile"): + final_gpu_model = corrected_params["vgpu_profile"].split('-')[0] + + # Reconstruct description with correct model name and precision + final_profile = corrected_params.get("vgpu_profile", "Unknown") + final_precision = corrected_params.get("precision", "FP8") + final_model_name = final_model_tag.split('/')[-1] if '/' in final_model_tag else final_model_tag + corrected_description = f"{final_gpu_model} with vGPU profile {final_profile} for inference of {final_model_name} ({final_precision})" # Build the final response with corrected field names final_response = { "title": json_data.get("title", "generate_vgpu_config"), - "description": json_data.get("description", ""), + "description": corrected_description, "parameters": corrected_params } @@ -723,7 +848,55 @@ def rag_chain( # pylint: disable=arguments-differ # Log for debugging logger.info(f"Final structured response after reflection: {structured_final.description[:200]}...") - return iter([json.dumps(structured_final.model_dump(), ensure_ascii=False, indent=2)]), context_to_show + # Enhance response with rag_breakdown for RAG workloads + final_response = structured_final.model_dump() + embedded_config = extract_embedded_config(query) + workload_type = embedded_config.get('workloadType', 'inference') if embedded_config else 'inference' + + if workload_type == 'rag' and embedded_config: + params = final_response.get("parameters", {}) + if "rag_breakdown" not in params: + rag_breakdown = {"workload_type": "rag"} + + # Extract RAG config from embedded config + embedding_model = embedded_config.get('embeddingModel') + vector_db_vectors = embedded_config.get('numberOfVectors') + vector_db_dimension = embedded_config.get('vectorDimension') + + if vector_db_vectors: + vector_db_vectors = int(vector_db_vectors) if isinstance(vector_db_vectors, str) else vector_db_vectors + if vector_db_dimension: + vector_db_dimension = int(vector_db_dimension) if isinstance(vector_db_dimension, str) else vector_db_dimension + + if embedding_model: + rag_breakdown["embedding_model"] = embedding_model + # Calculate embedding memory based on model size + embedding_model_lower = embedding_model.lower() + if 'large' in embedding_model_lower or '1b' in embedding_model_lower: + embedding_mem = 2.0 + elif 'base' in embedding_model_lower or '110m' in embedding_model_lower: + embedding_mem = 0.5 + elif 'small' in embedding_model_lower: + embedding_mem = 0.25 + else: + embedding_mem = 1.0 + rag_breakdown["embedding_memory"] = 
f"{embedding_mem:.2f} GB" + + if vector_db_vectors and vector_db_dimension: + rag_breakdown["vector_db_vectors"] = vector_db_vectors + rag_breakdown["vector_db_dimension"] = vector_db_dimension + vector_mem_bytes = vector_db_vectors * vector_db_dimension * 4 * 1.5 + vector_mem_gb = vector_mem_bytes / (1024**3) + if vector_mem_gb < 0.1: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB" + else: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB" + + params["rag_breakdown"] = rag_breakdown + final_response["parameters"] = params + logger.info("Added rag_breakdown to reflection response: %s", rag_breakdown) + + return iter([json.dumps(final_response, ensure_ascii=False, indent=2)]), context_to_show else: def stream_structured_rag_response(): try: @@ -740,10 +913,23 @@ def stream_structured_rag_response(): # Extract GPU info and model info from wherever the LLM put it vgpu_profile = params.get("vgpu_profile") or "" - model_name = params.get("model_name") or params.get("model") - # Extract embedded config to get the actual GPU model selected by user + # Extract embedded config to get the actual GPU model and LLM model selected by user embedded_config = extract_embedded_config(query) + logger.info(f"[RAG DEBUG] Extracted embedded_config: {embedded_config}") + logger.info(f"[RAG DEBUG] modelTag from config: {embedded_config.get('modelTag') if embedded_config else 'NO CONFIG'}") + + # PRIORITY: Get model from embedded config (wizard selection) FIRST + # This ensures we use Nemotron when user selects it, not fallback to LLM's guess + model_name = None + if embedded_config: + model_name = embedded_config.get('modelTag') or embedded_config.get('specificModel') + if model_name: + logger.info(f"Using model from embedded config: {model_name}") + + # Fallback to LLM params only if embedded config didn't have it + if not model_name: + model_name = params.get("model_name") or params.get("model") # Extract GPU model from embedded config first (most reliable) gpu_model = None @@ -762,38 +948,73 @@ def stream_structured_rag_response(): if not gpu_model and vgpu_profile and vgpu_profile not in [None, "null", ""]: gpu_model = vgpu_profile.split('-')[0] - # Final fallback + # Final fallback (BSE is the wizard default) if not gpu_model: - gpu_model = "L40S" + gpu_model = "BSE" logger.info(f"Using GPU model: {gpu_model} for RAG chain") + + # Initialize workload with default value + workload = "RAG" # Default to RAG for RAG chain queries + prompt_size = None + response_size = None # Try to extract from description if not in parameters if not model_name: payload = parse_vgpu_query(query) model_name = model_name or payload.get("Model") - precision = payload.get("Precision", "fp16").lower() - workload = payload.get("Workload") or payload.get("workload") + precision = payload.get("Precision", "fp8").lower() + workload = payload.get("Workload") or payload.get("workload") or "RAG" prompt_size = payload.get("Prompt Size") response_size = payload.get("Response Size") - logger.info("Extracted model name: %s, precision: %s, workload: %s, prompt size: %s, response size: %s", model_name, precision, workload, prompt_size, response_size) + else: + # Even if model_name exists, try to extract workload from query + payload = parse_vgpu_query(query) + workload = payload.get("Workload") or payload.get("workload") or "RAG" + prompt_size = payload.get("Prompt Size") + response_size = payload.get("Response Size") # Build properly structured parameters with correct field names + # PRIORITY: Use the 
modelTag from embedded_config directly if available + # This is the AUTHORITATIVE source - user selected this in the wizard model_tag = None - if model_name: - # Check if model_name is already a HuggingFace model tag (contains "/") - if "/" in model_name: - # Use the full HF model tag directly - model_tag = model_name - logger.info(f"Using HuggingFace model tag directly: {model_tag}") - else: - # Use the dynamic model extractor for simplified names - model_tag = model_extractor.extract(model_name) - # If no match found, use general fallback model - if not model_tag: - logger.info(f"No exact match for model '{model_name}', using fallback: {GENERAL_FALLBACK_MODEL}") - model_tag = GENERAL_FALLBACK_MODEL - + if embedded_config and embedded_config.get('modelTag'): + # Embedded config has the full HuggingFace model tag from wizard - USE THIS + model_tag = embedded_config.get('modelTag') + logger.info(f"Using modelTag from embedded config (authoritative): {model_tag}") + else: + # FALLBACK: Extract model from query text (e.g. "running nvidia/model-name") + # This handles cases where embedded config isn't sent + import re + query_model_match = re.search(r'running\s+([\w\-/\.]+/[\w\-\.]+)', query, re.IGNORECASE) + if query_model_match: + model_tag = query_model_match.group(1) + logger.info(f"Extracted model from query text: {model_tag}") + elif model_name: + # Fallback to model_name extraction only if no embedded config + if "/" in model_name: + model_tag = model_name + logger.info(f"Using HuggingFace model tag directly: {model_tag}") + else: + model_tag = model_extractor.extract(model_name) + if not model_tag: + # No fallback to hardcoded model - use what was provided + logger.warning(f"No match for model '{model_name}', keeping as-is") + model_tag = model_name # Use the provided name, don't substitute + + # CRITICAL: ALWAYS update model_name with extracted model_tag for VGPURequest + # The model_tag from query/embedded_config is authoritative over params defaults + if model_tag: + model_name = model_tag + logger.info(f"Using model_tag for calculator: {model_name}") + + # Get precision from embedded config (default to fp8 which is the wizard default) + precision_from_config = (embedded_config.get('precision', 'fp8') if embedded_config else precision or 'fp8').lower() + + # Get prompt/response sizes from embedded config first + prompt_size_from_config = int(embedded_config.get('promptSize', 1024)) if embedded_config else (prompt_size or 1024) + response_size_from_config = int(embedded_config.get('responseSize', 256)) if embedded_config else (response_size or 256) + corrected_params = { "vgpu_profile": params.get("vgpu_profile"), "vcpu_count": ((params.get("system_RAM") or 96) // 4), @@ -804,6 +1025,10 @@ def stream_structured_rag_response(): "time_to_first_token": None, "throughput": None, "model_tag": model_tag, + # Add precision and prompt/response sizes + "precision": precision_from_config.upper(), + "prompt_size": prompt_size_from_config, + "response_size": response_size_from_config, } @@ -839,10 +1064,17 @@ def stream_structured_rag_response(): logger.info(f"Using batch size (concurrent requests): {batch_size}") # Extract RAG-specific parameters if workload type is RAG - # First try embedded config, then fall back to extracted workload from LLM response + # First try embedded config, then detect from query text, then fall back to LLM response workload_type = embedded_config.get('workloadType', 'inference') if embedded_config else 'inference' + + # ROBUST RAG DETECTION: Check query text 
directly for RAG indicators + is_rag_query = ('RAG' in query or 'Retrieval-Augmented' in query or + 'embedding model' in query.lower() or 'vector' in query.lower()) + if is_rag_query: + workload_type = 'rag' + logger.info(f"Workload type set to 'rag' based on query text analysis") # If embedded config says inference but LLM extracted "RAG", use that instead - if workload_type.lower() == 'inference' and workload and 'rag' in workload.lower(): + elif workload_type.lower() == 'inference' and workload and 'rag' in workload.lower(): workload_type = 'rag' logger.info(f"Workload type set to 'rag' based on LLM extraction: {workload}") else: @@ -855,8 +1087,13 @@ def stream_structured_rag_response(): # Try to get from embedded config first if embedded_config: embedding_model = embedded_config.get('embeddingModel') - vector_db_vectors = embedded_config.get('numberOfVectors') - vector_db_dimension = embedded_config.get('vectorDimension') + # Convert to integers if present (they come as strings from JSON) + num_vectors = embedded_config.get('numberOfVectors') + vec_dim = embedded_config.get('vectorDimension') + if num_vectors: + vector_db_vectors = int(num_vectors) if isinstance(num_vectors, str) else num_vectors + if vec_dim: + vector_db_dimension = int(vec_dim) if isinstance(vec_dim, str) else vec_dim # If not in embedded config, try to extract from query text if not embedding_model: @@ -967,12 +1204,136 @@ def stream_structured_rag_response(): logger.info("Enhanced with calculator results: %s", corrected_params) except Exception as e: + import traceback logger.warning("Calculator enhancement failed: %s", e) + logger.warning("Traceback: %s", traceback.format_exc()) + # Fallback: Calculate profile based on gpu_memory_size even if calculator fails + gpu_memory_size = corrected_params.get("gpu_memory_size", 24) + # Profile selection: Pick smallest profile where profile × 0.95 >= workload + # If no single profile fits, use passthrough + available_profiles = { + 'BSE': [8, 12, 24, 48, 96], + 'L40S': [8, 12, 24, 48], + 'L40': [8, 12, 24, 48], + 'A40': [8, 12, 24, 48], + 'L4': [4, 8, 12, 24] + } + profiles = available_profiles.get(gpu_model, [8, 12, 24, 48]) + physical_memory = {'BSE': 96, 'L40S': 48, 'L40': 48, 'A40': 48, 'L4': 24}.get(gpu_model, 48) + + # Find smallest single profile where profile × 0.95 >= workload + selected_profile = None + for profile in sorted(profiles): + if profile * 0.95 >= gpu_memory_size: + selected_profile = profile + break + + if selected_profile: + # Single vGPU profile fits + corrected_params["vgpu_profile"] = f"{gpu_model}-{selected_profile}Q" + corrected_params["gpu_count"] = 1 + logger.info(f"Fallback: Using {corrected_params['vgpu_profile']}") + else: + # No single profile fits - use passthrough + corrected_params["vgpu_profile"] = None + corrected_params["gpu_count"] = math.ceil(gpu_memory_size / (physical_memory * 0.95)) + corrected_params["gpu_model"] = f"{gpu_model} (passthrough)" + logger.info(f"Fallback: Using passthrough with {corrected_params['gpu_count']}x {gpu_model}") + + # Add RAG-specific fields to the response if this is a RAG workload + if workload_type == 'rag': + # Add top-level RAG fields + if embedding_model: + corrected_params["embedding_model"] = embedding_model + if vector_db_vectors: + corrected_params["vector_db_vectors"] = vector_db_vectors + if vector_db_dimension: + corrected_params["vector_db_dimension"] = vector_db_dimension + + # Build rag_breakdown if not already present (from calculator) + if "rag_breakdown" not in corrected_params: + 
rag_breakdown = {"workload_type": "rag"} + if embedding_model: + rag_breakdown["embedding_model"] = embedding_model + # Calculate embedding memory based on model size (approximate) + # Common embedding models and their approximate sizes: + embedding_model_lower = embedding_model.lower() + if 'large' in embedding_model_lower or '1b' in embedding_model_lower: + embedding_mem = 2.0 # ~1B params at FP16 + elif 'base' in embedding_model_lower or '110m' in embedding_model_lower: + embedding_mem = 0.5 # ~110M params at FP16 + elif 'small' in embedding_model_lower: + embedding_mem = 0.25 # ~33M params at FP16 + else: + embedding_mem = 1.0 # Default estimate + rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB" + + if vector_db_vectors and vector_db_dimension: + rag_breakdown["vector_db_vectors"] = vector_db_vectors + rag_breakdown["vector_db_dimension"] = vector_db_dimension + # Calculate vector DB memory: vectors * dimension * 4 bytes (float32) + 50% overhead for index + vector_mem_bytes = vector_db_vectors * vector_db_dimension * 4 * 1.5 + vector_mem_gb = vector_mem_bytes / (1024**3) + if vector_mem_gb < 0.1: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB" + else: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB" + elif vector_db_vectors: + rag_breakdown["vector_db_vectors"] = vector_db_vectors + elif vector_db_dimension: + rag_breakdown["vector_db_dimension"] = vector_db_dimension + + # Add prompt/response size info + rag_breakdown["prompt_size"] = prompt_size_from_config + rag_breakdown["response_size"] = response_size_from_config + corrected_params["rag_breakdown"] = rag_breakdown + logger.info("Built RAG breakdown manually: %s", rag_breakdown) + else: + logger.info("Using rag_breakdown from calculator: %s", corrected_params["rag_breakdown"]) + + # Reconstruct description with correct format: GPU family, profile, workload, model, precision + final_profile = corrected_params.get("vgpu_profile", f"{gpu_model}-12Q") + final_precision = corrected_params.get("precision", precision_from_config.upper()) + + # CRITICAL: Get model_tag from embedded_config first (most reliable source) + # This ensures the JSON model_tag matches what the user selected in the wizard + final_model_tag = None + if embedded_config and embedded_config.get('modelTag'): + final_model_tag = embedded_config.get('modelTag') + logger.info(f"Using modelTag from embedded config for final response: {final_model_tag}") + if not final_model_tag: + final_model_tag = corrected_params.get("model_tag") or model_tag or "Unknown" + + # Update corrected_params to ensure JSON has the correct model_tag + corrected_params["model_tag"] = final_model_tag + + if workload_type == 'rag': + # Format: "L40S with vGPU profile L40S-48Q for RAG (model-name) with embedding-model (FP8)" + emb_model_name = embedding_model.split('/')[-1] if embedding_model else "embedding" + final_model_name = final_model_tag.split('/')[-1] if '/' in final_model_tag else final_model_tag + corrected_description = f"{gpu_model} with vGPU profile {final_profile} for RAG ({final_model_name}) with {emb_model_name} ({final_precision})" + + # Add rag_config sub-object with RAG-specific configuration + rag_config = { + "workload_type": "rag", + "embedding_model": embedding_model, + "vector_dimension": vector_db_dimension, + "total_vectors": vector_db_vectors, + } + # Remove None values + rag_config = {k: v for k, v in rag_config.items() if v is not None} + if rag_config: + corrected_params["rag_config"] = rag_config + logger.info(f"Added 
rag_config to response: {rag_config}") + else: + # Format: "L40S with vGPU profile L40S-48Q for inference of model-name (FP8)" + final_model_name = final_model_tag.split('/')[-1] if '/' in final_model_tag else final_model_tag + corrected_description = f"{gpu_model} with vGPU profile {final_profile} for inference of {final_model_name} ({final_precision})" # Build the final response with corrected field names final_response = { "title": json_data.get("title", "generate_vgpu_config"), - "description": json_data.get("description", ""), + "description": corrected_description, "parameters": corrected_params } @@ -1033,6 +1394,12 @@ def rag_chain_with_multiturn(self, **kwargs) -> Generator[str, None, None]: """Execute a Retrieval Augmented Generation chain using the components defined above.""" + # Check for conversational mode - return plain text instead of structured JSON + conversational_mode = kwargs.get("conversational_mode", False) + if conversational_mode: + logger.info("Using CONVERSATIONAL mode for chat query: %s", query[:100]) + return self._conversational_chain(query, chat_history, reranker_top_k, vdb_top_k, collection_name, **kwargs) + # Determine if enhanced mode should be used use_enhanced = self._should_use_enhanced_mode(query) logger.info("Using %s multiturn RAG mode for query: %s", "enhanced" if use_enhanced else "standard", query) @@ -1214,10 +1581,12 @@ def stream_structured_multiturn_response(): # Extract GPU info and model info from wherever the LLM put it vgpu_profile = params.get("vgpu_profile") or "" model_name = params.get("model_name") or params.get("model") - precision = params.get("precision", "fp16").lower() if params.get("precision") else "fp16" + precision = params.get("precision", "fp8").lower() if params.get("precision") else "fp8" # Extract embedded config to get the actual GPU model selected by user embedded_config = extract_embedded_config(query) + logger.info(f"[MULTITURN DEBUG] Extracted embedded_config: {embedded_config}") + logger.info(f"[MULTITURN DEBUG] modelTag from config: {embedded_config.get('modelTag') if embedded_config else 'NO CONFIG'}") # Extract GPU model from embedded config first (most reliable) gpu_model = None @@ -1236,18 +1605,60 @@ def stream_structured_multiturn_response(): if not gpu_model and vgpu_profile and vgpu_profile not in [None, "null", ""]: gpu_model = vgpu_profile.split('-')[0] - # Final fallback + # Final fallback (BSE is the wizard default) if not gpu_model: - gpu_model = "L40S" + gpu_model = "BSE" logger.info(f"Using GPU model: {gpu_model} for multiturn RAG chain") + # Initialize workload with default value + workload = "RAG" # Default to RAG for multiturn queries + # Try to extract from description if not in parameters if not model_name: payload = parse_vgpu_query(json_data.get("description", "")) model_name = model_name or payload.get("Model") - precision = precision or payload.get("Precision", "fp16").lower() + precision = precision or payload.get("Precision", "fp8").lower() workload = payload.get("Workload", "RAG") + else: + # Even if model_name exists, try to extract workload from description + payload = parse_vgpu_query(json_data.get("description", "")) + workload = payload.get("Workload", "RAG") + + # PRIORITY: Use modelTag from embedded_config directly if available + # This is the AUTHORITATIVE source - user selected this in the wizard + model_tag = None + if embedded_config and embedded_config.get('modelTag'): + model_tag = embedded_config.get('modelTag') + logger.info(f"Using modelTag from embedded config for 
multiturn (authoritative): {model_tag}") + else: + # FALLBACK: Extract model from query text (e.g. "running nvidia/model-name") + # This handles cases where embedded config isn't sent + import re + query_model_match = re.search(r'running\s+([\w\-/\.]+/[\w\-\.]+)', query, re.IGNORECASE) + if query_model_match: + model_tag = query_model_match.group(1) + logger.info(f"Extracted model from query text: {model_tag}") + elif model_name: + # Fallback to model_name extraction only if no embedded config + if "/" in model_name: + model_tag = model_name + logger.info(f"Using HuggingFace model tag directly: {model_tag}") + else: + model_tag = model_extractor.extract(model_name) + if not model_tag: + # No fallback to hardcoded model - use what was provided + logger.warning(f"No match for model '{model_name}', keeping as-is") + model_tag = model_name # Use the provided name, don't substitute + + # CRITICAL: ALWAYS update model_name with extracted model_tag for VGPURequest + # The model_tag from query/embedded_config is authoritative over params defaults + if model_tag: + model_name = model_tag + logger.info(f"Using model_tag for calculator: {model_name}") + + # Get precision from embedded config (default to fp8 - wizard default) + precision_from_config = (embedded_config.get('precision', 'fp8') if embedded_config else precision or 'fp8').lower() # Build properly structured parameters with correct field names corrected_params = { @@ -1259,7 +1670,8 @@ def stream_structured_multiturn_response(): "e2e_latency": None, "time_to_first_token": None, "throughput": None, - "model_tag": model_tag + "model_tag": model_tag, + "precision": precision_from_config.upper() } # If we have model info and it's a workload we can calculate, enhance with calculator @@ -1280,6 +1692,9 @@ def stream_structured_multiturn_response(): # Use vgpu_profile from calculator (not LLM) for accurate profile selection corrected_params["vgpu_profile"] = calculation.resultant_configuration.vgpu_profile corrected_params["max_kv_tokens"] = calculation.resultant_configuration.max_kv_tokens + # Use calculator's total_memory_gb directly - it already includes all components + # This replaces the LLM's estimate with the actual calculated value + corrected_params["gpu_memory_size"] = calculation.resultant_configuration.total_memory_gb # Add GPU model name (especially useful for passthrough configurations) corrected_params["gpu_model"] = calculation.resultant_configuration.gpu_name # Add GPU count (especially useful for passthrough configurations) @@ -1292,12 +1707,111 @@ def stream_structured_multiturn_response(): logger.info("Enhanced multiturn with calculator results: %s", corrected_params) except Exception as e: + import math logger.warning("Calculator enhancement failed in multiturn: %s", e) + # Fallback: Calculate profile based on gpu_memory_size + # Use vGPU only if single profile fits, otherwise passthrough + gpu_memory_size = corrected_params.get("gpu_memory_size", 24) + available_profiles = { + 'BSE': [8, 12, 24, 48, 96], + 'L40S': [8, 12, 24, 48], + 'L40': [8, 12, 24, 48], + 'A40': [8, 12, 24, 48], + 'L4': [4, 8, 12, 24] + } + profiles = available_profiles.get(gpu_model, [8, 12, 24, 48]) + physical_memory = {'BSE': 96, 'L40S': 48, 'L40': 48, 'A40': 48, 'L4': 24}.get(gpu_model, 48) + + # Find smallest single profile that fits + selected_profile = None + for profile in sorted(profiles): + if profile * 0.95 >= gpu_memory_size: + selected_profile = profile + break + + if selected_profile: + corrected_params["vgpu_profile"] = 
f"{gpu_model}-{selected_profile}Q" + corrected_params["gpu_count"] = 1 + else: + # No single profile fits - use passthrough + corrected_params["vgpu_profile"] = None + corrected_params["gpu_count"] = math.ceil(gpu_memory_size / (physical_memory * 0.95)) + corrected_params["gpu_model"] = f"{gpu_model} (passthrough)" + logger.info(f"Fallback profile: {corrected_params['vgpu_profile']} x{corrected_params.get('gpu_count', 1)}") + + # ========== Extract RAG Configuration from Query ========== + # Detect if this is a RAG workload from query text + is_rag_workload = 'RAG' in query or 'Retrieval-Augmented' in query or 'embedding model' in query.lower() + + if is_rag_workload: + import re + rag_config = {} + rag_breakdown = {"workload_type": "rag"} + + # Extract embedding model (e.g., "using embedding model nvidia/nvolveqa-embed-large-1B") + embedding_match = re.search(r'embedding model\s+([\w\-/\.]+)', query, re.IGNORECASE) + if embedding_match: + embedding_model = embedding_match.group(1) + rag_config["embedding_model"] = embedding_model + rag_breakdown["embedding_model"] = embedding_model + # Estimate embedding memory based on model name + embedding_model_lower = embedding_model.lower() + if 'large' in embedding_model_lower or '1b' in embedding_model_lower: + embedding_mem = 2.0 + elif 'base' in embedding_model_lower or '400m' in embedding_model_lower: + embedding_mem = 0.8 + elif 'small' in embedding_model_lower or '200m' in embedding_model_lower: + embedding_mem = 0.4 + else: + embedding_mem = 1.0 + rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB" + + # Extract vector dimension (e.g., "1024d vectors") + dimension_match = re.search(r'(\d+)d\s*vectors', query, re.IGNORECASE) + if dimension_match: + vector_dimension = int(dimension_match.group(1)) + rag_config["vector_dimension"] = vector_dimension + rag_breakdown["vector_db_dimension"] = vector_dimension + + # Extract total vectors (e.g., "10000 total vectors") + vectors_match = re.search(r'(\d+)\s*total\s*vectors', query, re.IGNORECASE) + if vectors_match: + total_vectors = int(vectors_match.group(1)) + rag_config["total_vectors"] = total_vectors + rag_breakdown["vector_db_vectors"] = total_vectors + + # Calculate vector DB memory if we have both dimension and count + if rag_breakdown.get("vector_db_vectors") and rag_breakdown.get("vector_db_dimension"): + vector_mem_bytes = rag_breakdown["vector_db_vectors"] * rag_breakdown["vector_db_dimension"] * 4 * 1.5 + vector_mem_gb = vector_mem_bytes / (1024**3) + if vector_mem_gb < 0.1: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB" + else: + rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB" + + # Add RAG config and breakdown to params + if rag_config: + corrected_params["rag_config"] = rag_config + if any(k != "workload_type" for k in rag_breakdown.keys()): + corrected_params["rag_breakdown"] = rag_breakdown + logger.info(f"Added RAG breakdown to multiturn response: {rag_breakdown}") + + # Reconstruct description with correct model name and precision + final_profile = corrected_params.get("vgpu_profile", "Unknown") + final_precision = corrected_params.get("precision", "FP8") + final_model_name = model_tag.split('/')[-1] if model_tag and '/' in model_tag else (model_tag or "Unknown") + + # Use different description format for RAG vs Inference + if is_rag_workload and corrected_params.get("rag_config", {}).get("embedding_model"): + embedding_short = corrected_params["rag_config"]["embedding_model"].split('/')[-1] + corrected_description = f"{gpu_model} 
with vGPU profile {final_profile} for RAG (Retrieval-Augmented Generation) with {final_model_name} and {embedding_short}" + else: + corrected_description = f"{gpu_model} with vGPU profile {final_profile} for inference of {final_model_name} ({final_precision})" # Build the final response with corrected field names final_response = { "title": json_data.get("title", "generate_vgpu_config"), - "description": json_data.get("description", ""), + "description": corrected_description, "parameters": corrected_params } @@ -1351,6 +1865,137 @@ def stream_structured_multiturn_response(): return iter([json.dumps(error_response.model_dump(), ensure_ascii=False, indent=2)]), [] + def _conversational_chain(self, + query: str, + chat_history: List[Dict[str, Any]], + reranker_top_k: int, + vdb_top_k: int, + collection_name: str, + **kwargs) -> tuple: + """ + Execute a conversational RAG chain that returns plain text responses. + Used for the chat panel where users ask follow-up questions about their config. + """ + try: + document_embedder = get_embedding_model(model=kwargs.get("embedding_model"), url=kwargs.get("embedding_endpoint")) + vs = get_vectorstore(document_embedder, collection_name, kwargs.get("vdb_endpoint")) + if vs is None: + raise APIError("Vector store not initialized properly.", 500) + + llm = get_llm(**kwargs) + ranker = get_ranking_model(model=kwargs.get("reranker_model"), url=kwargs.get("reranker_endpoint"), top_n=reranker_top_k) + top_k = vdb_top_k if ranker and kwargs.get("enable_reranker") else reranker_top_k + retriever = vs.as_retriever(search_kwargs={"k": top_k}) + + # Build conversation history for the prompt + conversation_history = [] + user_provided_context = "" + history_count = int(os.environ.get("CONVERSATION_HISTORY", 15)) * 2 * -1 + chat_history = chat_history[history_count:] + + for message in chat_history: + if message.role == "system": + # Capture the system message context from frontend (contains vGPU config details) + user_provided_context = message.content + logger.info(f"[CONVERSATIONAL] Found system context: {user_provided_context[:200]}...") + else: + conversation_history.append((message.role, message.content)) + + # Build system prompt - include user's configuration context if provided + base_prompt = """You are a helpful AI assistant with expertise in NVIDIA GPUs, vGPU technology, LLMs, and AI infrastructure. + +Answer the user's question directly and conversationally. Use the retrieved documents AND the configuration context to support your answers. 
@@ -1351,6 +1865,137 @@ def stream_structured_multiturn_response():
             return iter([json.dumps(error_response.model_dump(), ensure_ascii=False, indent=2)]), []

+    def _conversational_chain(self,
+                              query: str,
+                              chat_history: List[Dict[str, Any]],
+                              reranker_top_k: int,
+                              vdb_top_k: int,
+                              collection_name: str,
+                              **kwargs) -> tuple:
+        """
+        Execute a conversational RAG chain that returns plain text responses.
+        Used for the chat panel where users ask follow-up questions about their config.
+        """
+        try:
+            document_embedder = get_embedding_model(model=kwargs.get("embedding_model"), url=kwargs.get("embedding_endpoint"))
+            vs = get_vectorstore(document_embedder, collection_name, kwargs.get("vdb_endpoint"))
+            if vs is None:
+                raise APIError("Vector store not initialized properly.", 500)
+
+            llm = get_llm(**kwargs)
+            ranker = get_ranking_model(model=kwargs.get("reranker_model"), url=kwargs.get("reranker_endpoint"), top_n=reranker_top_k)
+            top_k = vdb_top_k if ranker and kwargs.get("enable_reranker") else reranker_top_k
+            retriever = vs.as_retriever(search_kwargs={"k": top_k})
+
+            # Build conversation history for the prompt
+            conversation_history = []
+            user_provided_context = ""
+            history_count = int(os.environ.get("CONVERSATION_HISTORY", 15)) * 2 * -1
+            chat_history = chat_history[history_count:]
+
+            for message in chat_history:
+                if message.role == "system":
+                    # Capture the system message context from frontend (contains vGPU config details)
+                    user_provided_context = message.content
+                    logger.info(f"[CONVERSATIONAL] Found system context: {user_provided_context[:200]}...")
+                else:
+                    conversation_history.append((message.role, message.content))
+
+            # Build system prompt - include user's configuration context if provided
+            base_prompt = """You are a helpful AI assistant with expertise in NVIDIA GPUs, vGPU technology, LLMs, and AI infrastructure.
+
+Answer the user's question directly and conversationally. Use the retrieved documents AND the configuration context to support your answers.
+
+Guidelines:
+- Be concise but thorough
+- Use plain text only, no JSON or structured output
+- If asked about model parameters, GPU profiles, or vGPU configurations, explain clearly
+- For technical questions, provide specific details when available
+- Reference the user's specific configuration when answering
+- If you don't know something, say so honestly"""
+
+            # Add user's configuration context if provided
+            if user_provided_context:
+                system_prompt = f"""{base_prompt}
+
+=== USER'S CURRENT VGPU CONFIGURATION ===
+{user_provided_context}
+
+=== ADDITIONAL CONTEXT FROM KNOWLEDGE BASE ===
+{{context}}"""
+            else:
+                system_prompt = f"""{base_prompt}
+
+Context from knowledge base:
+{{context}}"""
+
+            logger.info(f"[CONVERSATIONAL] System prompt length: {len(system_prompt)}")
+
+            # Retrieve relevant documents
+            retriever_query = query
+            if kwargs.get("enable_query_rewriting") and conversation_history:
+                contextualize_q_system_prompt = (
+                    "Given a chat history and the latest user question "
+                    "which might reference context in the chat history, "
+                    "formulate a standalone question which can be understood "
+                    "without the chat history. Do NOT answer the question, "
+                    "just reformulate it if needed and otherwise return it as is."
+                )
+                q_prompt = ChatPromptTemplate.from_messages([
+                    ("system", contextualize_q_system_prompt),
+                    MessagesPlaceholder("chat_history"),
+                    ("human", "{input}"),
+                ])
+                query_rewriter_llm = get_llm(
+                    model=settings.query_rewriter.model_name,
+                    llm_endpoint=settings.query_rewriter.server_url,
+                    **query_rewriter_llm_config
+                )
+                # Create chain: prompt -> LLM -> string output
+                query_rewriter_chain = q_prompt | query_rewriter_llm | StrOutputParser()
+                retriever_query = query_rewriter_chain.invoke(
+                    {"input": query, "chat_history": conversation_history},
+                    config={'run_name': 'query-rewriter'}
+                )
+                logger.info(f"Conversational query rewritten to: {retriever_query}")
+
+            # Get documents
+            docs_raw = retriever.invoke(retriever_query)
+            if ranker and kwargs.get("enable_reranker"):
+                docs_raw = ranker.invoke({"query": retriever_query, "documents": docs_raw})
+
+            docs = [format_document_with_source(d) for d in docs_raw[:reranker_top_k]]
+            context_str = "\n\n".join(docs) if docs else "No relevant documents found."
+
+            # Build the prompt
+            messages = [("system", system_prompt)]
+            messages.extend(conversation_history)
+            messages.append(("user", query))
+
+            prompt = ChatPromptTemplate.from_messages(messages)
+            chain = prompt | llm | StrOutputParser()
+
+            def stream_conversational_response():
+                """Yield plain text chunks - server.py handles SSE formatting."""
+                try:
+                    for chunk in chain.stream({"context": context_str}):
+                        # Just yield the raw text - server.py will format as SSE
+                        yield chunk
+                except Exception as e:
+                    logger.error(f"Error in conversational stream: {e}")
+                    yield f"I apologize, but I encountered an error: {str(e)}"
+
+            # Return generator and context for citations
+            context_to_show = docs_raw[:reranker_top_k] if docs_raw else []
+            return stream_conversational_response(), context_to_show
+
+        except Exception as e:
+            logger.error(f"Error in conversational chain: {e}")
+            def error_stream():
+                yield "I'm sorry, I encountered an error processing your question. Please try again."
+            return error_stream(), []
+
+
     def document_search(self, content: str, messages: List, reranker_top_k: int, vdb_top_k: int, collection_name: str = "", **kwargs) -> List[Dict[str, Any]]:
         """Search for the most relevant documents for the given search parameters.
         It's called when the `/search` API is invoked.
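`_conversational_chain` above trims the transcript to the last `CONVERSATION_HISTORY` user/assistant pairs and lifts any frontend-supplied system message out of the history before building the prompt. A standalone sketch of that windowing step, assuming a simplified message type (`Msg` is a stand-in for the service's own message objects, not a class from the repository):

```python
import os
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Msg:  # hypothetical stand-in for the service's chat message object
    role: str
    content: str

def window_history(chat_history: List[Msg]) -> Tuple[List[Tuple[str, str]], str]:
    """Keep the last CONVERSATION_HISTORY user/assistant pairs; lift out system context."""
    history_count = int(os.environ.get("CONVERSATION_HISTORY", 15)) * 2 * -1
    chat_history = chat_history[history_count:]  # negative slice keeps only the tail
    conversation, system_context = [], ""
    for m in chat_history:
        if m.role == "system":
            system_context = m.content  # e.g. the embedded vGPU configuration details
        else:
            conversation.append((m.role, m.content))
    return conversation, system_context

# 20 alternating messages with CONVERSATION_HISTORY=15 -> window is 30, so all 20 are kept
msgs = [Msg("user", f"q{i}") if i % 2 == 0 else Msg("assistant", f"a{i}") for i in range(20)]
print(len(window_history(msgs)[0]))  # 20
```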
diff --git a/community/ai-vws-sizing-advisor/src/prompt.yaml b/community/ai-vws-sizing-advisor/src/prompt.yaml
index 056d23a7d..67a11daad 100644
--- a/community/ai-vws-sizing-advisor/src/prompt.yaml
+++ b/community/ai-vws-sizing-advisor/src/prompt.yaml
@@ -8,6 +8,7 @@ chat_template: |

   ### STEP 2: Pick smallest profile where (profile × 0.95) >= workload
   Reserve 5% headroom to avoid running at 100% capacity.
+  If no single profile fits, use GPU passthrough.

   **Available profiles per GPU:**
   - **L40S**: 8Q, 12Q, 24Q, 48Q
@@ -17,6 +18,7 @@ chat_template: |
   - **BSE** (RTX Pro 6000): 8Q, 12Q, 24Q, 48Q, 96Q

   **Profile selection rule: Pick smallest profile where (profile × 0.95) >= workload**
+  **If no single profile fits, use GPU passthrough (entire GPUs, no vGPU)**

   Examples (95% usable capacity):
   - Workload needs 10 GB on BSE → 12×0.95=11.4≥10 → Pick BSE-12Q ✓
@@ -29,26 +31,25 @@ chat_template: |
   - Workload needs 45 GB on BSE → 48×0.95=45.6≥45 → Pick BSE-48Q ✓
   - Workload needs 46 GB on BSE → 48×0.95=45.6<46 → Pick BSE-96Q (96×0.95=91.2≥46) ✓
   - Workload needs 90 GB on BSE → 96×0.95=91.2≥90 → Pick BSE-96Q ✓
-  - Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, recommend "1× BSE GPU passthrough" ✓
-  - Workload needs 120 GB on BSE → Exceeds single GPU → vgpu_profile=null, recommend "2× BSE GPU passthrough" ✓
+  - **Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+  - **Workload needs 96 GB on BSE → 96×0.95=91.2<96 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+  - **Workload needs 120 GB on BSE → 96×0.95=91.2<120 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
   - **Workload needs 22 GB on L4 → 24×0.95=22.8≥22 → Pick L4-24Q ✓**
-  - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
-  - **Workload needs 24 GB on L4 → 24×0.95=22.8<24 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
-  - **Workload needs 25 GB on L4 → Exceeds single L4 → vgpu_profile=null, recommend "2× L4 GPU passthrough" ✓**
+  - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, "2× L4 GPU passthrough" ✓**
+  - **Workload needs 50 GB on L40S → 48×0.95=45.6<50 → vgpu_profile=null, "2× L40S GPU passthrough" ✓**

-  **If workload exceeds max vGPU profile capacity with 5% headroom:**
-  - Max usable capacities: BSE-96Q (91.2GB), L40S-48Q (45.6GB), L40-48Q (45.6GB), A40-48Q (45.6GB), L4-24Q (22.8GB)
+  **IMPORTANT: Use vGPU profiles ONLY when workload fits in a SINGLE profile!**
+  - If workload fits in single profile: use smallest vGPU profile that fits
+  - If workload exceeds max single profile: use GPU passthrough (entire GPUs, no vGPU)
+  - Max usable: BSE-96Q=91.2GB, L40S-48Q=45.6GB, L40-48Q=45.6GB, A40-48Q=45.6GB, L4-24Q=22.8GB
+
+  **If workload > max single profile capacity → use passthrough:**
   - Set `vgpu_profile` to null
-  - In description field, recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
-  - **IMPORTANT: Even with passthrough, reserve 5% for driver/OS overhead. Use 95% of physical memory.**
-  - Calculate GPUs needed: ceil(workload / (physical_gpu × 0.95))
-  - Physical GPU capacities: BSE=96GB, L40S=48GB, L40=48GB, A40=48GB, L4=24GB
-  - Example: 92GB on BSE → ceil(92/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
-  - Example: 120GB on BSE → ceil(120/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
-  - Example: 185GB on BSE → ceil(185/91.2)=3 GPUs → "Use 3× BSE GPU passthrough" (96GB × 3 = 288GB total)
-  - Example: 24GB on L4 → ceil(24/22.8)=2 GPUs → "Use 2× L4 GPU passthrough" (24GB × 2 = 48GB total)
-  - Example: 50GB on L40S → ceil(50/45.6)=2 GPUs → "Use 2× L40S GPU passthrough" (48GB × 2 = 96GB total)
-  - Example: 144GB on L40S → ceil(144/45.6)=4 GPUs → "Use 4× L40S GPU passthrough" (48GB × 4 = 192GB total)
+  - Recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
+  - Calculate GPUs: ceil(workload / (physical_gpu × 0.95))
+  - Example: 92GB on BSE → vgpu_profile=null → "2× BSE GPU passthrough"
+  - Example: 50GB on L40S → vgpu_profile=null → "2× L40S GPU passthrough"
+  - Example: 23GB on L4 → vgpu_profile=null → "2× L4 GPU passthrough"

   ## System RAM Calculation:
   - Standard: (Model GB × 2.5) + (Concurrent Requests × 2GB) + 16GB
@@ -69,7 +70,7 @@ chat_template: |
   ```json
   {{
     "title": "generate_vgpu_config",
-    "description": "Brief 1-2 sentence summary",
+    "description": "{{GPU_MODEL}} with vGPU profile {{SELECTED_PROFILE}} for inference of {{MODEL_NAME}} ({{QUANTIZATION}})",
     "parameters": {{
       "vgpu_profile": "BSE-48Q",
       "vcpu_count": 16,
@@ -100,6 +101,20 @@ nemotron_thinking_prompt: |

   Example: If gpu_memory_size = 24GB on L40S → 24×0.95=22.8<24 → Pick L40S-48Q ✓

+chat_followup_template: |
+  You are an NVIDIA vGPU configuration specialist helping with follow-up questions about vGPU configurations.
+
+  Use the conversation history and ingested vGPU documentation to answer questions about:
+  - vGPU profile details and specifications
+  - Configuration recommendations and alternatives
+  - Performance characteristics
+  - Deployment considerations
+  - Troubleshooting and optimization
+
+  Keep responses concise and technical. Reference the provided documentation when available.
+
+  If asked about a previously recommended configuration, use the conversation history to understand context.
+
   You are an NVIDIA vGPU configuration specialist.

   ## SIMPLE 2-STEP PROCESS:
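The selection rule that `chat_template` spells out (and `rag_template` repeats below) — pick the smallest profile whose 95% usable capacity covers the workload, otherwise fall back to full-GPU passthrough — can be expressed in a few lines. A rough sketch using only the L40S and BSE tables quoted above (other GPUs omitted); the function and table names are illustrative, not part of the repository:

```python
import math

PROFILES_GB = {"L40S": [8, 12, 24, 48], "BSE": [8, 12, 24, 48, 96]}  # from the profile tables above
PHYSICAL_GB = {"L40S": 48, "BSE": 96}                                 # physical card capacities

def pick_profile(gpu: str, workload_gb: float):
    """Smallest profile whose usable 95% covers the workload; otherwise passthrough."""
    for size in PROFILES_GB[gpu]:
        if size * 0.95 >= workload_gb:
            return f"{gpu}-{size}Q", 1
    gpus = math.ceil(workload_gb / (PHYSICAL_GB[gpu] * 0.95))
    return None, gpus  # vgpu_profile=null -> "N× <GPU> GPU passthrough"

print(pick_profile("BSE", 46))   # ('BSE-96Q', 1)  because 48*0.95=45.6 < 46
print(pick_profile("BSE", 92))   # (None, 2)       because 96*0.95=91.2 < 92
print(pick_profile("L40S", 50))  # (None, 2)       because 48*0.95=45.6 < 50
```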
@@ -152,6 +167,7 @@ rag_template: |
   Add 10% safety buffer: `recommended_memory = gpu_memory_size × 1.10`

   ### STEP 2: Pick the smallest profile >= recommended memory
+  If no single profile fits, use GPU passthrough.

   **Available profiles per GPU:**
   - **L40S**: 8Q, 12Q, 24Q, 48Q
@@ -161,6 +177,7 @@ rag_template: |
   - **BSE** (RTX Pro 6000): 8Q, 12Q, 24Q, 48Q, 96Q

   **Profile selection rule: Pick smallest profile where (profile × 0.95) >= workload**
+  **If no single profile fits, use GPU passthrough (entire GPUs, no vGPU)**

   Examples (95% usable capacity):
   - Workload needs 10 GB on BSE → 12×0.95=11.4≥10 → Pick BSE-12Q ✓
@@ -173,26 +190,25 @@ rag_template: |
   - Workload needs 45 GB on BSE → 48×0.95=45.6≥45 → Pick BSE-48Q ✓
   - Workload needs 46 GB on BSE → 48×0.95=45.6<46 → Pick BSE-96Q (96×0.95=91.2≥46) ✓
   - Workload needs 90 GB on BSE → 96×0.95=91.2≥90 → Pick BSE-96Q ✓
-  - Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, recommend "1× BSE GPU passthrough" ✓
-  - Workload needs 120 GB on BSE → Exceeds single GPU → vgpu_profile=null, recommend "2× BSE GPU passthrough" ✓
+  - **Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+  - **Workload needs 96 GB on BSE → 96×0.95=91.2<96 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+  - **Workload needs 120 GB on BSE → 96×0.95=91.2<120 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
   - **Workload needs 22 GB on L4 → 24×0.95=22.8≥22 → Pick L4-24Q ✓**
-  - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
-  - **Workload needs 24 GB on L4 → 24×0.95=22.8<24 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
-  - **Workload needs 25 GB on L4 → Exceeds single L4 → vgpu_profile=null, recommend "2× L4 GPU passthrough" ✓**
+  - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, "2× L4 GPU passthrough" ✓**
+  - **Workload needs 50 GB on L40S → 48×0.95=45.6<50 → vgpu_profile=null, "2× L40S GPU passthrough" ✓**
+
+  **IMPORTANT: Use vGPU profiles ONLY when workload fits in a SINGLE profile!**
+  - If workload fits in single profile: use smallest vGPU profile that fits
+  - If workload exceeds max single profile: use GPU passthrough (entire GPUs, no vGPU)
+  - Max usable: BSE-96Q=91.2GB, L40S-48Q=45.6GB, L40-48Q=45.6GB, A40-48Q=45.6GB, L4-24Q=22.8GB

-  **If workload exceeds max vGPU profile capacity with 5% headroom:**
-  - Max usable capacities: BSE-96Q (91.2GB), L40S-48Q (45.6GB), L40-48Q (45.6GB), A40-48Q (45.6GB), L4-24Q (22.8GB)
+  **If workload > max single profile capacity → use passthrough:**
   - Set `vgpu_profile` to null
-  - In description field, recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
-  - **IMPORTANT: Even with passthrough, reserve 5% for driver/OS overhead. Use 95% of physical memory.**
-  - Calculate GPUs needed: ceil(workload / (physical_gpu × 0.95))
-  - Physical GPU capacities: BSE=96GB, L40S=48GB, L40=48GB, A40=48GB, L4=24GB
-  - Example: 92GB on BSE → ceil(92/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
-  - Example: 120GB on BSE → ceil(120/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
-  - Example: 185GB on BSE → ceil(185/91.2)=3 GPUs → "Use 3× BSE GPU passthrough" (96GB × 3 = 288GB total)
-  - Example: 24GB on L4 → ceil(24/22.8)=2 GPUs → "Use 2× L4 GPU passthrough" (24GB × 2 = 48GB total)
-  - Example: 50GB on L40S → ceil(50/45.6)=2 GPUs → "Use 2× L40S GPU passthrough" (48GB × 2 = 96GB total)
-  - Example: 144GB on L40S → ceil(144/45.6)=4 GPUs → "Use 4× L40S GPU passthrough" (48GB × 4 = 192GB total)
+  - Recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
+  - Calculate GPUs: ceil(workload / (physical_gpu × 0.95))
+  - Example: 92GB on BSE → vgpu_profile=null → "2× BSE GPU passthrough"
+  - Example: 50GB on L40S → vgpu_profile=null → "2× L40S GPU passthrough"
+  - Example: 23GB on L4 → vgpu_profile=null → "2× L4 GPU passthrough"

   ## System RAM Calculation:
   - Standard: (Model GB × 2.5) + (Concurrent Requests × 2GB) + 16GB
@@ -213,7 +229,7 @@ rag_template: |
   ```json
   {{
     "title": "generate_vgpu_config",
-    "description": "Brief 1-2 sentence summary",
+    "description": "{{GPU_MODEL}} with vGPU profile {{SELECTED_PROFILE}} for inference of {{MODEL_NAME}} ({{QUANTIZATION}})",
     "parameters": {{
       "vgpu_profile": "BSE-48Q",
       "vcpu_count": 16,
@@ -307,10 +323,11 @@ reflection_response_regeneration_prompt:
   You are an expert NVIDIA vGPU configuration specialist. Generate a grounded vGPU configuration
   description based ONLY on information explicitly found in the provided context documents.

-  Your description should be concise (1-2 sentences) and mention:
-  1. The recommended vGPU profile
-  2. Configuration feasibility
-  3. Key constraints if any
+  Your description should be ULTRA-CONCISE (single sentence, <50 words) using this format:
+  "{GPU_MODEL} with vGPU profile {SELECTED_PROFILE} for inference of {MODEL_NAME} ({QUANTIZATION})"
+
+  Example: "L40S with vGPU profile L40S-48Q for inference of Llama-3.1-8B-Instruct (FP16)"
+  Do NOT include extra details about memory margins, safety, or system RAM.

   CRITICAL RULES:
   - Use ONLY vGPU profiles that appear exactly in the context (e.g., L40S-8Q, L4-4Q)
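The System RAM rule shared by both templates — (Model GB × 2.5) + (Concurrent Requests × 2 GB) + 16 GB — is simple enough to verify by hand. A small sketch; the helper name is illustrative, not part of the repository:

```python
def system_ram_gb(model_gb: float, concurrent_requests: int) -> float:
    """Standard rule from prompt.yaml: (Model GB × 2.5) + (Concurrent Requests × 2 GB) + 16 GB."""
    return model_gb * 2.5 + concurrent_requests * 2 + 16

# A ~16 GB model weight footprint serving 4 concurrent requests -> 64 GB of system RAM
print(system_ram_gb(16, 4))  # 64.0
```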
diff --git a/community/ai-vws-sizing-advisor/src/server.py b/community/ai-vws-sizing-advisor/src/server.py
index 7af3243b2..980020e78 100644
--- a/community/ai-vws-sizing-advisor/src/server.py
+++ b/community/ai-vws-sizing-advisor/src/server.py
@@ -244,6 +244,10 @@ class Prompt(BaseModel):
         description="Enable or disable citations as part of response.",
         default=os.getenv("ENABLE_CITATIONS", "True").lower() in ["true", "True"],
     )
+    conversational_mode: bool = Field(
+        description="Enable conversational mode for plain text responses instead of structured JSON output.",
+        default=False,
+    )
     model: str = Field(
         description="Name of NIM LLM model to be used for inference.",
         default=os.getenv("APP_LLM_MODELNAME", "").strip('"'),
@@ -905,14 +909,31 @@ async def generate_answer(request: Request, prompt: Prompt) -> StreamingResponse

     # Helper function to escape JSON-like structures in content
     def escape_json_content(content: str) -> str:
-        """Escape curly braces in content to avoid JSON parsing issues"""
-        return content.replace("{", "{{").replace("}", "}}")
+        """Escape curly braces in content to avoid JSON parsing issues.
+        IMPORTANT: Preserve embedded config as-is (don't escape)."""
+        import re
+        # Extract any embedded config first
+        config_match = re.search(r'<VGPU_CONFIG>.*?</VGPU_CONFIG>', content, re.DOTALL)
+        if config_match:
+            # Preserve the embedded config, escape the rest
+            before = content[:config_match.start()]
+            config_section = config_match.group(0)  # The entire <VGPU_CONFIG>...</VGPU_CONFIG> block
+            after = content[config_match.end():]
+            escaped_before = before.replace("{", "{{").replace("}", "}}")
+            escaped_after = after.replace("{", "{{").replace("}", "}}")
+            return escaped_before + config_section + escaped_after
+        else:
+            return content.replace("{", "{{").replace("}", "}}")

     # The last user message will be the query for the rag or llm chain
     last_user_message = next((message.content for message in reversed(chat_history) if message.role == 'user'), None)

+    # DEBUG: Log raw message before escape
+    logger.info(f"[RAW MESSAGE DEBUG] Raw last_user_message (first 500 chars): {last_user_message[:500] if last_user_message else 'None'}")
+    logger.info(f"[RAW MESSAGE DEBUG] Contains VGPU_CONFIG: {'<VGPU_CONFIG>' in (last_user_message or '')}")
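For context, the escaping exists because LangChain-style prompt templates treat single braces as variables, so raw JSON in a chat message has to be doubled while the embedded `<VGPU_CONFIG>` block is passed through verbatim. A standalone sketch of that behavior so it can be exercised outside the server (a re-implementation for illustration, not the server's own code path; the tag pattern is assumed from the surrounding logging):

```python
import re

def escape_json_content(content: str) -> str:
    """Double curly braces everywhere except inside a <VGPU_CONFIG>...</VGPU_CONFIG> block."""
    m = re.search(r'<VGPU_CONFIG>.*?</VGPU_CONFIG>', content, re.DOTALL)
    if not m:
        return content.replace("{", "{{").replace("}", "}}")
    before, block, after = content[:m.start()], m.group(0), content[m.end():]
    return (before.replace("{", "{{").replace("}", "}}")
            + block
            + after.replace("{", "{{").replace("}", "}}"))

msg = 'Explain {"gpu": "L40S"} given <VGPU_CONFIG>{"vgpu_profile": "L40S-48Q"}</VGPU_CONFIG>'
print(escape_json_content(msg))
# Explain {{"gpu": "L40S"}} given <VGPU_CONFIG>{"vgpu_profile": "L40S-48Q"}</VGPU_CONFIG>
```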