-
- {messages.map((msg) => (
-
+
+
+ {/* Show centered button when no messages */}
+ {messages.length === 0 ? (
+
+              <button
+                onClick={() => setIsWizardOpen(true)}
+ className="bg-gradient-to-r from-green-600 to-green-700 text-white px-8 py-5 rounded-lg shadow-lg hover:from-green-700 hover:to-green-800 transition-all duration-200 hover:scale-[1.02] flex items-center justify-center space-x-3"
+ title="Open Workload Configuration Wizard"
>
-
-
- {msg.content
- ? renderMessageContent(msg.content, false, msg.id)
- : msg.role === "assistant" && streamState.isTyping
- ? renderMessageContent("", true, msg.id)
- : ""}
+
+
+
+
Create vGPU Sizing Recommendation
+
+
+ ) : (
+
+
+ {messages.map((msg) => (
+
+
+
+ {msg.content
+ ? renderMessageContent(msg.content, false, msg.id)
+ : msg.role === "assistant" && streamState.isTyping
+ ? renderMessageContent("", true, msg.id)
+ : ""}
+
+
-
+ ))}
+
- ))}
-
-
-
-
-
-
-              <button
-                onClick={() => setIsWizardOpen(true)}
- className="w-full bg-gradient-to-r from-green-600 to-green-700 text-white p-4 rounded-lg shadow-lg hover:from-green-700 hover:to-green-800 transition-all duration-200 hover:scale-[1.02] flex items-center justify-center space-x-3"
- title="Open Workload Configuration Wizard"
- >
-
-
-
- vGPU
- Initialize Sizing Job
-
-
+
+ )}
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
index 54889e8b8..940ce5571 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/VGPUConfigCard.tsx
@@ -15,7 +15,32 @@
"use client";
-import { useState, ReactNode, useRef } from "react";
+import { useState, ReactNode } from "react";
+
+// Tooltip trigger component - displays content in card's bottom banner
+const TooltipTrigger = ({
+ content,
+ children,
+ onShow,
+ onHide
+}: {
+ content: string;
+ children: ReactNode;
+ onShow: (content: string) => void;
+ onHide: () => void;
+}) => {
+ return (
+
+
+        onMouseEnter={() => onShow(content)}
+ onMouseLeave={onHide}
+ className="cursor-help"
+ >
+ {children}
+
+
+ );
+};
interface VGPUConfig {
title: string;
@@ -25,6 +50,7 @@ interface VGPUConfig {
gpu_model?: string | null;
vcpu_count?: number | null;
gpu_memory_size?: number | null;
+ gpu_count?: number | null;
system_RAM?: number | null;
concurrent_users?: number | null;
rag_breakdown?: {
@@ -37,6 +63,9 @@ interface VGPUConfig {
vector_db_dimension?: number;
reranker_model?: string;
reranker_memory?: string;
+ // Token configuration
+ prompt_size?: number;
+ response_size?: number;
};
// Legacy fields for backward compatibility (to be removed)
vGPU_profile?: string | null;
@@ -61,33 +90,10 @@ interface VGPUConfig {
interface VGPUConfigCardProps {
config: VGPUConfig;
+ hideAdvancedDetails?: boolean;
+ showOnlyAdvancedDetails?: boolean;
}
-// Tooltip trigger component - displays content in card's bottom banner
-const TooltipTrigger = ({
- content,
- children,
- onShow,
- onHide
-}: {
- content: string;
- children: ReactNode;
- onShow: (content: string) => void;
- onHide: () => void;
-}) => {
- return (
-
-
-        onMouseEnter={() => onShow(content)}
- onMouseLeave={onHide}
- className="cursor-help"
- >
- {children}
-
-
- );
-};
-
// Parameter definitions for tooltips - detailed explanations for users
const parameterDefinitions: { [key: string]: string } = {
vgpu_profile: "The specific NVIDIA vGPU profile (e.g., L40S-24Q, BSE-48Q) that partitions the physical GPU. The number indicates VRAM in GB, and 'Q' means it's optimized for compute workloads.",
@@ -102,6 +108,7 @@ const parameterDefinitions: { [key: string]: string } = {
time_to_first_token: "Time from request start until the first output token is generated (TTFT). Critical for streaming responses and perceived responsiveness. Heavily influenced by prompt length.",
throughput: "Number of tokens the system can generate per second across all concurrent requests. Higher throughput means better overall capacity and efficiency.",
model_tag: "The specific LLM model identifier (e.g., meta-llama/Llama-3-8b-instruct). Used to determine model size, architecture, and memory requirements.",
+ precision: "Numerical precision for model inference. FP16 (16-bit) offers high accuracy with moderate memory. FP8 (8-bit) reduces memory by ~50% with minimal accuracy loss. FP4 (4-bit) offers maximum memory savings for inference-only workloads.",
vector_db_vectors: "Total number of document embeddings stored in the vector database. More vectors = larger knowledge base but requires more memory for the vector index.",
vector_db_dimension: "Dimensionality of each embedding vector (determined by the embedding model). Common dimensions: 384, 768, 1024, 1536. Higher dimensions capture more semantic information but require more memory.",
// Legacy fields (kept for backward compatibility)
@@ -167,6 +174,19 @@ const ParameterIcon = ({ type, className = "w-4 h-4" }: { type: string; classNam
}
};
+// Helper to darken a color
+const darkenColor = (color: string, amount: number = 0.4): string => {
+ // Handle hex colors
+ if (color.startsWith('#')) {
+ const hex = color.slice(1);
+ const r = Math.max(0, Math.floor(parseInt(hex.slice(0, 2), 16) * (1 - amount)));
+ const g = Math.max(0, Math.floor(parseInt(hex.slice(2, 4), 16) * (1 - amount)));
+ const b = Math.max(0, Math.floor(parseInt(hex.slice(4, 6), 16) * (1 - amount)));
+ return `#${r.toString(16).padStart(2, '0')}${g.toString(16).padStart(2, '0')}${b.toString(16).padStart(2, '0')}`;
+ }
+ return color;
+};
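A quick sanity check of the darkening arithmetic above, as a minimal Python sketch (illustrative only, not part of the patch; the `darken` name is hypothetical and mirrors the TSX `darkenColor` helper):

```python
def darken(color: str, amount: float = 0.4) -> str:
    """Mirror of the TSX darkenColor helper: scale each RGB channel by (1 - amount)."""
    if not color.startswith("#"):
        return color
    r, g, b = (int(color[i:i + 2], 16) for i in (1, 3, 5))
    scaled = [max(0, int(c * (1 - amount))) for c in (r, g, b)]
    return "#" + "".join(f"{c:02x}" for c in scaled)

print(darken("#ef4444", 0.5))  # -> "#772222", the darker red used for the overhead arc
```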
+
// Circular Progress Chart Component
const VRAMUsageChart = ({
usedVRAM,
@@ -179,19 +199,31 @@ const VRAMUsageChart = ({
numGPUs: number;
gpuModel?: string;
}) => {
- const percentage = Math.min((usedVRAM / totalVRAM) * 100, 100);
+ // Calculate percentages - usable is 95% of total (5% reserved for system overhead)
+ const usableVRAM = totalVRAM * 0.95;
+ const reservedVRAM = totalVRAM * 0.05;
+ const percentage = Math.min((usedVRAM / usableVRAM) * 100, 100);
+ const reservedPercentage = 5; // Fixed 5% reserved for system overhead
+ // Cap the used percentage at 95% so the overhead segment is always visible
+ // The dial shows: 0-95% for actual usage, 95-100% for reserved overhead
+ const usedPercentageOfTotal = Math.min((usedVRAM / totalVRAM) * 100, 95);
+
const radius = 80;
const strokeWidth = 12;
const normalizedRadius = radius - strokeWidth * 2;
const circumference = normalizedRadius * 2 * Math.PI;
- const strokeDashoffset = circumference - (percentage / 100) * circumference;
- // Determine fit category and color
- const getFitCategory = (pct: number): { label: string; color: string; bgColor: string; textColor: string } => {
+ // Calculate stroke offsets - overhead segment starts at 95% position
+ const usedStrokeDashoffset = circumference - (usedPercentageOfTotal / 100) * circumference;
+ const reservedStrokeDashoffset = circumference - (95 / 100) * circumference;
+
+ // Determine fit category and color based on usable percentage
+ const getFitCategory = (pct: number): { label: string; color: string; darkColor: string; bgColor: string; textColor: string } => {
if (pct >= 90) {
return {
label: "TIGHT",
color: "#ef4444", // red-500
+ darkColor: darkenColor("#ef4444", 0.5), // darker red for overhead
bgColor: "rgba(239, 68, 68, 0.1)", // red with opacity
textColor: "#fca5a5" // red-300
};
@@ -199,6 +231,7 @@ const VRAMUsageChart = ({
return {
label: "MODERATE",
color: "#76b900", // NVIDIA green
+ darkColor: darkenColor("#76b900", 0.5), // darker green for overhead
bgColor: "rgba(118, 185, 0, 0.1)", // green with opacity
textColor: "#a3e635" // lime-400
};
@@ -206,6 +239,7 @@ const VRAMUsageChart = ({
return {
label: "COMFORTABLE",
color: "#10b981", // emerald-500
+ darkColor: darkenColor("#10b981", 0.5), // darker emerald for overhead
bgColor: "rgba(16, 185, 129, 0.1)", // emerald with opacity
textColor: "#6ee7b7" // emerald-300
};
@@ -240,19 +274,43 @@ const VRAMUsageChart = ({
cx={radius}
cy={radius}
/>
- {/* Progress circle */}
+ {/* Used VRAM circle */}
+ {/* Reserved/Overhead segment (5% - striped pattern to stand out) */}
+
+ {/* Overhead indicator line (subtle marker at 95% position) */}
+
{/* Center text */}
@@ -279,14 +337,14 @@ const VRAMUsageChart = ({
{/* Usage details */}
- {usedVRAM.toFixed(1)} GB
+ {usedVRAM.toFixed(1)} GB + {reservedVRAM.toFixed(1)} GB overhead
of {totalVRAM.toFixed(0)} GB VRAM
{numGPUs > 1 && (
- ({numGPUs}× {gpuModel || 'GPU'} GPUs with {(totalVRAM / numGPUs).toFixed(0)}GB each)
+ ({numGPUs}× {gpuModel || 'GPU'} with {(totalVRAM / numGPUs).toFixed(0)}GB each)
)}
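For readers tracing the dial math in this hunk, here is a small illustrative Python sketch (not part of the patch; the 30 GB workload and 48 GB GPU are assumed example figures) showing how the used and reserved arcs are derived:

```python
import math

# Assumed example: 30 GB required on a single 48 GB GPU.
used_vram, total_vram = 30.0, 48.0

usable_vram = total_vram * 0.95          # 45.6 GB usable (5% reserved for system overhead)
reserved_vram = total_vram * 0.05        # 2.4 GB rendered as the overhead segment
pct_of_usable = min(used_vram / usable_vram * 100, 100)    # ~65.8%, the MODERATE band
used_pct_of_total = min(used_vram / total_vram * 100, 95)  # 62.5%, capped at the 95% mark

radius, stroke_width = 80, 12
normalized_radius = radius - stroke_width * 2              # 56
circumference = 2 * math.pi * normalized_radius            # ~351.9

used_dashoffset = circumference * (1 - used_pct_of_total / 100)  # ~131.9
reserved_dashoffset = circumference * (1 - 0.95)                 # ~17.6, fixed at 95%
print(round(pct_of_usable, 1), round(used_dashoffset, 1), round(reserved_dashoffset, 1))
```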
@@ -320,7 +378,7 @@ const getIconType = (key: string): string => {
}
};
-export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
+export default function VGPUConfigCard({ config, hideAdvancedDetails = false, showOnlyAdvancedDetails = false }: VGPUConfigCardProps) {
const [isExpanded, setIsExpanded] = useState(true);
const [showAdvancedDetails, setShowAdvancedDetails] = useState(false);
const [showRawJSON, setShowRawJSON] = useState(false);
@@ -451,6 +509,8 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
return 'Performance Tier';
case 'concurrent_users':
return 'Concurrent Users';
+ case 'precision':
+ return 'Precision';
default:
return key.replace(/_/g, ' ').replace(/^./, str => str.toUpperCase());
}
@@ -458,8 +518,8 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
const isRelevantConfig = Object.values(config.parameters).some(value => value !== null && value !== undefined);
- // Fields to exclude from display
- const excludedFields = ['total_CPU_count', 'total_cpu_count', 'rag_breakdown'];
+ // Fields to exclude from display (RAG-specific fields are shown in RAG Components section)
+ const excludedFields = ['total_CPU_count', 'total_cpu_count', 'rag_breakdown', 'rag_config', 'gpu_count', 'gpu_model', 'embedding_model', 'vector_db_vectors', 'vector_db_dimension'];
// Separate key and advanced parameters, excluding unwanted fields
const keyParams = Object.entries(config.parameters).filter(([key]) =>
@@ -538,7 +598,9 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
// For passthrough, use 95% usable capacity (reserve 5% for driver/OS overhead)
const usablePerGpu = gpuCapacity * 0.95;
- const numGPUs = Math.ceil(estimatedVRAM / usablePerGpu);
+ // Use gpu_count from backend if available (backend has already calculated this correctly)
+ const backendGpuCount = config.parameters.gpu_count;
+ const numGPUs = backendGpuCount && backendGpuCount >= 1 ? backendGpuCount : Math.ceil(estimatedVRAM / usablePerGpu);
return {
used: estimatedVRAM,
@@ -553,8 +615,65 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
const singleGPUCapacity = getGPUCapacityFromProfile(profile);
if (!singleGPUCapacity) return null;
- // Calculate number of GPUs needed (ceiling)
- const numGPUs = Math.ceil(estimatedVRAM / singleGPUCapacity);
+ // Use gpu_count from backend if available AND valid
+ // Validate that backend gpu_count provides enough capacity for required VRAM
+ const backendGpuCount = config.parameters.gpu_count;
+ let numGPUs: number;
+ let needsLargerProfile = false;
+
+ // Small epsilon for floating point comparison (0.1 GB tolerance)
+ const EPSILON = 0.1;
+
+ if (backendGpuCount && backendGpuCount >= 1) {
+ const backendCapacity = backendGpuCount * singleGPUCapacity;
+ // Check if backend calculation provides enough capacity (with 5% headroom)
+ const usableBackendCapacity = backendCapacity * 0.95;
+ // Use epsilon for floating point comparison (24 * 0.95 = 22.799999... not 22.8)
+ if (usableBackendCapacity + EPSILON >= estimatedVRAM) {
+ numGPUs = backendGpuCount;
+ } else {
+ // Backend config is invalid - recalculate based on usable capacity per GPU
+ const usablePerGpu = singleGPUCapacity * 0.95;
+ numGPUs = Math.ceil(estimatedVRAM / (usablePerGpu + EPSILON));
+ console.warn(`Backend gpu_count (${backendGpuCount}) insufficient for ${estimatedVRAM}GB. Recalculated to ${numGPUs}.`);
+ }
+ } else {
+ // No backend count - calculate ourselves with 5% headroom per GPU
+ const usablePerGpu = singleGPUCapacity * 0.95;
+ numGPUs = Math.ceil(estimatedVRAM / (usablePerGpu + EPSILON));
+ }
+
+ // Check if there's a larger vGPU profile available that could reduce GPU count
+ // Only show warning if: 1) numGPUs > 1, 2) it's a vGPU profile, AND 3) a larger profile exists
+ if (numGPUs > 1 && profile) {
+ // Get max profile capacity for this GPU family
+ const gpuFamily = profile.split('-')[0];
+ const maxProfiles: { [key: string]: number } = {
+ 'BSE': 96,
+ 'L40S': 48,
+ 'L40': 48,
+ 'A40': 48,
+ 'L4': 24
+ };
+ const maxProfileCapacity = maxProfiles[gpuFamily] || singleGPUCapacity;
+
+ // Only flag as needing larger profile if:
+ // 1. Current profile is NOT the largest available for this GPU family, AND
+ // 2. A larger profile would actually reduce the number of GPUs needed
+ if (singleGPUCapacity < maxProfileCapacity) {
+ const usableMaxProfile = maxProfileCapacity * 0.95;
+ const gpusNeededWithMaxProfile = Math.ceil(estimatedVRAM / usableMaxProfile);
+
+ // Only show warning if upgrading to max profile would reduce GPU count
+ if (gpusNeededWithMaxProfile < numGPUs) {
+ needsLargerProfile = true;
+ console.warn(`Consider ${gpuFamily}-${maxProfileCapacity}Q profile which would only need ${gpusNeededWithMaxProfile} GPU(s) instead of ${numGPUs}.`);
+ }
+ }
+ // Note: If already using largest profile, multi-GPU vGPU is the correct solution
+ // No warning needed - this is expected behavior for large workloads
+ }
+
// Calculate total capacity across all GPUs
const totalCapacity = numGPUs * singleGPUCapacity;
@@ -563,20 +682,85 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
total: totalCapacity,
numGPUs: numGPUs,
singleGPUCapacity: singleGPUCapacity,
- isPassthrough: false
+ isPassthrough: false,
+ needsLargerProfile: needsLargerProfile
};
};
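The backend `gpu_count` validation and the "larger profile" hint above can be summarized with a short Python sketch (illustrative only; `gpus_needed` is a hypothetical name, and the 48 GB maximum is the L40S entry from the `maxProfiles` table in the hunk):

```python
import math

EPSILON = 0.1  # same tolerance the card uses for floating point comparisons

def gpus_needed(required_gb: float, profile_gb: float, backend_count: int | None) -> tuple[int, bool]:
    """Mirror of the card's check: trust the backend gpu_count only if it fits with 5% headroom."""
    usable_per_gpu = profile_gb * 0.95
    if backend_count and backend_count * usable_per_gpu + EPSILON >= required_gb:
        count = backend_count
    else:
        count = math.ceil(required_gb / (usable_per_gpu + EPSILON))
    # Would the family's largest profile (48 GB for L40S) cut the GPU count?
    max_profile_gb = 48.0
    needs_larger = profile_gb < max_profile_gb and math.ceil(required_gb / (max_profile_gb * 0.95)) < count
    return count, needs_larger

print(gpus_needed(30.0, 24.0, backend_count=1))  # (2, True): two L40S-24Q, or one L40S-48Q instead
```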
const vramUsage = getVRAMUsageData();
+ // If showing only advanced details, render just that section
+ if (showOnlyAdvancedDetails) {
+ return (
+
+ {advancedParams.length > 0 && (
+
+
setShowAdvancedDetails(!showAdvancedDetails)}
+ className="flex items-center gap-2 text-gray-400 hover:text-[#76b900]/70 transition-all duration-150 ease-in-out mb-2 group"
+ >
+
+
+
+ Advanced Details
+
+
+ {showAdvancedDetails && (
+
+
+
+ {advancedParams.map(([key, value], index) => (
+
+
+
+
+
+
+ {getParameterLabel(key)}
+
+ {parameterDefinitions[key] && (
+
+
+
+
+
+
+ {parameterDefinitions[key]}
+
+
+
+ )}
+
+
+ {formatParameterValue(key, value)}
+
+
+
+
+ ))}
+
+
+
+ )}
+
+ )}
+
+ );
+ }
+
return (
-
+
{/* Content */}
{isExpanded && (
-
+
{/* Host Capabilities Context */}
{config.host_capabilities && (
-
+
@@ -608,12 +792,12 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
)}
-
+
{/* VRAM Usage Chart / JSON View */}
{vramUsage && (
-
+
{/* Header - Always visible */}
-
+
@@ -627,7 +811,7 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
?
-
+
{config.rationale || "This configuration balances performance and resource efficiency for your specific AI workload, ensuring optimal GPU utilization while maintaining cost-effectiveness."}
@@ -698,16 +882,20 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
gpuModel={vramUsage.gpuModel}
/>
-
-
-
Configuration Summary
-
+
+
+
Configuration Summary
+
- Required VRAM:
+ Required VRAM:
{vramUsage.used.toFixed(1)} GB
- GPU Profile:
+ 5% Reserved Overhead:
+ {(vramUsage.total * 0.05).toFixed(1)} GB
+
+
+ GPU Profile:
{config.parameters.vgpu_profile || config.parameters.vGPU_profile ||
GPU Passthrough Required
@@ -715,11 +903,16 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
- GPUs Required:
+ GPUs Required:
{vramUsage.numGPUs}
+ {vramUsage.needsLargerProfile && (
+
+ ⚠️ Consider a larger vGPU profile or GPU passthrough. Multi-GPU vGPU profiles typically require separate VMs.
+
+ )}
- Total Capacity:
+ Total Capacity:
{vramUsage.total.toFixed(0)} GB
@@ -727,174 +920,154 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
Utilization Guidelines
-
- ●
+
+ ●
Comfortable (0-60%): Ideal for production with room for growth
-
- ●
+
+ ●
Moderate (60-90%): Efficient utilization with performance buffer
-
- ●
+
+ ●
Tight (90-100%): Consider larger GPU profile or additional units
- >
- )}
-
- )}
-
- {/* Key Parameters Section */}
- {keyParams.length > 0 && (
-
-
Key Parameters
-
-
-
- {keyParams.map(([key, value], index) => (
-
-
-
-
-
- {getParameterLabel(key)}
+
+ {/* Key Parameters Section - Inside VRAM Analysis */}
+ {keyParams.length > 0 && (
+
+
Key Parameters
+
+
+
+ {keyParams.map(([key, value], index) => (
+
+
+
+
+
+ {getParameterLabel(key)}
+
+ {parameterDefinitions[key] && (
+
+
+
+
+
+
+ {parameterDefinitions[key]}
+
+
+
+ )}
+
+
+
+ {formatParameterValue(key, value)}
- {parameterDefinitions[key] && (
-
setKeyParamsTooltip(null)}
- >
-
-
-
-
- )}
-
-
- {formatParameterValue(key, value)}
-
+ ))}
- ))}
-
-
-
- {/* Tooltip Banner for Key Parameters */}
- {keyParamsTooltip && (
-
-
-
-
-
-
{keyParamsTooltip}
- )}
-
+
+ )}
+ >
+ )}
)}
- {/* RAG Components Breakdown - Only show for RAG workloads */}
+ {/* RAG Components - Only show for RAG workloads */}
{config.parameters.rag_breakdown &&
config.parameters.rag_breakdown.workload_type === 'rag' &&
(config.parameters.rag_breakdown.embedding_model || config.parameters.rag_breakdown.vector_db_memory) && (
-
-
+
+
- RAG Components Memory
+ RAG Components
-
-
+
+ {/* Embedding Model and Vector Database side by side - compact */}
+
{/* Embedding Model */}
{config.parameters.rag_breakdown.embedding_model && (
-
-
-
Embedding Model
-
- {config.parameters.rag_breakdown.embedding_model}
+
+
+
Embedding Model
+
+ {config.parameters.rag_breakdown.embedding_model.split('/').pop()}
{config.parameters.rag_breakdown.vector_db_dimension && (
-
- Output: {config.parameters.rag_breakdown.vector_db_dimension}D vectors
+
+ {config.parameters.rag_breakdown.vector_db_dimension}D output
)}
-
-
- {config.parameters.rag_breakdown.embedding_memory}
-
+
+ {config.parameters.rag_breakdown.embedding_memory}
)}
{/* Vector Database */}
{config.parameters.rag_breakdown.vector_db_memory && (
-
-
-
Vector Database Index
-
- {config.parameters.rag_breakdown.vector_db_vectors &&
- config.parameters.rag_breakdown.vector_db_dimension && (
- <>
-
- {config.parameters.rag_breakdown.vector_db_vectors >= 10000000 ? 'Extra Large' :
- config.parameters.rag_breakdown.vector_db_vectors >= 1000000 ? 'Large' :
- config.parameters.rag_breakdown.vector_db_vectors >= 100000 ? 'Medium' : 'Small'}
-
-
- {config.parameters.rag_breakdown.vector_db_vectors.toLocaleString()} vectors × {config.parameters.rag_breakdown.vector_db_dimension}D
-
- >
- )}
- {(!config.parameters.rag_breakdown.vector_db_vectors ||
- !config.parameters.rag_breakdown.vector_db_dimension) && (
-
Index memory
- )}
-
+
+
+
Vector Database
+ {config.parameters.rag_breakdown.vector_db_vectors &&
+ config.parameters.rag_breakdown.vector_db_dimension ? (
+ <>
+
+ {config.parameters.rag_breakdown.vector_db_vectors >= 10000000 ? 'Extra Large' :
+ config.parameters.rag_breakdown.vector_db_vectors >= 1000000 ? 'Large' :
+ config.parameters.rag_breakdown.vector_db_vectors >= 100000 ? 'Medium' : 'Small'}
+
+
+ {config.parameters.rag_breakdown.vector_db_vectors.toLocaleString()} × {config.parameters.rag_breakdown.vector_db_dimension}D
+
+ >
+ ) : (
+
Index memory
+ )}
-
-
- {config.parameters.rag_breakdown.vector_db_memory}
-
+
+ {config.parameters.rag_breakdown.vector_db_memory}
)}
+
- {/* Reranker Model */}
- {config.parameters.rag_breakdown.reranker_model && (
-
-
-
Reranker Model
-
- {config.parameters.rag_breakdown.reranker_model}
-
-
-
-
- {config.parameters.rag_breakdown.reranker_memory}
-
+ {/* Reranker Model - full width below if present */}
+ {config.parameters.rag_breakdown.reranker_model && (
+
+
+
Reranker
+
+ {config.parameters.rag_breakdown.reranker_model}
- )}
-
+
+ {config.parameters.rag_breakdown.reranker_memory}
+
+
+ )}
)}
{/* Advanced Details - Collapsible */}
- {advancedParams.length > 0 && (
+ {!hideAdvancedDetails && advancedParams.length > 0 && (
setShowAdvancedDetails(!showAdvancedDetails)}
@@ -942,69 +1115,6 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
))}
-
- {/* Add RAG-specific vector DB details */}
- {config.parameters.rag_breakdown?.vector_db_vectors && (
-
-
-
-
-
-
- Vector DB Vectors
-
- {parameterDefinitions['vector_db_vectors'] && (
-
setAdvancedTooltip(null)}
- >
-
-
-
-
- )}
-
-
- {config.parameters.rag_breakdown.vector_db_vectors.toLocaleString()}
-
-
-
-
- )}
-
- {config.parameters.rag_breakdown?.vector_db_dimension && (
-
-
-
-
-
-
- Vector Dimension
-
- {parameterDefinitions['vector_db_dimension'] && (
-
setAdvancedTooltip(null)}
- >
-
-
-
-
- )}
-
-
- {config.parameters.rag_breakdown.vector_db_dimension}D
-
-
-
-
- )}
@@ -1025,7 +1135,7 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
{/* Notes/Recommendations */}
{config.notes && config.notes.length > 0 && (
-
+
@@ -1044,7 +1154,7 @@ export default function VGPUConfigCard({ config }: VGPUConfigCardProps) {
{/* No config warning */}
{!isRelevantConfig && (
-
+
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx
index d776eef14..8c87f9e4f 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Chat/WorkloadConfigWizard.tsx
@@ -52,14 +52,14 @@ export default function WorkloadConfigWizard({
}: WorkloadConfigWizardProps) {
const [config, setConfig] = useState({
workloadType: "",
- specificModel: "",
+ specificModel: "nemotron-30b-fp8",
modelSize: "",
batchSize: "",
promptSize: "1024",
responseSize: "256",
embeddingModel: "nvidia/nvolveqa-embed-large-1B",
- gpuInventory: { "DC": 1 },
- precision: "fp16",
+ gpuInventory: { "BSE": 1 },
+ precision: "fp8",
vectorDimension: "1024", // Default to 1024 (matches default embedding model)
numberOfVectors: "10000", // Default to 10,000
advancedConfig: {
@@ -89,23 +89,34 @@ export default function WorkloadConfigWizard({
const data = await response.json();
if (data.models && data.models.length > 0) {
// Use modelTag as value to ensure uniqueness (full model ID like "org/model-name")
- const formattedModels = data.models.map((model: any) => ({
- value: model.modelTag.toLowerCase().replace(/\//g, '-').replace(/\./g, '-'),
- label: model.label,
- modelTag: model.modelTag
- }));
- setDynamicModels(formattedModels);
- console.log(`✓ Successfully loaded ${formattedModels.length} models from HuggingFace`);
+ const formattedModels = data.models
+ .filter((model: any) => model && model.modelTag) // Filter out invalid models
+ .map((model: any) => ({
+ value: (model.modelTag || '').toLowerCase().replace(/\//g, '-').replace(/\./g, '-'),
+ label: model.label || model.modelTag || 'Unknown Model',
+ modelTag: model.modelTag
+ }));
+ // Always prepend Nemotron as the first/default option
+ const nemotronModel = {
+ value: "nemotron-30b-fp8",
+ label: "NVIDIA Nemotron-3 Nano 30B",
+ modelTag: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
+ };
+ setDynamicModels([nemotronModel, ...formattedModels]);
+ console.log(`✓ Successfully loaded ${formattedModels.length + 1} models (including Nemotron)`);
} else {
- console.warn('No models returned from API');
+ console.warn('No models returned from API, using fallback');
+ setIsLoadingModels(false);
}
} else {
console.warn('API returned non-OK status:', response.status);
+ setIsLoadingModels(false);
}
} catch (error) {
console.error('Failed to fetch dynamic models:', error);
console.log('Using fallback model list');
// Fallback to hardcoded models will be used
+ setIsLoadingModels(false);
} finally {
setIsLoadingModels(false);
}
@@ -146,15 +157,16 @@ export default function WorkloadConfigWizard({
];
const availableGPUInventory = [
- { value: "DC", label: "NVIDIA RTX Pro 6000 BSE", desc: "96GB GDDR7 with ECC, Blackwell, passive‑cooled dual‑slot PCIe Gen5 – Enterprise AI/graphics, scientific computing & virtual workstations" },
- { value: "l40s", label: "NVIDIA L40S", desc: "48GB GDDR6 with ECC, Ada Lovelace, 350W - ML training & inference + virtual workstations" },
- { value: "l40", label: "NVIDIA L40", desc: "48GB GDDR6 with ECC, Ada Lovelace - Virtual workstations & compute workloads" },
- { value: "l4", label: "NVIDIA L4", desc: "24GB GDDR6 with ECC, Ada Lovelace, 72W - AI inference, small model training & 3D graphics" },
- { value: "a40", label: "NVIDIA A40", desc: "48GB GDDR6 with ECC, Ampere, 300W - 3D design & mixed virtual workstation workloads" },
+ { value: "BSE", label: "NVIDIA RTX Pro 6000 BSE", desc: "96GB GDDR7 with ECC, Blackwell, passive‑cooled dual‑slot PCIe Gen5 – Enterprise AI/graphics, scientific computing & virtual workstations" },
+ { value: "L40S", label: "NVIDIA L40S", desc: "48GB GDDR6 with ECC, Ada Lovelace, 350W - ML training & inference + virtual workstations" },
+ { value: "L40", label: "NVIDIA L40", desc: "48GB GDDR6 with ECC, Ada Lovelace - Virtual workstations & compute workloads" },
+ { value: "L4", label: "NVIDIA L4", desc: "24GB GDDR6 with ECC, Ada Lovelace, 72W - AI inference, small model training & 3D graphics" },
+ { value: "A40", label: "NVIDIA A40", desc: "48GB GDDR6 with ECC, Ampere, 300W - 3D design & mixed virtual workstation workloads" },
];
// Fallback hardcoded models in case dynamic fetch fails
const fallbackModels = [
+ { value: "nemotron-30b-fp8", label: "NVIDIA Nemotron-3 Nano 30B", modelTag: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8" },
{ value: "llama-3-8b", label: "Llama-3-8B", modelTag: "meta-llama/Meta-Llama-3-8B-Instruct" },
{ value: "llama-3-70b", label: "Llama-3-70B", modelTag: "meta-llama/Meta-Llama-3-70B-Instruct" },
{ value: "llama-3.1-8b", label: "Llama-3.1-8B", modelTag: "meta-llama/Llama-3.1-8B-Instruct" },
@@ -170,8 +182,8 @@ export default function WorkloadConfigWizard({
const specificModels = dynamicModels.length > 0 ? dynamicModels : fallbackModels;
const precisionOptions = [
- { value: "fp16", label: "FP16", desc: "Half precision - Recommended balance of performance and accuracy" },
- { value: "fp8", label: "FP8", desc: "8-bit floating point - Higher performance with good accuracy" },
+ { value: "fp8", label: "FP8", desc: "8-bit floating point - Recommended for best performance with good accuracy" },
+ { value: "fp16", label: "FP16", desc: "Half precision - Higher accuracy, more memory usage" },
{ value: "fp4", label: "FP4", desc: "4-bit floating point - Maximum performance, lower accuracy" },
];
@@ -358,7 +370,7 @@ export default function WorkloadConfigWizard({
parts.push(`with ${precisionLabel} precision`);
} else {
// Recommended precision
- parts.push(`with FP16 precision`);
+ parts.push(`with FP8 precision`);
}
// Add retrieval configuration for RAG workloads
@@ -382,7 +394,20 @@ export default function WorkloadConfigWizard({
// Determine the model tag to use
let modelTagToUse = null;
if (config.specificModel && config.specificModel !== 'unknown') {
+ // First try to find in dynamic/fallback models
modelTagToUse = specificModels.find(m => m.value === config.specificModel)?.modelTag || null;
+
+ // Hardcoded fallback for common models if lookup fails
+ if (!modelTagToUse) {
+        const modelTagFallbacks: Record<string, string> = {
+ 'nemotron-30b-fp8': 'nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8',
+ 'llama-3-8b': 'meta-llama/Meta-Llama-3-8B-Instruct',
+ 'llama-3-70b': 'meta-llama/Meta-Llama-3-70B-Instruct',
+ 'llama-3.1-8b': 'meta-llama/Llama-3.1-8B-Instruct',
+ 'llama-3.1-70b': 'meta-llama/Llama-3.3-70B-Instruct',
+ };
+ modelTagToUse = modelTagFallbacks[config.specificModel] || config.specificModel;
+ }
} else if (config.specificModel === 'unknown' && config.modelSize) {
// Use default model for the size category
modelTagToUse = getDefaultModelForSize(config.modelSize).modelTag;
@@ -398,14 +423,14 @@ export default function WorkloadConfigWizard({
responseSize: config.responseSize ? parseInt(config.responseSize) : 256,
embeddingModel: config.workloadType === 'rag' ? (config.embeddingModel || getRecommendedEmbeddingModel()) : null,
gpuInventory: config.gpuInventory,
- precision: config.precision || 'fp16',
+ precision: config.precision || 'fp8',
// Add retrieval config for RAG
...(config.workloadType === 'rag' && {
vectorDimension: config.vectorDimension ? parseInt(config.vectorDimension) : null,
numberOfVectors: config.numberOfVectors ? parseInt(config.numberOfVectors) : null,
}),
// Add computed values for easier backend processing
- selectedGPU: Object.keys(config.gpuInventory)[0] || 'DC',
+ selectedGPU: Object.keys(config.gpuInventory)[0] || 'BSE',
gpuCount: Object.values(config.gpuInventory)[0] as number || 1,
// Include advanced configuration
advancedConfig: config.advancedConfig,
@@ -419,17 +444,17 @@ export default function WorkloadConfigWizard({
const query = generateQuery();
onSubmit(query);
onClose();
- // Reset form
+ // Reset form - use same defaults as initial state
setConfig({
workloadType: "",
- specificModel: "",
+ specificModel: "nemotron-30b-fp8", // Default to Nemotron model
modelSize: "",
batchSize: "",
promptSize: "1024",
responseSize: "256",
embeddingModel: "nvidia/nvolveqa-embed-large-1B",
- gpuInventory: { "DC": 1 },
- precision: "fp16",
+ gpuInventory: { "BSE": 1 },
+ precision: "fp8",
vectorDimension: "1024", // Default to 1024 (matches default embedding model)
numberOfVectors: "10000", // Default to 10,000
advancedConfig: {
@@ -548,13 +573,10 @@ export default function WorkloadConfigWizard({
className="w-full p-3 rounded-lg bg-neutral-800 border border-neutral-600 text-white mb-4"
disabled={isLoadingModels}
>
-
- {isLoadingModels ? "Loading models from HuggingFace..." : "Select a specific model"}
-
- Unknown / Not Sure
{specificModels.map((model) => (
{model.label}
))}
+ Unknown / Not Sure
{!isLoadingModels && dynamicModels.length > 0 && (
✓ {dynamicModels.length} models loaded from HuggingFace
@@ -705,12 +727,12 @@ export default function WorkloadConfigWizard({
GPU Selection
{
// Clear existing inventory and set the new one
setConfig(prev => ({
...prev,
- gpuInventory: e.target.value ? { [e.target.value]: 1 } : { "DC": 1 }
+ gpuInventory: e.target.value ? { [e.target.value]: 1 } : { "BSE": 1 }
}));
}}
className="w-full px-3 py-2 rounded-md bg-neutral-800 border border-neutral-700 text-white text-sm focus:outline-none focus:ring-1 focus:ring-green-500 focus:border-green-500"
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/Header/Header.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/Header/Header.tsx
index 906e05b7f..d63f94238 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/app/components/Header/Header.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/Header/Header.tsx
@@ -23,6 +23,10 @@ interface HeaderProps {
}
export default function Header({ onToggleSidebar, activePanel }: HeaderProps) {
+ // Pull model names from centralized configuration (via environment variables)
+ const reasoningModel = (process.env.NEXT_PUBLIC_MODEL_NAME || "nvidia/llama-3.3-nemotron-super-49b-v1").replace(/^nvidia\//, '');
+ const embeddingModel = (process.env.NEXT_PUBLIC_EMBEDDING_MODEL || "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1").replace(/^nvidia\//, '');
+
return (
@@ -36,9 +40,14 @@ export default function Header({ onToggleSidebar, activePanel }: HeaderProps) {
-
- RAG Chat Model: {process.env.NEXT_PUBLIC_MODEL_NAME || "Meta Llama 3.1 8B"}
-
+
+
+ Reasoning: {reasoningModel}
+
+
+ Embedding: {embeddingModel}
+
+
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/ChatPanel.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/ChatPanel.tsx
new file mode 100644
+"use client";
+
+import { useState, useRef, useEffect } from "react";
+
+interface ChatPanelProps {
+  vgpuConfig?: any;
+  onSendMessage: (message: string) => void;
+ chatHistory: Array<{
+ role: "user" | "assistant";
+ content: string;
+ citations?: Array<{ text: string; source: string; document_type: string }>;
+ }>;
+ isLoading?: boolean;
+ onCloseChat?: () => void;
+}
+
+export default function ChatPanel({
+ vgpuConfig,
+ onSendMessage,
+ chatHistory,
+ isLoading = false,
+ onCloseChat,
+}: ChatPanelProps) {
+ const [inputMessage, setInputMessage] = useState("");
+  const messagesEndRef = useRef<HTMLDivElement>(null);
+
+ const handleSubmit = (e: React.FormEvent) => {
+ e.preventDefault();
+ if (inputMessage.trim() && !isLoading) {
+ onSendMessage(inputMessage.trim());
+ setInputMessage("");
+ }
+ };
+
+ // Auto-scroll to bottom when new messages arrive
+ useEffect(() => {
+ messagesEndRef.current?.scrollIntoView({ behavior: "smooth" });
+ }, [chatHistory, isLoading]);
+
+ return (
+
+ {/* Chat Header */}
+
+
+
+ Ask Questions About Your Configuration
+
+ {onCloseChat && (
+
+
+
+
+
+ )}
+
+
+
+ {/* Chat Messages - Scrollable */}
+
+ {chatHistory.length === 0 ? (
+
+
Examples
+
+
+
- What does this profile mean?
+
- Can it support 10 concurrent users?
+
- Should I use the next larger profile?
+
- What are the RAM requirements?
+
+
+
+ ) : (
+ chatHistory.map((msg, idx) => (
+
+
+
+ {/* Display citations if available */}
+ {msg.citations && msg.citations.length > 0 && (
+
+
+
+
+
+
+
Referenced Documents:
+
+
+ {[...new Set(msg.citations.map(c => c.source))].map((source, i) => (
+
+ • {source.split('/').pop() || source}
+
+ ))}
+
+
+
+ )}
+
+ ))
+ )}
+ {isLoading && (
+
+ )}
+
+
+
+ {/* Input Area - At bottom of chat panel */}
+
+
+
+
+ );
+}
+
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx
index e057db93b..8ac8ac64b 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/components/RightSidebar/RightSidebar.tsx
@@ -17,9 +17,22 @@
import { useEffect, useState } from "react";
import Citations from "./Citations";
+import ChatPanel from "./ChatPanel";
import { useSidebar } from "../../context/SidebarContext";
-export default function RightSidebar() {
+interface RightSidebarProps {
+ vgpuConfig?: any;
+ onSendChatMessage?: (message: string) => void;
+ chatHistory?: Array<{ role: "user" | "assistant"; content: string }>;
+ isChatLoading?: boolean;
+}
+
+export default function RightSidebar({
+ vgpuConfig,
+ onSendChatMessage,
+ chatHistory = [],
+ isChatLoading = false,
+}: RightSidebarProps) {
const { activePanel, closeSidebar, activeCitations } = useSidebar();
const [displayPanel, setDisplayPanel] = useState(activePanel);
@@ -34,27 +47,56 @@ export default function RightSidebar() {
}
}, [activePanel]);
+ const getPanelTitle = () => {
+ if (displayPanel === "citations") return "Citations";
+ if (displayPanel === "chat") return "Configuration Chat";
+ return "";
+ };
+
return (
-
-
- Citations
-
-
- ×
-
-
-
-
+ {displayPanel !== "chat" && (
+
+
+ {getPanelTitle()}
+
+
+ ×
+
+
+ )}
+
+ {displayPanel === "citations" && (
+
+
+
+ )}
+ {displayPanel === "chat" && onSendChatMessage && (
+
+
+ ×
+
+
+
+ )}
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx b/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx
index ee083cd89..e4f07dacc 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/context/SidebarContext.tsx
@@ -19,9 +19,9 @@ import { createContext, useContext, useState, ReactNode } from "react";
import { Citation } from "@/types/chat";
interface SidebarContextType {
- activePanel: "citations" | "settings" | null;
+ activePanel: "citations" | "settings" | "chat" | null;
activeCitations: Citation[];
- toggleSidebar: (panel: "citations" | "settings") => void;
+ toggleSidebar: (panel: "citations" | "settings" | "chat") => void;
closeSidebar: () => void;
setActiveCitations: (citations: Citation[]) => void;
}
@@ -30,11 +30,11 @@ const SidebarContext = createContext<SidebarContextType | undefined>(undefined);
export function SidebarProvider({ children }: { children: ReactNode }) {
const [activePanel, setActivePanel] = useState<
- "citations" | "settings" | null
+ "citations" | "settings" | "chat" | null
>(null);
  const [activeCitations, setActiveCitations] = useState<Citation[]>([]);
- const toggleSidebar = (panel: "citations" | "settings") => {
+ const toggleSidebar = (panel: "citations" | "settings" | "chat") => {
setActivePanel(activePanel === panel ? null : panel);
};
diff --git a/community/ai-vws-sizing-advisor/frontend/src/app/globals.css b/community/ai-vws-sizing-advisor/frontend/src/app/globals.css
index c099bf4b3..1c3a98c54 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/app/globals.css
+++ b/community/ai-vws-sizing-advisor/frontend/src/app/globals.css
@@ -20,6 +20,30 @@ body {
}
}
+/* Green scrollbar only for chat messages area */
+.chat-scrollbar::-webkit-scrollbar {
+ width: 10px;
+}
+
+.chat-scrollbar::-webkit-scrollbar-track {
+ background: #252525;
+ border-radius: 5px;
+}
+
+.chat-scrollbar::-webkit-scrollbar-thumb {
+ background: #76b900;
+ border-radius: 5px;
+}
+
+.chat-scrollbar::-webkit-scrollbar-thumb:hover {
+ background: #5a8c00;
+}
+
+.chat-scrollbar {
+ scrollbar-width: thin;
+ scrollbar-color: #76b900 #252525;
+}
+
@keyframes typing {
0% {
content: "";
diff --git a/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts b/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts
index ec60a8207..d2eff0e43 100644
--- a/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts
+++ b/community/ai-vws-sizing-advisor/frontend/src/types/chat.ts
@@ -62,4 +62,5 @@ export interface GenerateRequest {
reranker_model?: string;
reranker_endpoint?: string;
stop?: string[];
+ conversational_mode?: boolean;
}
diff --git a/community/ai-vws-sizing-advisor/scripts/start_app.sh b/community/ai-vws-sizing-advisor/scripts/start_app.sh
index 4d6624fe2..c3c79cbc0 100755
--- a/community/ai-vws-sizing-advisor/scripts/start_app.sh
+++ b/community/ai-vws-sizing-advisor/scripts/start_app.sh
@@ -90,6 +90,14 @@ docker_login() {
setup_environment() {
print_info "Setting up environment..."
+ # Source centralized model configuration first (highest priority)
+ if [ -f "$COMPOSE_DIR/model_config.env" ]; then
+ set -a
+ source "$COMPOSE_DIR/model_config.env"
+ set +a
+ print_status "Loaded centralized model configuration"
+ fi
+
# Source .env file
if [ -f "$COMPOSE_DIR/.env" ]; then
set -a
@@ -253,6 +261,11 @@ show_status() {
echo " • Ingestor API: http://localhost:8082"
echo " • Milvus: http://localhost:9011"
echo ""
+ echo -e "${BLUE}🤖 AI Models:${NC}"
+ echo " • Chat/LLM: ${APP_LLM_MODELNAME:-nvidia/llama-3.3-nemotron-super-49b-v1}"
+ echo " • Embedding: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}"
+ echo " • Config File: deploy/compose/model_config.env"
+ echo ""
echo -e "${BLUE}📚 Knowledge Base:${NC}"
echo " • Collection: vgpu_knowledge_base"
echo " • Location: ./vgpu_docs"
@@ -274,6 +287,7 @@ show_status() {
echo " • Stop Backend: ./scripts/stop_app.sh"
echo " • Restart App: ./scripts/restart_app.sh"
echo " • Logs: docker logs -f rag-server"
+ echo " • Change Models: Edit deploy/compose/model_config.env"
echo ""
}
diff --git a/community/ai-vws-sizing-advisor/src/apply_configuration.py b/community/ai-vws-sizing-advisor/src/apply_configuration.py
index 497baaea2..3874a17bf 100644
--- a/community/ai-vws-sizing-advisor/src/apply_configuration.py
+++ b/community/ai-vws-sizing-advisor/src/apply_configuration.py
@@ -112,7 +112,11 @@ def calculate_gpu_memory_utilization(
return 0.9
# Use recommended workload size if provided, otherwise extract from profile
+ # IMPORTANT: Add KV cache to workload - the calculator provides model memory only
workload_memory_gb = recommended_workload_gb
+ if workload_memory_gb and kv_cache_gb:
+ workload_memory_gb = recommended_workload_gb + kv_cache_gb
+ logger.info(f" Total workload = {recommended_workload_gb}GB (model) + {kv_cache_gb:.2f}GB (KV cache) = {workload_memory_gb:.2f}GB")
if not workload_memory_gb:
# Extract profile memory size from vGPU profile name (e.g., "DC-12Q" → 12)
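As a rough illustration of the KV-cache adjustment above (values are assumed, and the actual utilization formula lives further down in this function, outside this hunk):

```python
# Assumed example numbers, not taken from the patch:
recommended_workload_gb = 30.0   # model weights reported by the calculator
kv_cache_gb = 4.2                # KV-cache estimate for the requested context/concurrency
profile_gb = 48.0                # e.g. an L40S-48Q profile

workload_memory_gb = recommended_workload_gb + kv_cache_gb   # 34.2 GB total workload
print(round(workload_memory_gb / profile_gb, 2))             # 0.71, vs ~0.63 from weights alone
```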
@@ -753,6 +757,7 @@ def run_command(cmd: str, shell: bool = True) -> tuple:
# Build docker command - only include max-model-len if specified
# Note: gpu_util may exceed 0.90 intentionally - vLLM will adapt KV cache to available memory
+ # Use vLLM v0.12.0+ for proper NemotronH (hybrid Mamba-Transformer) architecture support
docker_cmd_parts = [
"docker run -d --runtime nvidia --gpus all",
f"--name {container_name}",
@@ -760,9 +765,10 @@ def run_command(cmd: str, shell: bool = True) -> tuple:
f'-e "HUGGING_FACE_HUB_TOKEN={hf_token}"',
"-p 8000:8000",
"--ipc=host",
- "vllm/vllm-openai:latest",
+ "vllm/vllm-openai:v0.12.0",
f"--model {model}",
- f"--gpu-memory-utilization {gpu_util:.2f}"
+ f"--gpu-memory-utilization {gpu_util:.2f}",
+ "--trust-remote-code"
]
# Only add max-model-len if explicitly specified (let vLLM auto-detect otherwise)
diff --git a/community/ai-vws-sizing-advisor/src/calculator.py b/community/ai-vws-sizing-advisor/src/calculator.py
index 901010aaf..fcc0c6683 100644
--- a/community/ai-vws-sizing-advisor/src/calculator.py
+++ b/community/ai-vws-sizing-advisor/src/calculator.py
@@ -378,6 +378,8 @@ def _initialize_model_specs(self) -> List[ModelSpec]:
ModelSpec(name="Falcon-40B", params_billion=40, d_model=8192, n_layers=60),
ModelSpec(name="Falcon-180B", params_billion=180, d_model=14848, n_layers=80),
ModelSpec(name="Qwen-14B", params_billion=14, d_model=5120, n_layers=40),
+ # NVIDIA Nemotron model - 30B parameters
+ ModelSpec(name="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", params_billion=30, d_model=8192, n_layers=48),
]
# Try to dynamically fetch popular models from HuggingFace
@@ -566,11 +568,25 @@ def get_available_gpus(self) -> List[str]:
def _find_model(self, model_name: str) -> Optional[ModelSpec]:
"""Find model specification by name, fetching from HuggingFace if not found"""
- # First, try to find in existing specs
+ # First, try exact match in existing specs
for model in self.model_specs:
if model.name == model_name or model.name.lower() == model_name.lower():
return model
+ # Try partial match for common patterns
+ lower_name = model_name.lower()
+ for model in self.model_specs:
+ lower_model_name = model.name.lower()
+ # Check if model name appears in query OR query appears in model name
+ # e.g., "nemotron-30b-fp8" in "nvidia/nvidia-nemotron-3-nano-30b-a3b-fp8" OR vice versa
+ if lower_model_name in lower_name or lower_name in lower_model_name:
+ logging.info(f"Partial match found: '{model.name}' matches '{model_name}'")
+ return model
+ # Check for Nemotron patterns specifically (handles "nemotron-30b-fp8" -> "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8")
+ if 'nemotron' in lower_name and 'nemotron' in lower_model_name:
+ logging.info(f"Nemotron match found: '{model.name}' for '{model_name}'")
+ return model
+
# If not found, try to create it dynamically from HuggingFace
logging.info(f"Model '{model_name}' not in cache, attempting to fetch from HuggingFace")
@@ -594,7 +610,32 @@ def _find_model(self, model_name: str) -> Optional[ModelSpec]:
except Exception as e:
logging.warning(f"Could not extract model from '{model_name}': {e}")
- return None
+ # Final fallback: Try to create a model spec from the name using parameter extraction
+ # This handles cases like "Llama-3-70B-Custom" where we can extract "70B"
+ params_billion = extract_model_params_from_name(model_name)
+ if params_billion:
+ logging.info(f"Creating dynamic model spec for '{model_name}' with {params_billion}B params")
+ estimated = estimate_model_spec_from_params(params_billion, model_name)
+ fallback_spec = ModelSpec(
+ name=model_name,
+ params_billion=params_billion,
+ n_layers=estimated.get('n_layers', 32),
+ d_model=estimated.get('d_model', 4096),
+ max_context_length=32768
+ )
+ self.model_specs.append(fallback_spec)
+ return fallback_spec
+
+ # Absolute last resort: Use a default 8B model spec
+ logging.warning(f"Could not determine model specs for '{model_name}', using default 8B model")
+ default_spec = ModelSpec(
+ name=model_name,
+ params_billion=8.0,
+ n_layers=32,
+ d_model=4096,
+ max_context_length=32768
+ )
+ return default_spec
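A condensed, self-contained sketch of the lookup order added above (illustrative only; `find_model` and the `KNOWN` list are simplified stand-ins, while the real code works with `ModelSpec` objects, `extract_model_params_from_name`, and a HuggingFace fetch):

```python
import re

KNOWN = ["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", "Llama-3-70B", "Llama-3-8B"]

def find_model(name: str) -> str:
    low = name.lower()
    for spec in KNOWN:                        # 1) exact match
        if spec.lower() == low:
            return spec
    for spec in KNOWN:                        # 2) partial / Nemotron-specific match
        s = spec.lower()
        if s in low or low in s or ("nemotron" in low and "nemotron" in s):
            return spec
    m = re.search(r"(\d+(?:\.\d+)?)\s*b\b", low)   # 3) estimate a spec from the size in the name
    if m:
        return f"{name} (estimated {m.group(1)}B params)"
    return f"{name} (default 8B spec)"             # 4) last-resort default

print(find_model("nemotron-30b-fp8"))        # -> the Nemotron spec, via the partial match
print(find_model("MyOrg/Custom-Model-13B"))  # -> size-based estimate (13B)
```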
def _find_gpu(self, gpu_name: str) -> Optional[GPUSpec]:
"""Find GPU specification by name"""
@@ -649,65 +690,57 @@ def _get_available_profiles(self, gpu_family: str) -> List[int]:
def _recommend_vgpu_profile(self, total_memory_needed: float, gpu_family: str,
safety_buffer_gb: float = 0.0) -> Dict[str, Any]:
"""
- Recommend vGPU profile based on total memory needed with 5% headroom.
+ Recommend vGPU profile based on total memory needed with 5% headroom reserve.
- CRITICAL RULE: Pick the SMALLEST profile where (profile × 0.95) >= total_memory_needed
- This reserves 5% headroom to avoid running at 100% capacity.
+ CRITICAL RULE: Use vGPU profiles ONLY when workload fits in a SINGLE profile.
+ If workload exceeds max single profile capacity, use GPU passthrough.
Logic:
- - If total > (max_profile × 0.95): recommend passthrough
- - Otherwise: find smallest profile where (profile × 0.95) >= total_memory_needed
+ 1. If workload fits in single profile (workload ≤ profile × 0.95): use smallest fitting profile
+ 2. If workload > max_profile × 0.95: recommend passthrough with N GPUs
"""
+ import math
physical_memory = self._get_physical_gpu_memory(gpu_family)
available_profiles = self._get_available_profiles(gpu_family)
# Get max profile for this GPU family
max_profile = max(available_profiles) if available_profiles else physical_memory
- max_profile_usable = max_profile * 0.95 # 95% usable capacity
-
- # Check if we need passthrough (exceeds max vGPU profile with 5% headroom)
- if total_memory_needed > max_profile_usable:
- # Calculate GPUs needed for passthrough
- # Even with passthrough, reserve ~5% for driver/OS overhead to avoid running at 100%
- import math
- usable_per_gpu = physical_memory * 0.95 # 95% usable capacity per GPU
- num_gpus_needed = math.ceil(total_memory_needed / usable_per_gpu)
- return {
- "type": "passthrough",
- "profile": None,
- "gpu_count": num_gpus_needed,
- "profile_memory_gb": physical_memory,
- "total_memory_available": physical_memory * num_gpus_needed,
- "recommendation": f"{num_gpus_needed}x {gpu_family} passthrough (no vGPU profile)",
- "reason": f"Workload requires {total_memory_needed:.1f}GB but max vGPU profile usable capacity is {max_profile_usable:.1f}GB ({max_profile}GB × 0.95). GPU passthrough provides ~95% usable capacity ({usable_per_gpu:.1f}GB per {physical_memory}GB GPU)."
- }
+ max_profile_usable = max_profile * 0.95 # 95% usable capacity (5% reserved)
+ usable_physical = physical_memory * 0.95 # 95% usable physical GPU memory
- # Find smallest profile where (profile × 0.95) >= total_memory_needed
+ # Try to find smallest single profile that fits
recommended_profile = None
-
for profile_size in sorted(available_profiles):
- usable_capacity = profile_size * 0.95 # 5% headroom
+ usable_capacity = profile_size * 0.95 # 5% headroom reserved
if usable_capacity >= total_memory_needed:
recommended_profile = profile_size
break
- # If no profile found (shouldn't happen), use largest
- if recommended_profile is None:
- recommended_profile = max(available_profiles)
- warning = f"Warning: No profile with enough capacity for {total_memory_needed:.1f}GB, using largest ({recommended_profile}GB)"
- else:
- warning = None
+ # If single profile found, use it
+ if recommended_profile is not None:
+ usable_capacity = recommended_profile * 0.95
+ return {
+ "type": "vgpu",
+ "profile": f"{gpu_family}-{recommended_profile}Q",
+ "gpu_count": 1,
+ "profile_memory_gb": recommended_profile,
+ "total_memory_available": recommended_profile,
+ "recommendation": f"1x {gpu_family}-{recommended_profile}Q vGPU profile",
+ "reason": f"Workload needs {total_memory_needed:.1f}GB, selected profile: {recommended_profile}GB (usable: {usable_capacity:.1f}GB with 5% reserved for system overhead)",
+ "warning": None
+ }
- usable_capacity = recommended_profile * 0.95
+ # No single profile fits - use GPU passthrough
+ # Calculate GPUs needed based on physical GPU memory
+ num_gpus_needed = math.ceil(total_memory_needed / usable_physical)
return {
- "type": "vgpu",
- "profile": f"{gpu_family}-{recommended_profile}Q",
- "gpu_count": 1,
- "profile_memory_gb": recommended_profile,
- "total_memory_available": recommended_profile,
- "recommendation": f"1x {gpu_family}-{recommended_profile}Q vGPU profile",
- "reason": f"Workload needs {total_memory_needed:.1f}GB, selected profile: {recommended_profile}GB (usable: {usable_capacity:.1f}GB with 5% headroom)",
- "warning": warning
+ "type": "passthrough",
+ "profile": None,
+ "gpu_count": num_gpus_needed,
+ "profile_memory_gb": physical_memory,
+ "total_memory_available": physical_memory * num_gpus_needed,
+ "recommendation": f"{num_gpus_needed}x {gpu_family} GPU passthrough",
+ "reason": f"Workload requires {total_memory_needed:.1f}GB which exceeds max vGPU profile capacity ({max_profile_usable:.1f}GB usable from {max_profile}GB profile). GPU passthrough with {num_gpus_needed} GPUs provides {physical_memory * num_gpus_needed}GB total capacity."
}
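The new rule condenses to "smallest single profile that fits with 5% headroom, otherwise passthrough". A minimal Python sketch with two worked cases (`recommend` is a hypothetical name; the L40S ladder matches the fallback table used later in chains.py):

```python
import math

def recommend(total_gb: float, profiles: list[int], physical_gb: int) -> str:
    """Smallest single profile whose 95% usable capacity fits, else passthrough sized at 95% of physical memory."""
    for p in sorted(profiles):
        if p * 0.95 >= total_gb:
            return f"1x {p}Q vGPU profile ({p * 0.95:.1f} GB usable)"
    gpus = math.ceil(total_gb / (physical_gb * 0.95))
    return f"{gpus}x passthrough ({physical_gb * gpus} GB total)"

# L40S family: profiles 8/12/24/48, 48 GB physical memory
print(recommend(30.0, [8, 12, 24, 48], 48))   # 1x 48Q vGPU profile (45.6 GB usable)
print(recommend(120.0, [8, 12, 24, 48], 48))  # 3x passthrough (144 GB total)
```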
@@ -892,9 +925,11 @@ def calculate(self, request: VGPURequest) -> VGPUResult:
config = request.advanced_config if request.advanced_config else AdvancedCalculatorConfig()
# Find model
+ logging.info(f"Looking up model: '{request.model_name}'")
model = self._find_model(request.model_name)
if not model:
raise ValueError(f"Model '{request.model_name}' not found. Available: {self.get_available_models()}")
+ logging.info(f"Found model: '{model.name}' with {model.params_billion}B params")
# Get GPU family from vgpu_profile
gpu = self._find_gpu(request.vgpu_profile)
diff --git a/community/ai-vws-sizing-advisor/src/chains.py b/community/ai-vws-sizing-advisor/src/chains.py
index 559c76837..afc2737b0 100644
--- a/community/ai-vws-sizing-advisor/src/chains.py
+++ b/community/ai-vws-sizing-advisor/src/chains.py
@@ -47,7 +47,7 @@
from .utils import streaming_filter_think, get_streaming_filter_think_parser
from .reflection import ReflectionCounter, check_context_relevance, check_response_groundedness
from .utils import normalize_relevance_scores
-from .apply_configuration import model_extractor, GENERAL_FALLBACK_MODEL
+from .apply_configuration import model_extractor
# Import enhanced components
try:
@@ -62,7 +62,7 @@
"Llama-3-8B", "Llama-3-70B", "Llama-3.1-8B", "Llama-3.1-70B",
"Mistral-7B", "Falcon-7B", "Falcon-40B", "Falcon-180B", "Qwen-14B"
]
-VALID_PRECISIONS = ["fp16", "int8", "INT-8", "int-8", "FP16", "FP-16"]
+VALID_PRECISIONS = ["fp16", "fp8", "int8", "INT-8", "int-8", "FP16", "FP-16", "FP8", "FP-8", "fp4", "FP4"]
def extract_embedded_config(query: str) -> dict:
"""Extract structured config from HTML comment in query (from WorkloadConfigWizard)."""
@@ -139,15 +139,18 @@ def parse_vgpu_query(query: str) -> dict:
if user_match:
result["Concurrent Users"] = int(user_match.group(1))
- # 4) Precision
- prec_match = re.search(r"\b(fp16|int8|INT-8)\b", query, re.IGNORECASE)
+ # 4) Precision - support FP8, FP16, FP4, INT8
+ prec_match = re.search(r"\b(fp16|fp8|fp4|int8|INT-8|FP-8)\b", query, re.IGNORECASE)
if prec_match:
- precision = prec_match.group(1).lower()
- if precision in VALID_PRECISIONS:
- if precision == "int-8" or precision == "INT-8":
- result["Precision"] = "int8"
- elif precision == "fp16" or precision == "FP16" or precision == "FP-16":
- result["Precision"] = "fp16"
+ precision = prec_match.group(1).lower().replace("-", "")
+ if precision in ["fp8", "fp-8"]:
+ result["Precision"] = "fp8"
+ elif precision in ["fp16", "fp-16"]:
+ result["Precision"] = "fp16"
+ elif precision in ["fp4", "fp-4"]:
+ result["Precision"] = "fp4"
+ elif precision in ["int8", "int-8"]:
+ result["Precision"] = "int8"
else:
result["Precision"] = None
@@ -167,7 +170,7 @@ def parse_vgpu_query(query: str) -> dict:
# 5) Default precision if not specified
if not result["Precision"]:
- result["Precision"] = "fp16"
+ result["Precision"] = "fp8" # Default to FP8 for modern inference
if not result["Model"]:
result["Model"] = "Llama-3-8B"
if not result["Concurrent Users"]:
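The precision branch above normalizes the common spellings and defaults to FP8; a small sketch of the same behavior (`parse_precision` is a hypothetical helper, not in the module):

```python
import re

def parse_precision(query: str) -> str:
    """Normalise FP8/FP16/FP4/INT8 spellings; default to fp8 when precision is not mentioned."""
    m = re.search(r"\b(fp16|fp8|fp4|int8|INT-8|FP-8)\b", query, re.IGNORECASE)
    if not m:
        return "fp8"
    return m.group(1).lower().replace("-", "")

print(parse_precision("Llama-3.1-8B with FP-8 precision"))  # fp8
print(parse_precision("30 concurrent users, int8"))         # int8
print(parse_precision("no precision mentioned"))            # fp8 (new default)
```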
@@ -188,7 +191,7 @@ class StructuredResponse(BaseModel):
description="Function title for vGPU configuration generation"
)
description: str = Field(
- description="Brief summary of the recommended configuration (1-2 sentences max)"
+ description="Brief summary including: GPU family, vGPU profile, workload type (RAG/Inference), model name, AND precision (FP8/FP16/FP4). Example: 'BSE with vGPU profile BSE-48Q for RAG (Nemotron-30B) with FP8 precision'"
)
parameters: Dict[str, Any] = Field(
description="vGPU configuration parameters"
@@ -446,11 +449,20 @@ def stream_structured_response():
try:
# Extract embedded config if present (from WorkloadConfigWizard)
embedded_config = extract_embedded_config(query)
+ logger.info(f"[LLM_CHAIN DEBUG] Extracted embedded_config: {embedded_config}")
+ logger.info(f"[LLM_CHAIN DEBUG] modelTag from config: {embedded_config.get('modelTag') if embedded_config else 'NO CONFIG'}")
- # Try to get model name from various sources
- model_name = (corrected_params.get("model_tag") or
- (embedded_config.get('modelTag') if embedded_config else None) or
- (embedded_config.get('specificModel') if embedded_config else None))
+ # PRIORITY: Get model from embedded_config FIRST (wizard selection)
+ # This ensures we use Nemotron when user selects it, not the LLM's guess
+ model_name = None
+ if embedded_config:
+ model_name = embedded_config.get('modelTag') or embedded_config.get('specificModel')
+ if model_name:
+ logger.info(f"Using model from embedded config in llm_chain: {model_name}")
+
+ # Only fall back to LLM params if embedded config didn't have it
+ if not model_name:
+ model_name = corrected_params.get("model_tag")
# ALWAYS call calculator if we have a model name (regardless of vgpu_profile)
# The calculator will determine the correct profile/passthrough based on workload
@@ -459,7 +471,7 @@ def stream_structured_response():
# Get configuration parameters
batch_size = int(embedded_config.get('batchSize', 1)) if embedded_config else 1
- precision = (embedded_config.get('precision', 'fp16') if embedded_config else 'fp16').lower()
+ precision = (embedded_config.get('precision', 'fp8') if embedded_config else 'fp8').lower()
prompt_size = int(embedded_config.get('promptSize', 1024)) if embedded_config else 1024
response_size = int(embedded_config.get('responseSize', 256)) if embedded_config else 256
@@ -478,9 +490,9 @@ def stream_structured_response():
if not gpu_model and corrected_params.get("vgpu_profile") and corrected_params["vgpu_profile"] not in [None, "null", ""]:
gpu_model = corrected_params["vgpu_profile"].split('-')[0]
- # Final fallback
+ # Final fallback (BSE is the wizard default)
if not gpu_model:
- gpu_model = "L40S"
+ gpu_model = "BSE"
logger.info(f"Using GPU model: {gpu_model} for calculator")
@@ -531,12 +543,125 @@ def stream_structured_response():
logger.info("Enhanced LLM response with calculator results: %s", corrected_params)
except Exception as e:
+ import math
logger.warning("Calculator enhancement failed in llm_chain: %s", e)
+ # Fallback: Calculate profile based on gpu_memory_size
+ # Use vGPU only if single profile fits, otherwise passthrough
+ gpu_memory_size = corrected_params.get("gpu_memory_size", 24)
+ if not gpu_model:
+ gpu_model = "BSE" # Default
+ available_profiles = {
+ 'BSE': [8, 12, 24, 48, 96],
+ 'L40S': [8, 12, 24, 48],
+ 'L40': [8, 12, 24, 48],
+ 'A40': [8, 12, 24, 48],
+ 'L4': [4, 8, 12, 24]
+ }
+ profiles = available_profiles.get(gpu_model, [8, 12, 24, 48])
+ physical_memory = {'BSE': 96, 'L40S': 48, 'L40': 48, 'A40': 48, 'L4': 24}.get(gpu_model, 48)
+
+ # Find smallest single profile that fits
+ selected_profile = None
+ for profile in sorted(profiles):
+ if profile * 0.95 >= gpu_memory_size:
+ selected_profile = profile
+ break
+
+ if selected_profile:
+ corrected_params["vgpu_profile"] = f"{gpu_model}-{selected_profile}Q"
+ corrected_params["gpu_count"] = 1
+ else:
+ # No single profile fits - use passthrough
+ corrected_params["vgpu_profile"] = None
+ corrected_params["gpu_count"] = math.ceil(gpu_memory_size / (physical_memory * 0.95))
+ corrected_params["gpu_model"] = f"{gpu_model} (passthrough)"
+ logger.info(f"Fallback profile: {corrected_params['vgpu_profile']} x{corrected_params.get('gpu_count', 1)}")
+
+ # Ensure embedded_config is available for final processing
+ if 'embedded_config' not in dir() or embedded_config is None:
+ embedded_config = extract_embedded_config(query)
+
+ # Add rag_breakdown fallback for RAG workloads if not already present
+ workload_type = embedded_config.get('workloadType', 'inference') if embedded_config else 'inference'
+ if workload_type == 'rag' and "rag_breakdown" not in corrected_params and embedded_config:
+ rag_breakdown = {"workload_type": "rag"}
+
+ embedding_model = embedded_config.get('embeddingModel')
+ vector_db_vectors = embedded_config.get('numberOfVectors')
+ vector_db_dimension = embedded_config.get('vectorDimension')
+
+ if vector_db_vectors:
+ vector_db_vectors = int(vector_db_vectors) if isinstance(vector_db_vectors, str) else vector_db_vectors
+ if vector_db_dimension:
+ vector_db_dimension = int(vector_db_dimension) if isinstance(vector_db_dimension, str) else vector_db_dimension
+
+ if embedding_model:
+ rag_breakdown["embedding_model"] = embedding_model
+ embedding_model_lower = embedding_model.lower()
+ if 'large' in embedding_model_lower or '1b' in embedding_model_lower:
+ embedding_mem = 2.0
+ elif 'base' in embedding_model_lower or '110m' in embedding_model_lower:
+ embedding_mem = 0.5
+ elif 'small' in embedding_model_lower:
+ embedding_mem = 0.25
+ else:
+ embedding_mem = 1.0
+ rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB"
+
+ if vector_db_vectors and vector_db_dimension:
+ rag_breakdown["vector_db_vectors"] = vector_db_vectors
+ rag_breakdown["vector_db_dimension"] = vector_db_dimension
+ vector_mem_bytes = vector_db_vectors * vector_db_dimension * 4 * 1.5
+ vector_mem_gb = vector_mem_bytes / (1024**3)
+ if vector_mem_gb < 0.1:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB"
+ else:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB"
+
+ corrected_params["rag_breakdown"] = rag_breakdown
+ logger.info("Added rag_breakdown fallback in llm_chain: %s", rag_breakdown)
+
+ # CRITICAL: Use modelTag from embedded_config for the final response
+ final_model_tag = None
+ if embedded_config and embedded_config.get('modelTag'):
+ final_model_tag = embedded_config.get('modelTag')
+ logger.info(f"Using modelTag from embedded config for llm_chain final: {final_model_tag}")
+ if not final_model_tag:
+ # FALLBACK: Extract model from query text (e.g. "running nvidia/model-name")
+ import re
+ query_model_match = re.search(r'running\s+([\w\-/\.]+/[\w\-\.]+)', query, re.IGNORECASE)
+ if query_model_match:
+ final_model_tag = query_model_match.group(1)
+ logger.info(f"Extracted model from query text for llm_chain: {final_model_tag}")
+ else:
+ final_model_tag = corrected_params.get("model_tag") or "Unknown"
+
+ # Update corrected_params to ensure JSON has the correct model_tag
+ corrected_params["model_tag"] = final_model_tag
+
+ # Add precision from embedded config (default to FP8 - wizard default)
+ if embedded_config and embedded_config.get('precision'):
+ corrected_params["precision"] = embedded_config.get('precision').upper()
+ else:
+ corrected_params["precision"] = "FP8" # Default if not specified (matches wizard default)
+
+ # Get GPU model from embedded config or profile (default to BSE - wizard default)
+ final_gpu_model = "BSE"
+ if embedded_config and embedded_config.get('selectedGPU'):
+ final_gpu_model = embedded_config.get('selectedGPU')
+ elif corrected_params.get("vgpu_profile"):
+ final_gpu_model = corrected_params["vgpu_profile"].split('-')[0]
+
+ # Reconstruct description with correct model name and precision
+ final_profile = corrected_params.get("vgpu_profile", "Unknown")
+ final_precision = corrected_params.get("precision", "FP8")
+ final_model_name = final_model_tag.split('/')[-1] if '/' in final_model_tag else final_model_tag
+ corrected_description = f"{final_gpu_model} with vGPU profile {final_profile} for inference of {final_model_name} ({final_precision})"
# Build the final response with corrected field names
final_response = {
"title": json_data.get("title", "generate_vgpu_config"),
- "description": json_data.get("description", ""),
+ "description": corrected_description,
"parameters": corrected_params
}
@@ -723,7 +848,55 @@ def rag_chain( # pylint: disable=arguments-differ
# Log for debugging
logger.info(f"Final structured response after reflection: {structured_final.description[:200]}...")
- return iter([json.dumps(structured_final.model_dump(), ensure_ascii=False, indent=2)]), context_to_show
+ # Enhance response with rag_breakdown for RAG workloads
+ final_response = structured_final.model_dump()
+ embedded_config = extract_embedded_config(query)
+ workload_type = embedded_config.get('workloadType', 'inference') if embedded_config else 'inference'
+
+ if workload_type == 'rag' and embedded_config:
+ params = final_response.get("parameters", {})
+ if "rag_breakdown" not in params:
+ rag_breakdown = {"workload_type": "rag"}
+
+ # Extract RAG config from embedded config
+ embedding_model = embedded_config.get('embeddingModel')
+ vector_db_vectors = embedded_config.get('numberOfVectors')
+ vector_db_dimension = embedded_config.get('vectorDimension')
+
+ if vector_db_vectors:
+ vector_db_vectors = int(vector_db_vectors) if isinstance(vector_db_vectors, str) else vector_db_vectors
+ if vector_db_dimension:
+ vector_db_dimension = int(vector_db_dimension) if isinstance(vector_db_dimension, str) else vector_db_dimension
+
+ if embedding_model:
+ rag_breakdown["embedding_model"] = embedding_model
+ # Calculate embedding memory based on model size
+ embedding_model_lower = embedding_model.lower()
+ if 'large' in embedding_model_lower or '1b' in embedding_model_lower:
+ embedding_mem = 2.0
+ elif 'base' in embedding_model_lower or '110m' in embedding_model_lower:
+ embedding_mem = 0.5
+ elif 'small' in embedding_model_lower:
+ embedding_mem = 0.25
+ else:
+ embedding_mem = 1.0
+ rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB"
+
+ if vector_db_vectors and vector_db_dimension:
+ rag_breakdown["vector_db_vectors"] = vector_db_vectors
+ rag_breakdown["vector_db_dimension"] = vector_db_dimension
+ vector_mem_bytes = vector_db_vectors * vector_db_dimension * 4 * 1.5
+ vector_mem_gb = vector_mem_bytes / (1024**3)
+ if vector_mem_gb < 0.1:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB"
+ else:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB"
+
+ params["rag_breakdown"] = rag_breakdown
+ final_response["parameters"] = params
+ logger.info("Added rag_breakdown to reflection response: %s", rag_breakdown)
+
+ return iter([json.dumps(final_response, ensure_ascii=False, indent=2)]), context_to_show
else:
def stream_structured_rag_response():
try:
@@ -740,10 +913,23 @@ def stream_structured_rag_response():
# Extract GPU info and model info from wherever the LLM put it
vgpu_profile = params.get("vgpu_profile") or ""
- model_name = params.get("model_name") or params.get("model")
- # Extract embedded config to get the actual GPU model selected by user
+ # Extract embedded config to get the actual GPU model and LLM model selected by user
embedded_config = extract_embedded_config(query)
+ logger.info(f"[RAG DEBUG] Extracted embedded_config: {embedded_config}")
+ logger.info(f"[RAG DEBUG] modelTag from config: {embedded_config.get('modelTag') if embedded_config else 'NO CONFIG'}")
+
+ # PRIORITY: Get model from embedded config (wizard selection) FIRST
+ # This ensures we use Nemotron when the user selects it, rather than falling back to the LLM's guess
+ model_name = None
+ if embedded_config:
+ model_name = embedded_config.get('modelTag') or embedded_config.get('specificModel')
+ if model_name:
+ logger.info(f"Using model from embedded config: {model_name}")
+
+ # Fallback to LLM params only if embedded config didn't have it
+ if not model_name:
+ model_name = params.get("model_name") or params.get("model")
# Extract GPU model from embedded config first (most reliable)
gpu_model = None
@@ -762,38 +948,73 @@ def stream_structured_rag_response():
if not gpu_model and vgpu_profile and vgpu_profile not in [None, "null", ""]:
gpu_model = vgpu_profile.split('-')[0]
- # Final fallback
+ # Final fallback (BSE is the wizard default)
if not gpu_model:
- gpu_model = "L40S"
+ gpu_model = "BSE"
logger.info(f"Using GPU model: {gpu_model} for RAG chain")
+
+ # Initialize workload with default value
+ workload = "RAG" # Default to RAG for RAG chain queries
+ precision = None
+ prompt_size = None
+ response_size = None
# Try to extract from description if not in parameters
if not model_name:
payload = parse_vgpu_query(query)
model_name = model_name or payload.get("Model")
- precision = payload.get("Precision", "fp16").lower()
- workload = payload.get("Workload") or payload.get("workload")
+ precision = payload.get("Precision", "fp8").lower()
+ workload = payload.get("Workload") or payload.get("workload") or "RAG"
prompt_size = payload.get("Prompt Size")
response_size = payload.get("Response Size")
-
logger.info("Extracted model name: %s, precision: %s, workload: %s, prompt size: %s, response size: %s", model_name, precision, workload, prompt_size, response_size)
+ else:
+ # Even if model_name exists, try to extract workload from query
+ payload = parse_vgpu_query(query)
+ workload = payload.get("Workload") or payload.get("workload") or "RAG"
+ prompt_size = payload.get("Prompt Size")
+ response_size = payload.get("Response Size")
# Build properly structured parameters with correct field names
+ # PRIORITY: Use the modelTag from embedded_config directly if available
+ # This is the AUTHORITATIVE source - user selected this in the wizard
model_tag = None
- if model_name:
- # Check if model_name is already a HuggingFace model tag (contains "/")
- if "/" in model_name:
- # Use the full HF model tag directly
- model_tag = model_name
- logger.info(f"Using HuggingFace model tag directly: {model_tag}")
- else:
- # Use the dynamic model extractor for simplified names
- model_tag = model_extractor.extract(model_name)
- # If no match found, use general fallback model
- if not model_tag:
- logger.info(f"No exact match for model '{model_name}', using fallback: {GENERAL_FALLBACK_MODEL}")
- model_tag = GENERAL_FALLBACK_MODEL
-
+ if embedded_config and embedded_config.get('modelTag'):
+ # Embedded config has the full HuggingFace model tag from wizard - USE THIS
+ model_tag = embedded_config.get('modelTag')
+ logger.info(f"Using modelTag from embedded config (authoritative): {model_tag}")
+ else:
+ # FALLBACK: Extract model from query text (e.g. "running nvidia/model-name")
+ # This handles cases where embedded config isn't sent
+ import re
+ query_model_match = re.search(r'running\s+([\w\-/\.]+/[\w\-\.]+)', query, re.IGNORECASE)
+ if query_model_match:
+ model_tag = query_model_match.group(1)
+ logger.info(f"Extracted model from query text: {model_tag}")
+ elif model_name:
+ # Fallback to model_name extraction only if no embedded config
+ if "/" in model_name:
+ model_tag = model_name
+ logger.info(f"Using HuggingFace model tag directly: {model_tag}")
+ else:
+ model_tag = model_extractor.extract(model_name)
+ if not model_tag:
+ # No fallback to hardcoded model - use what was provided
+ logger.warning(f"No match for model '{model_name}', keeping as-is")
+ model_tag = model_name # Use the provided name, don't substitute
+
+ # CRITICAL: ALWAYS update model_name with extracted model_tag for VGPURequest
+ # The model_tag from query/embedded_config is authoritative over params defaults
+ if model_tag:
+ model_name = model_tag
+ logger.info(f"Using model_tag for calculator: {model_name}")
+
+ # Get precision from embedded config (default to fp8 which is the wizard default)
+ precision_from_config = (embedded_config.get('precision', 'fp8') if embedded_config else precision or 'fp8').lower()
+
+ # Get prompt/response sizes from embedded config first
+ prompt_size_from_config = int(embedded_config.get('promptSize', 1024)) if embedded_config else (prompt_size or 1024)
+ response_size_from_config = int(embedded_config.get('responseSize', 256)) if embedded_config else (response_size or 256)
+
corrected_params = {
"vgpu_profile": params.get("vgpu_profile"),
"vcpu_count": ((params.get("system_RAM") or 96) // 4),
@@ -804,6 +1025,10 @@ def stream_structured_rag_response():
"time_to_first_token": None,
"throughput": None,
"model_tag": model_tag,
+ # Add precision and prompt/response sizes
+ "precision": precision_from_config.upper(),
+ "prompt_size": prompt_size_from_config,
+ "response_size": response_size_from_config,
}
@@ -839,10 +1064,17 @@ def stream_structured_rag_response():
logger.info(f"Using batch size (concurrent requests): {batch_size}")
# Extract RAG-specific parameters if workload type is RAG
- # First try embedded config, then fall back to extracted workload from LLM response
+ # First try embedded config, then detect from query text, then fall back to LLM response
workload_type = embedded_config.get('workloadType', 'inference') if embedded_config else 'inference'
+
+ # ROBUST RAG DETECTION: Check query text directly for RAG indicators
+ is_rag_query = ('RAG' in query or 'Retrieval-Augmented' in query or
+ 'embedding model' in query.lower() or 'vector' in query.lower())
+ if is_rag_query:
+ workload_type = 'rag'
+ logger.info(f"Workload type set to 'rag' based on query text analysis")
# If embedded config says inference but LLM extracted "RAG", use that instead
- if workload_type.lower() == 'inference' and workload and 'rag' in workload.lower():
+ elif workload_type.lower() == 'inference' and workload and 'rag' in workload.lower():
workload_type = 'rag'
logger.info(f"Workload type set to 'rag' based on LLM extraction: {workload}")
else:
@@ -855,8 +1087,13 @@ def stream_structured_rag_response():
# Try to get from embedded config first
if embedded_config:
embedding_model = embedded_config.get('embeddingModel')
- vector_db_vectors = embedded_config.get('numberOfVectors')
- vector_db_dimension = embedded_config.get('vectorDimension')
+ # Convert to integers if present (they come as strings from JSON)
+ num_vectors = embedded_config.get('numberOfVectors')
+ vec_dim = embedded_config.get('vectorDimension')
+ if num_vectors:
+ vector_db_vectors = int(num_vectors) if isinstance(num_vectors, str) else num_vectors
+ if vec_dim:
+ vector_db_dimension = int(vec_dim) if isinstance(vec_dim, str) else vec_dim
# If not in embedded config, try to extract from query text
if not embedding_model:
@@ -967,12 +1204,136 @@ def stream_structured_rag_response():
logger.info("Enhanced with calculator results: %s", corrected_params)
except Exception as e:
+ import math
+ import traceback
logger.warning("Calculator enhancement failed: %s", e)
+ logger.warning("Traceback: %s", traceback.format_exc())
+ # Fallback: Calculate profile based on gpu_memory_size even if calculator fails
+ gpu_memory_size = corrected_params.get("gpu_memory_size", 24)
+ # Profile selection: Pick smallest profile where profile × 0.95 >= workload
+ # If no single profile fits, use passthrough
+ available_profiles = {
+ 'BSE': [8, 12, 24, 48, 96],
+ 'L40S': [8, 12, 24, 48],
+ 'L40': [8, 12, 24, 48],
+ 'A40': [8, 12, 24, 48],
+ 'L4': [4, 8, 12, 24]
+ }
+ profiles = available_profiles.get(gpu_model, [8, 12, 24, 48])
+ physical_memory = {'BSE': 96, 'L40S': 48, 'L40': 48, 'A40': 48, 'L4': 24}.get(gpu_model, 48)
+
+ # Find smallest single profile where profile × 0.95 >= workload
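+ # e.g. 10 GB needed on L4: 8×0.95=7.6 < 10 but 12×0.95=11.4 >= 10 → L4-12Q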
+ selected_profile = None
+ for profile in sorted(profiles):
+ if profile * 0.95 >= gpu_memory_size:
+ selected_profile = profile
+ break
+
+ if selected_profile:
+ # Single vGPU profile fits
+ corrected_params["vgpu_profile"] = f"{gpu_model}-{selected_profile}Q"
+ corrected_params["gpu_count"] = 1
+ logger.info(f"Fallback: Using {corrected_params['vgpu_profile']}")
+ else:
+ # No single profile fits - use passthrough
+ corrected_params["vgpu_profile"] = None
+ corrected_params["gpu_count"] = math.ceil(gpu_memory_size / (physical_memory * 0.95))
+ corrected_params["gpu_model"] = f"{gpu_model} (passthrough)"
+ logger.info(f"Fallback: Using passthrough with {corrected_params['gpu_count']}x {gpu_model}")
+
+ # Add RAG-specific fields to the response if this is a RAG workload
+ if workload_type == 'rag':
+ # Add top-level RAG fields
+ if embedding_model:
+ corrected_params["embedding_model"] = embedding_model
+ if vector_db_vectors:
+ corrected_params["vector_db_vectors"] = vector_db_vectors
+ if vector_db_dimension:
+ corrected_params["vector_db_dimension"] = vector_db_dimension
+
+ # Build rag_breakdown if not already present (from calculator)
+ if "rag_breakdown" not in corrected_params:
+ rag_breakdown = {"workload_type": "rag"}
+ if embedding_model:
+ rag_breakdown["embedding_model"] = embedding_model
+ # Calculate embedding memory based on model size (approximate)
+ # Common embedding models and their approximate sizes:
+ embedding_model_lower = embedding_model.lower()
+ if 'large' in embedding_model_lower or '1b' in embedding_model_lower:
+ embedding_mem = 2.0 # ~1B params at FP16
+ elif 'base' in embedding_model_lower or '110m' in embedding_model_lower:
+ embedding_mem = 0.5 # ~110M params at FP16
+ elif 'small' in embedding_model_lower:
+ embedding_mem = 0.25 # ~33M params at FP16
+ else:
+ embedding_mem = 1.0 # Default estimate
+ rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB"
+
+ if vector_db_vectors and vector_db_dimension:
+ rag_breakdown["vector_db_vectors"] = vector_db_vectors
+ rag_breakdown["vector_db_dimension"] = vector_db_dimension
+ # Calculate vector DB memory: vectors * dimension * 4 bytes (float32) + 50% overhead for index
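+ # e.g. 1,000,000 vectors × 768 dims: 1e6 × 768 × 4 × 1.5 bytes ≈ 4.29 GB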
+ vector_mem_bytes = vector_db_vectors * vector_db_dimension * 4 * 1.5
+ vector_mem_gb = vector_mem_bytes / (1024**3)
+ if vector_mem_gb < 0.1:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB"
+ else:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB"
+ elif vector_db_vectors:
+ rag_breakdown["vector_db_vectors"] = vector_db_vectors
+ elif vector_db_dimension:
+ rag_breakdown["vector_db_dimension"] = vector_db_dimension
+
+ # Add prompt/response size info
+ rag_breakdown["prompt_size"] = prompt_size_from_config
+ rag_breakdown["response_size"] = response_size_from_config
+ corrected_params["rag_breakdown"] = rag_breakdown
+ logger.info("Built RAG breakdown manually: %s", rag_breakdown)
+ else:
+ logger.info("Using rag_breakdown from calculator: %s", corrected_params["rag_breakdown"])
+
+ # Reconstruct description with correct format: GPU family, profile, workload, model, precision
+ final_profile = corrected_params.get("vgpu_profile", f"{gpu_model}-12Q")
+ final_precision = corrected_params.get("precision", precision_from_config.upper())
+
+ # CRITICAL: Get model_tag from embedded_config first (most reliable source)
+ # This ensures the JSON model_tag matches what the user selected in the wizard
+ final_model_tag = None
+ if embedded_config and embedded_config.get('modelTag'):
+ final_model_tag = embedded_config.get('modelTag')
+ logger.info(f"Using modelTag from embedded config for final response: {final_model_tag}")
+ if not final_model_tag:
+ final_model_tag = corrected_params.get("model_tag") or model_tag or "Unknown"
+
+ # Update corrected_params to ensure JSON has the correct model_tag
+ corrected_params["model_tag"] = final_model_tag
+
+ if workload_type == 'rag':
+ # Format: "L40S with vGPU profile L40S-48Q for RAG (model-name) with embedding-model (FP8)"
+ emb_model_name = embedding_model.split('/')[-1] if embedding_model else "embedding"
+ final_model_name = final_model_tag.split('/')[-1] if '/' in final_model_tag else final_model_tag
+ corrected_description = f"{gpu_model} with vGPU profile {final_profile} for RAG ({final_model_name}) with {emb_model_name} ({final_precision})"
+
+ # Add rag_config sub-object with RAG-specific configuration
+ rag_config = {
+ "workload_type": "rag",
+ "embedding_model": embedding_model,
+ "vector_dimension": vector_db_dimension,
+ "total_vectors": vector_db_vectors,
+ }
+ # Remove None values
+ rag_config = {k: v for k, v in rag_config.items() if v is not None}
+ if rag_config:
+ corrected_params["rag_config"] = rag_config
+ logger.info(f"Added rag_config to response: {rag_config}")
+ else:
+ # Format: "L40S with vGPU profile L40S-48Q for inference of model-name (FP8)"
+ final_model_name = final_model_tag.split('/')[-1] if '/' in final_model_tag else final_model_tag
+ corrected_description = f"{gpu_model} with vGPU profile {final_profile} for inference of {final_model_name} ({final_precision})"
# Build the final response with corrected field names
final_response = {
"title": json_data.get("title", "generate_vgpu_config"),
- "description": json_data.get("description", ""),
+ "description": corrected_description,
"parameters": corrected_params
}
@@ -1033,6 +1394,12 @@ def rag_chain_with_multiturn(self,
**kwargs) -> Generator[str, None, None]:
"""Execute a Retrieval Augmented Generation chain using the components defined above."""
+ # Check for conversational mode - return plain text instead of structured JSON
+ conversational_mode = kwargs.get("conversational_mode", False)
+ if conversational_mode:
+ logger.info("Using CONVERSATIONAL mode for chat query: %s", query[:100])
+ return self._conversational_chain(query, chat_history, reranker_top_k, vdb_top_k, collection_name, **kwargs)
+
# Determine if enhanced mode should be used
use_enhanced = self._should_use_enhanced_mode(query)
logger.info("Using %s multiturn RAG mode for query: %s", "enhanced" if use_enhanced else "standard", query)
@@ -1214,10 +1581,12 @@ def stream_structured_multiturn_response():
# Extract GPU info and model info from wherever the LLM put it
vgpu_profile = params.get("vgpu_profile") or ""
model_name = params.get("model_name") or params.get("model")
- precision = params.get("precision", "fp16").lower() if params.get("precision") else "fp16"
+ precision = params.get("precision", "fp8").lower() if params.get("precision") else "fp8"
# Extract embedded config to get the actual GPU model selected by user
embedded_config = extract_embedded_config(query)
+ logger.info(f"[MULTITURN DEBUG] Extracted embedded_config: {embedded_config}")
+ logger.info(f"[MULTITURN DEBUG] modelTag from config: {embedded_config.get('modelTag') if embedded_config else 'NO CONFIG'}")
# Extract GPU model from embedded config first (most reliable)
gpu_model = None
@@ -1236,18 +1605,60 @@ def stream_structured_multiturn_response():
if not gpu_model and vgpu_profile and vgpu_profile not in [None, "null", ""]:
gpu_model = vgpu_profile.split('-')[0]
- # Final fallback
+ # Final fallback (BSE is the wizard default)
if not gpu_model:
- gpu_model = "L40S"
+ gpu_model = "BSE"
logger.info(f"Using GPU model: {gpu_model} for multiturn RAG chain")
+ # Initialize workload with default value
+ workload = "RAG" # Default to RAG for multiturn queries
+
# Try to extract from description if not in parameters
if not model_name:
payload = parse_vgpu_query(json_data.get("description", ""))
model_name = model_name or payload.get("Model")
- precision = precision or payload.get("Precision", "fp16").lower()
+ precision = precision or payload.get("Precision", "fp8").lower()
workload = payload.get("Workload", "RAG")
+ else:
+ # Even if model_name exists, try to extract workload from description
+ payload = parse_vgpu_query(json_data.get("description", ""))
+ workload = payload.get("Workload", "RAG")
+
+ # PRIORITY: Use modelTag from embedded_config directly if available
+ # This is the AUTHORITATIVE source - user selected this in the wizard
+ model_tag = None
+ if embedded_config and embedded_config.get('modelTag'):
+ model_tag = embedded_config.get('modelTag')
+ logger.info(f"Using modelTag from embedded config for multiturn (authoritative): {model_tag}")
+ else:
+ # FALLBACK: Extract model from query text (e.g. "running nvidia/model-name")
+ # This handles cases where embedded config isn't sent
+ import re
+ query_model_match = re.search(r'running\s+([\w\-/\.]+/[\w\-\.]+)', query, re.IGNORECASE)
+ if query_model_match:
+ model_tag = query_model_match.group(1)
+ logger.info(f"Extracted model from query text: {model_tag}")
+ elif model_name:
+ # Fallback to model_name extraction only if no embedded config
+ if "/" in model_name:
+ model_tag = model_name
+ logger.info(f"Using HuggingFace model tag directly: {model_tag}")
+ else:
+ model_tag = model_extractor.extract(model_name)
+ if not model_tag:
+ # No fallback to hardcoded model - use what was provided
+ logger.warning(f"No match for model '{model_name}', keeping as-is")
+ model_tag = model_name # Use the provided name, don't substitute
+
+ # CRITICAL: ALWAYS update model_name with extracted model_tag for VGPURequest
+ # The model_tag from query/embedded_config is authoritative over params defaults
+ if model_tag:
+ model_name = model_tag
+ logger.info(f"Using model_tag for calculator: {model_name}")
+
+ # Get precision from embedded config (default to fp8 - wizard default)
+ precision_from_config = (embedded_config.get('precision', 'fp8') if embedded_config else precision or 'fp8').lower()
# Build properly structured parameters with correct field names
corrected_params = {
@@ -1259,7 +1670,8 @@ def stream_structured_multiturn_response():
"e2e_latency": None,
"time_to_first_token": None,
"throughput": None,
- "model_tag": model_tag
+ "model_tag": model_tag,
+ "precision": precision_from_config.upper()
}
# If we have model info and it's a workload we can calculate, enhance with calculator
@@ -1280,6 +1692,9 @@ def stream_structured_multiturn_response():
# Use vgpu_profile from calculator (not LLM) for accurate profile selection
corrected_params["vgpu_profile"] = calculation.resultant_configuration.vgpu_profile
corrected_params["max_kv_tokens"] = calculation.resultant_configuration.max_kv_tokens
+ # Use calculator's total_memory_gb directly - it already includes all components
+ # This replaces the LLM's estimate with the actual calculated value
+ corrected_params["gpu_memory_size"] = calculation.resultant_configuration.total_memory_gb
# Add GPU model name (especially useful for passthrough configurations)
corrected_params["gpu_model"] = calculation.resultant_configuration.gpu_name
# Add GPU count (especially useful for passthrough configurations)
@@ -1292,12 +1707,111 @@ def stream_structured_multiturn_response():
logger.info("Enhanced multiturn with calculator results: %s", corrected_params)
except Exception as e:
+ import math
logger.warning("Calculator enhancement failed in multiturn: %s", e)
+ # Fallback: Calculate profile based on gpu_memory_size
+ # Use vGPU only if single profile fits, otherwise passthrough
+ gpu_memory_size = corrected_params.get("gpu_memory_size", 24)
+ available_profiles = {
+ 'BSE': [8, 12, 24, 48, 96],
+ 'L40S': [8, 12, 24, 48],
+ 'L40': [8, 12, 24, 48],
+ 'A40': [8, 12, 24, 48],
+ 'L4': [4, 8, 12, 24]
+ }
+ profiles = available_profiles.get(gpu_model, [8, 12, 24, 48])
+ physical_memory = {'BSE': 96, 'L40S': 48, 'L40': 48, 'A40': 48, 'L4': 24}.get(gpu_model, 48)
+
+ # Find smallest single profile that fits
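+ # e.g. 40 GB needed on L40S: 24×0.95=22.8 < 40 but 48×0.95=45.6 >= 40 → L40S-48Q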
+ selected_profile = None
+ for profile in sorted(profiles):
+ if profile * 0.95 >= gpu_memory_size:
+ selected_profile = profile
+ break
+
+ if selected_profile:
+ corrected_params["vgpu_profile"] = f"{gpu_model}-{selected_profile}Q"
+ corrected_params["gpu_count"] = 1
+ else:
+ # No single profile fits - use passthrough
+ corrected_params["vgpu_profile"] = None
+ corrected_params["gpu_count"] = math.ceil(gpu_memory_size / (physical_memory * 0.95))
+ corrected_params["gpu_model"] = f"{gpu_model} (passthrough)"
+ logger.info(f"Fallback profile: {corrected_params['vgpu_profile']} x{corrected_params.get('gpu_count', 1)}")
+
+ # ========== Extract RAG Configuration from Query ==========
+ # Detect if this is a RAG workload from query text
+ is_rag_workload = 'RAG' in query or 'Retrieval-Augmented' in query or 'embedding model' in query.lower()
+
+ if is_rag_workload:
+ import re
+ rag_config = {}
+ rag_breakdown = {"workload_type": "rag"}
+
+ # Extract embedding model (e.g., "using embedding model nvidia/nvolveqa-embed-large-1B")
+ embedding_match = re.search(r'embedding model\s+([\w\-/\.]+)', query, re.IGNORECASE)
+ if embedding_match:
+ embedding_model = embedding_match.group(1)
+ rag_config["embedding_model"] = embedding_model
+ rag_breakdown["embedding_model"] = embedding_model
+ # Estimate embedding memory based on model name
+ embedding_model_lower = embedding_model.lower()
+ if 'large' in embedding_model_lower or '1b' in embedding_model_lower:
+ embedding_mem = 2.0
+ elif 'base' in embedding_model_lower or '400m' in embedding_model_lower:
+ embedding_mem = 0.8
+ elif 'small' in embedding_model_lower or '200m' in embedding_model_lower:
+ embedding_mem = 0.4
+ else:
+ embedding_mem = 1.0
+ rag_breakdown["embedding_memory"] = f"{embedding_mem:.2f} GB"
+
+ # Extract vector dimension (e.g., "1024d vectors")
+ dimension_match = re.search(r'(\d+)d\s*vectors', query, re.IGNORECASE)
+ if dimension_match:
+ vector_dimension = int(dimension_match.group(1))
+ rag_config["vector_dimension"] = vector_dimension
+ rag_breakdown["vector_db_dimension"] = vector_dimension
+
+ # Extract total vectors (e.g., "10000 total vectors")
+ vectors_match = re.search(r'(\d+)\s*total\s*vectors', query, re.IGNORECASE)
+ if vectors_match:
+ total_vectors = int(vectors_match.group(1))
+ rag_config["total_vectors"] = total_vectors
+ rag_breakdown["vector_db_vectors"] = total_vectors
+
+ # Calculate vector DB memory if we have both dimension and count
+ if rag_breakdown.get("vector_db_vectors") and rag_breakdown.get("vector_db_dimension"):
+ vector_mem_bytes = rag_breakdown["vector_db_vectors"] * rag_breakdown["vector_db_dimension"] * 4 * 1.5
+ vector_mem_gb = vector_mem_bytes / (1024**3)
+ if vector_mem_gb < 0.1:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb * 1024:.1f} MB"
+ else:
+ rag_breakdown["vector_db_memory"] = f"{vector_mem_gb:.2f} GB"
+
+ # Add RAG config and breakdown to params
+ if rag_config:
+ corrected_params["rag_config"] = rag_config
+ if any(k != "workload_type" for k in rag_breakdown.keys()):
+ corrected_params["rag_breakdown"] = rag_breakdown
+ logger.info(f"Added RAG breakdown to multiturn response: {rag_breakdown}")
+
+ # Reconstruct description with correct model name and precision
+ final_profile = corrected_params.get("vgpu_profile", "Unknown")
+ final_precision = corrected_params.get("precision", "FP8")
+ final_model_name = model_tag.split('/')[-1] if model_tag and '/' in model_tag else (model_tag or "Unknown")
+
+ # Use different description format for RAG vs Inference
+ if is_rag_workload and corrected_params.get("rag_config", {}).get("embedding_model"):
+ embedding_short = corrected_params["rag_config"]["embedding_model"].split('/')[-1]
+ corrected_description = f"{gpu_model} with vGPU profile {final_profile} for RAG (Retrieval-Augmented Generation) with {final_model_name} and {embedding_short}"
+ else:
+ corrected_description = f"{gpu_model} with vGPU profile {final_profile} for inference of {final_model_name} ({final_precision})"
# Build the final response with corrected field names
final_response = {
"title": json_data.get("title", "generate_vgpu_config"),
- "description": json_data.get("description", ""),
+ "description": corrected_description,
"parameters": corrected_params
}
@@ -1351,6 +1865,137 @@ def stream_structured_multiturn_response():
return iter([json.dumps(error_response.model_dump(), ensure_ascii=False, indent=2)]), []
+ def _conversational_chain(self,
+ query: str,
+ chat_history: List[Dict[str, Any]],
+ reranker_top_k: int,
+ vdb_top_k: int,
+ collection_name: str,
+ **kwargs) -> tuple:
+ """
+ Execute a conversational RAG chain that returns plain text responses.
+ Used for the chat panel where users ask follow-up questions about their config.
+ """
+ try:
+ document_embedder = get_embedding_model(model=kwargs.get("embedding_model"), url=kwargs.get("embedding_endpoint"))
+ vs = get_vectorstore(document_embedder, collection_name, kwargs.get("vdb_endpoint"))
+ if vs is None:
+ raise APIError("Vector store not initialized properly.", 500)
+
+ llm = get_llm(**kwargs)
+ ranker = get_ranking_model(model=kwargs.get("reranker_model"), url=kwargs.get("reranker_endpoint"), top_n=reranker_top_k)
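+ # Retrieve a wider candidate set (vdb_top_k) when a reranker will later trim it to reranker_top_k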
+ top_k = vdb_top_k if ranker and kwargs.get("enable_reranker") else reranker_top_k
+ retriever = vs.as_retriever(search_kwargs={"k": top_k})
+
+ # Build conversation history for the prompt
+ conversation_history = []
+ user_provided_context = ""
+ history_count = int(os.environ.get("CONVERSATION_HISTORY", 15)) * 2 * -1
+ chat_history = chat_history[history_count:]
+
+ for message in chat_history:
+ if message.role == "system":
+ # Capture the system message context from frontend (contains vGPU config details)
+ user_provided_context = message.content
+ logger.info(f"[CONVERSATIONAL] Found system context: {user_provided_context[:200]}...")
+ else:
+ conversation_history.append((message.role, message.content))
+
+ # Build system prompt - include user's configuration context if provided
+ base_prompt = """You are a helpful AI assistant with expertise in NVIDIA GPUs, vGPU technology, LLMs, and AI infrastructure.
+
+Answer the user's question directly and conversationally. Use the retrieved documents AND the configuration context to support your answers.
+
+Guidelines:
+- Be concise but thorough
+- Use plain text only, no JSON or structured output
+- If asked about model parameters, GPU profiles, or vGPU configurations, explain clearly
+- For technical questions, provide specific details when available
+- Reference the user's specific configuration when answering
+- If you don't know something, say so honestly"""
+
+ # Add user's configuration context if provided
+ if user_provided_context:
+ system_prompt = f"""{base_prompt}
+
+=== USER'S CURRENT VGPU CONFIGURATION ===
+{user_provided_context}
+
+=== ADDITIONAL CONTEXT FROM KNOWLEDGE BASE ===
+{{context}}"""
+ else:
+ system_prompt = f"""{base_prompt}
+
+Context from knowledge base:
+{{context}}"""
+
+ logger.info(f"[CONVERSATIONAL] System prompt length: {len(system_prompt)}")
+
+ # Retrieve relevant documents
+ retriever_query = query
+ if kwargs.get("enable_query_rewriting") and conversation_history:
+ contextualize_q_system_prompt = (
+ "Given a chat history and the latest user question "
+ "which might reference context in the chat history, "
+ "formulate a standalone question which can be understood "
+ "without the chat history. Do NOT answer the question, "
+ "just reformulate it if needed and otherwise return it as is."
+ )
+ q_prompt = ChatPromptTemplate.from_messages([
+ ("system", contextualize_q_system_prompt),
+ MessagesPlaceholder("chat_history"),
+ ("human", "{input}"),
+ ])
+ query_rewriter_llm = get_llm(
+ model=settings.query_rewriter.model_name,
+ llm_endpoint=settings.query_rewriter.server_url,
+ **query_rewriter_llm_config
+ )
+ # Create chain: prompt -> LLM -> string output
+ query_rewriter_chain = q_prompt | query_rewriter_llm | StrOutputParser()
+ retriever_query = query_rewriter_chain.invoke(
+ {"input": query, "chat_history": conversation_history},
+ config={'run_name': 'query-rewriter'}
+ )
+ logger.info(f"Conversational query rewritten to: {retriever_query}")
+
+ # Get documents
+ docs_raw = retriever.invoke(retriever_query)
+ if ranker and kwargs.get("enable_reranker"):
+ docs_raw = ranker.invoke({"query": retriever_query, "documents": docs_raw})
+
+ docs = [format_document_with_source(d) for d in docs_raw[:reranker_top_k]]
+ context_str = "\n\n".join(docs) if docs else "No relevant documents found."
+
+ # Build the prompt
+ messages = [("system", system_prompt)]
+ messages.extend(conversation_history)
+ messages.append(("user", query))
+
+ prompt = ChatPromptTemplate.from_messages(messages)
+ chain = prompt | llm | StrOutputParser()
+
+ def stream_conversational_response():
+ """Yield plain text chunks - server.py handles SSE formatting."""
+ try:
+ for chunk in chain.stream({"context": context_str}):
+ # Just yield the raw text - server.py will format as SSE
+ yield chunk
+ except Exception as e:
+ logger.error(f"Error in conversational stream: {e}")
+ yield f"I apologize, but I encountered an error: {str(e)}"
+
+ # Return generator and context for citations
+ context_to_show = docs_raw[:reranker_top_k] if docs_raw else []
+ return stream_conversational_response(), context_to_show
+
+ except Exception as e:
+ logger.error(f"Error in conversational chain: {e}")
+ def error_stream():
+ yield "I'm sorry, I encountered an error processing your question. Please try again."
+ return error_stream(), []
+
+
def document_search(self, content: str, messages: List, reranker_top_k: int, vdb_top_k: int, collection_name: str = "", **kwargs) -> List[Dict[str, Any]]:
"""Search for the most relevant documents for the given search parameters.
It's called when the `/search` API is invoked.
diff --git a/community/ai-vws-sizing-advisor/src/prompt.yaml b/community/ai-vws-sizing-advisor/src/prompt.yaml
index 056d23a7d..67a11daad 100644
--- a/community/ai-vws-sizing-advisor/src/prompt.yaml
+++ b/community/ai-vws-sizing-advisor/src/prompt.yaml
@@ -8,6 +8,7 @@ chat_template: |
### STEP 2: Pick smallest profile where (profile × 0.95) >= workload
Reserve 5% headroom to avoid running at 100% capacity.
+ If no single profile fits, use GPU passthrough.
**Available profiles per GPU:**
- **L40S**: 8Q, 12Q, 24Q, 48Q
@@ -17,6 +18,7 @@ chat_template: |
- **BSE** (RTX Pro 6000): 8Q, 12Q, 24Q, 48Q, 96Q
**Profile selection rule: Pick smallest profile where (profile × 0.95) >= workload**
+ **If no single profile fits, use GPU passthrough (entire GPUs, no vGPU)**
Examples (95% usable capacity):
- Workload needs 10 GB on BSE → 12×0.95=11.4≥10 → Pick BSE-12Q ✓
@@ -29,26 +31,25 @@ chat_template: |
- Workload needs 45 GB on BSE → 48×0.95=45.6≥45 → Pick BSE-48Q ✓
- Workload needs 46 GB on BSE → 48×0.95=45.6<46 → Pick BSE-96Q (96×0.95=91.2≥46) ✓
- Workload needs 90 GB on BSE → 96×0.95=91.2≥90 → Pick BSE-96Q ✓
- - Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, recommend "1× BSE GPU passthrough" ✓
- - Workload needs 120 GB on BSE → Exceeds single GPU → vgpu_profile=null, recommend "2× BSE GPU passthrough" ✓
+ - **Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+ - **Workload needs 96 GB on BSE → 96×0.95=91.2<96 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+ - **Workload needs 120 GB on BSE → 96×0.95=91.2<120 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
- **Workload needs 22 GB on L4 → 24×0.95=22.8≥22 → Pick L4-24Q ✓**
- - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
- - **Workload needs 24 GB on L4 → 24×0.95=22.8<24 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
- - **Workload needs 25 GB on L4 → Exceeds single L4 → vgpu_profile=null, recommend "2× L4 GPU passthrough" ✓**
+ - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, "2× L4 GPU passthrough" ✓**
+ - **Workload needs 50 GB on L40S → 48×0.95=45.6<50 → vgpu_profile=null, "2× L40S GPU passthrough" ✓**
- **If workload exceeds max vGPU profile capacity with 5% headroom:**
- - Max usable capacities: BSE-96Q (91.2GB), L40S-48Q (45.6GB), L40-48Q (45.6GB), A40-48Q (45.6GB), L4-24Q (22.8GB)
+ **IMPORTANT: Use vGPU profiles ONLY when workload fits in a SINGLE profile!**
+ - If workload fits in single profile: use smallest vGPU profile that fits
+ - If workload exceeds max single profile: use GPU passthrough (entire GPUs, no vGPU)
+ - Max usable: BSE-96Q=91.2GB, L40S-48Q=45.6GB, L40-48Q=45.6GB, A40-48Q=45.6GB, L4-24Q=22.8GB
+
+ **If workload > max single profile capacity → use passthrough:**
- Set `vgpu_profile` to null
- - In description field, recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
- - **IMPORTANT: Even with passthrough, reserve 5% for driver/OS overhead. Use 95% of physical memory.**
- - Calculate GPUs needed: ceil(workload / (physical_gpu × 0.95))
- - Physical GPU capacities: BSE=96GB, L40S=48GB, L40=48GB, A40=48GB, L4=24GB
- - Example: 92GB on BSE → ceil(92/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
- - Example: 120GB on BSE → ceil(120/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
- - Example: 185GB on BSE → ceil(185/91.2)=3 GPUs → "Use 3× BSE GPU passthrough" (96GB × 3 = 288GB total)
- - Example: 24GB on L4 → ceil(24/22.8)=2 GPUs → "Use 2× L4 GPU passthrough" (24GB × 2 = 48GB total)
- - Example: 50GB on L40S → ceil(50/45.6)=2 GPUs → "Use 2× L40S GPU passthrough" (48GB × 2 = 96GB total)
- - Example: 144GB on L40S → ceil(144/45.6)=4 GPUs → "Use 4× L40S GPU passthrough" (48GB × 4 = 192GB total)
+ - Recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
+ - Calculate GPUs: ceil(workload / (physical_gpu × 0.95))
+ - Example: 92GB on BSE → vgpu_profile=null → "2× BSE GPU passthrough"
+ - Example: 50GB on L40S → vgpu_profile=null → "2× L40S GPU passthrough"
+ - Example: 23GB on L4 → vgpu_profile=null → "2× L4 GPU passthrough"
## System RAM Calculation:
- Standard: (Model GB × 2.5) + (Concurrent Requests × 2GB) + 16GB
@@ -69,7 +70,7 @@ chat_template: |
```json
{{
"title": "generate_vgpu_config",
- "description": "Brief 1-2 sentence summary",
+ "description": "{{GPU_MODEL}} with vGPU profile {{SELECTED_PROFILE}} for inference of {{MODEL_NAME}} ({{QUANTIZATION}})",
"parameters": {{
"vgpu_profile": "BSE-48Q",
"vcpu_count": 16,
@@ -100,6 +101,20 @@ nemotron_thinking_prompt: |
Example: If gpu_memory_size = 24GB on L40S → 24×0.95=22.8<24 → Pick L40S-48Q ✓
|thinking|>
+chat_followup_template: |
+ You are an NVIDIA vGPU configuration specialist helping with follow-up questions about vGPU configurations.
+
+ Use the conversation history and ingested vGPU documentation to answer questions about:
+ - vGPU profile details and specifications
+ - Configuration recommendations and alternatives
+ - Performance characteristics
+ - Deployment considerations
+ - Troubleshooting and optimization
+
+ Keep responses concise and technical. Reference the provided documentation when available.
+
+ If asked about a previously recommended configuration, use the conversation history to understand context.
+
You are an NVIDIA vGPU configuration specialist.
## SIMPLE 2-STEP PROCESS:
@@ -152,6 +167,7 @@ rag_template: |
Add 10% safety buffer: `recommended_memory = gpu_memory_size × 1.10`
### STEP 2: Pick the smallest profile >= recommended memory
+ If no single profile fits, use GPU passthrough.
**Available profiles per GPU:**
- **L40S**: 8Q, 12Q, 24Q, 48Q
@@ -161,6 +177,7 @@ rag_template: |
- **BSE** (RTX Pro 6000): 8Q, 12Q, 24Q, 48Q, 96Q
**Profile selection rule: Pick smallest profile where (profile × 0.95) >= workload**
+ **If no single profile fits, use GPU passthrough (entire GPUs, no vGPU)**
Examples (95% usable capacity):
- Workload needs 10 GB on BSE → 12×0.95=11.4≥10 → Pick BSE-12Q ✓
@@ -173,26 +190,25 @@ rag_template: |
- Workload needs 45 GB on BSE → 48×0.95=45.6≥45 → Pick BSE-48Q ✓
- Workload needs 46 GB on BSE → 48×0.95=45.6<46 → Pick BSE-96Q (96×0.95=91.2≥46) ✓
- Workload needs 90 GB on BSE → 96×0.95=91.2≥90 → Pick BSE-96Q ✓
- - Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, recommend "1× BSE GPU passthrough" ✓
- - Workload needs 120 GB on BSE → Exceeds single GPU → vgpu_profile=null, recommend "2× BSE GPU passthrough" ✓
+ - **Workload needs 92 GB on BSE → 96×0.95=91.2<92 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+ - **Workload needs 96 GB on BSE → 96×0.95=91.2<96 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
+ - **Workload needs 120 GB on BSE → 96×0.95=91.2<120 → vgpu_profile=null, "2× BSE GPU passthrough" ✓**
- **Workload needs 22 GB on L4 → 24×0.95=22.8≥22 → Pick L4-24Q ✓**
- - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
- - **Workload needs 24 GB on L4 → 24×0.95=22.8<24 → vgpu_profile=null, recommend "1× L4 GPU passthrough" ✓**
- - **Workload needs 25 GB on L4 → Exceeds single L4 → vgpu_profile=null, recommend "2× L4 GPU passthrough" ✓**
+ - **Workload needs 23 GB on L4 → 24×0.95=22.8<23 → vgpu_profile=null, "2× L4 GPU passthrough" ✓**
+ - **Workload needs 50 GB on L40S → 48×0.95=45.6<50 → vgpu_profile=null, "2× L40S GPU passthrough" ✓**
+
+ **IMPORTANT: Use vGPU profiles ONLY when workload fits in a SINGLE profile!**
+ - If workload fits in single profile: use smallest vGPU profile that fits
+ - If workload exceeds max single profile: use GPU passthrough (entire GPUs, no vGPU)
+ - Max usable: BSE-96Q=91.2GB, L40S-48Q=45.6GB, L40-48Q=45.6GB, A40-48Q=45.6GB, L4-24Q=22.8GB
- **If workload exceeds max vGPU profile capacity with 5% headroom:**
- - Max usable capacities: BSE-96Q (91.2GB), L40S-48Q (45.6GB), L40-48Q (45.6GB), A40-48Q (45.6GB), L4-24Q (22.8GB)
+ **If workload > max single profile capacity → use passthrough:**
- Set `vgpu_profile` to null
- - In description field, recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
- - **IMPORTANT: Even with passthrough, reserve 5% for driver/OS overhead. Use 95% of physical memory.**
- - Calculate GPUs needed: ceil(workload / (physical_gpu × 0.95))
- - Physical GPU capacities: BSE=96GB, L40S=48GB, L40=48GB, A40=48GB, L4=24GB
- - Example: 92GB on BSE → ceil(92/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
- - Example: 120GB on BSE → ceil(120/91.2)=2 GPUs → "Use 2× BSE GPU passthrough" (96GB × 2 = 192GB total)
- - Example: 185GB on BSE → ceil(185/91.2)=3 GPUs → "Use 3× BSE GPU passthrough" (96GB × 3 = 288GB total)
- - Example: 24GB on L4 → ceil(24/22.8)=2 GPUs → "Use 2× L4 GPU passthrough" (24GB × 2 = 48GB total)
- - Example: 50GB on L40S → ceil(50/45.6)=2 GPUs → "Use 2× L40S GPU passthrough" (48GB × 2 = 96GB total)
- - Example: 144GB on L40S → ceil(144/45.6)=4 GPUs → "Use 4× L40S GPU passthrough" (48GB × 4 = 192GB total)
+ - Recommend: "Use X× [GPU model] with full GPU passthrough (no vGPU)"
+ - Calculate GPUs: ceil(workload / (physical_gpu × 0.95))
+ - Example: 92GB on BSE → vgpu_profile=null → "2× BSE GPU passthrough"
+ - Example: 50GB on L40S → vgpu_profile=null → "2× L40S GPU passthrough"
+ - Example: 23GB on L4 → vgpu_profile=null → "2× L4 GPU passthrough"
## System RAM Calculation:
- Standard: (Model GB × 2.5) + (Concurrent Requests × 2GB) + 16GB
@@ -213,7 +229,7 @@ rag_template: |
```json
{{
"title": "generate_vgpu_config",
- "description": "Brief 1-2 sentence summary",
+ "description": "{{GPU_MODEL}} with vGPU profile {{SELECTED_PROFILE}} for inference of {{MODEL_NAME}} ({{QUANTIZATION}})",
"parameters": {{
"vgpu_profile": "BSE-48Q",
"vcpu_count": 16,
@@ -307,10 +323,11 @@ reflection_response_regeneration_prompt:
You are an expert NVIDIA vGPU configuration specialist. Generate a grounded vGPU configuration description
based ONLY on information explicitly found in the provided context documents.
- Your description should be concise (1-2 sentences) and mention:
- 1. The recommended vGPU profile
- 2. Configuration feasibility
- 3. Key constraints if any
+ Your description should be ULTRA-CONCISE (single sentence, <50 words) using this format:
+ "{GPU_MODEL} with vGPU profile {SELECTED_PROFILE} for inference of {MODEL_NAME} ({QUANTIZATION})"
+
+ Example: "L40S with vGPU profile L40S-48Q for inference of Llama-3.1-8B-Instruct (FP16)"
+ Do NOT include extra details about memory margins, safety, or system RAM.
CRITICAL RULES:
- Use ONLY vGPU profiles that appear exactly in the context (e.g., L40S-8Q, L4-4Q)
diff --git a/community/ai-vws-sizing-advisor/src/server.py b/community/ai-vws-sizing-advisor/src/server.py
index 7af3243b2..980020e78 100644
--- a/community/ai-vws-sizing-advisor/src/server.py
+++ b/community/ai-vws-sizing-advisor/src/server.py
@@ -244,6 +244,10 @@ class Prompt(BaseModel):
description="Enable or disable citations as part of response.",
default=os.getenv("ENABLE_CITATIONS", "True").lower() in ["true", "True"],
)
+ conversational_mode: bool = Field(
+ description="Enable conversational mode for plain text responses instead of structured JSON output.",
+ default=False,
+ )
model: str = Field(
description="Name of NIM LLM model to be used for inference.",
default=os.getenv("APP_LLM_MODELNAME", "").strip('"'),
@@ -905,14 +909,31 @@ async def generate_answer(request: Request, prompt: Prompt) -> StreamingResponse
# Helper function to escape JSON-like structures in content
def escape_json_content(content: str) -> str:
- """Escape curly braces in content to avoid JSON parsing issues"""
- return content.replace("{", "{{").replace("}", "}}")
+ """Escape curly braces in content to avoid JSON parsing issues.
+ IMPORTANT: Preserve embedded config as-is (don't escape)."""
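+ # Doubling braces keeps str.format-style prompt templates from treating JSON as placeholders, e.g. '{"a": 1}' → '{{"a": 1}}'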
+ import re
+ # Extract any embedded config first
+ config_match = re.search(r'<VGPU_CONFIG>.*?</VGPU_CONFIG>', content, re.DOTALL)
+ if config_match:
+ # Preserve the embedded config, escape the rest
+ before = content[:config_match.start()]
+ config_section = config_match.group(0) # The entire <VGPU_CONFIG>...</VGPU_CONFIG> block
+ after = content[config_match.end():]
+ escaped_before = before.replace("{", "{{").replace("}", "}}")
+ escaped_after = after.replace("{", "{{").replace("}", "}}")
+ return escaped_before + config_section + escaped_after
+ else:
+ return content.replace("{", "{{").replace("}", "}}")
# The last user message will be the query for the rag or llm chain
last_user_message = next((message.content for message in reversed(chat_history) if message.role == 'user'),
None)
+ # DEBUG: Log raw message before escape
+ logger.info(f"[RAW MESSAGE DEBUG] Raw last_user_message (first 500 chars): {last_user_message[:500] if last_user_message else 'None'}")
+ logger.info(f"[RAW MESSAGE DEBUG] Contains VGPU_CONFIG: {' embedded config as-is (don't escape)."""