@@ -20,6 +20,7 @@ RHEL_MAJOR_VERSION=9
 RHEL_MINOR_VERSION=${RHEL_MINOR_VERSION:-""}
 KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
 MODPROBE_CONFIG_DIR="/etc/modprobe.d"
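+# FABRIC_MODE values, per the NVIDIA Fabric Manager user guide: 0 = full
+# passthrough (bare metal), 1 = Shared NVSwitch multitenancy, 2 = vGPU
+# multitenancy. Defaults to bare metal when the variable is unset.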
+FABRIC_MANAGER_FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE:-0}
 
 DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
 echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -305,6 +306,87 @@ _ensure_nvlink5_prerequisites() (
     done
 )
 
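+# Rewrite the fabric manager config in place when Shared NVSwitch
+# multitenancy (FABRIC_MODE=1) is requested: set FABRIC_MODE and point both
+# FM sockets at a UNIX socket so no TCP port needs to be exposed.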
+_configure_fabric_manager_config() {
+    local fm_config_file="$1"
+    local fmpm_socket_path="$2"
+
+    if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
+        echo "Updating NVIDIA fabric manager configuration to fabric mode ${FABRIC_MANAGER_FABRIC_MODE}..."
+        sed -i "s/^FABRIC_MODE=.*/FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE}/g" "$fm_config_file"
+
+        echo "Updating NVIDIA fabric manager configuration to use a UNIX socket instead of TCP: ${fmpm_socket_path}"
+        sed -i "s|^UNIX_SOCKET_PATH=.*|UNIX_SOCKET_PATH=${fmpm_socket_path}|g" "$fm_config_file"
+        sed -i "s|^FM_CMD_UNIX_SOCKET_PATH=.*|FM_CMD_UNIX_SOCKET_PATH=${fmpm_socket_path}|g" "$fm_config_file"
+    fi
+}
+
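+# Start the appropriate fabric manager stack: NVLink5+ systems go through
+# nvidia-fabricmanager-start.sh (fabric manager plus NVLSM), while NVLink4
+# and older NVSwitch systems launch nv-fabricmanager directly.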
+_setup_fabric_manager() {
+    local fmpm_socket_path="$1"
+
+    if _assert_nvlink5_system; then
+        _ensure_nvlink5_prerequisites || return 1
+
+        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
+        _configure_fabric_manager_config "$fm_config_file" "$fmpm_socket_path"
+
+        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
+        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
+        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
+
+        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
+
+        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
+            --fm-config-file "$fm_config_file" \
+            --fm-pid-file "$fm_pid_file" \
+            --nvlsm-config-file "$nvlsm_config_file" \
+            --nvlsm-pid-file "$nvlsm_pid_file"
+
+    # If not an NVLink5+ system, check for the presence of NVLink4 (or below) switches
+    elif _assert_nvswitch_system; then
+        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
+        _configure_fabric_manager_config "$fm_config_file" "$fmpm_socket_path"
+
+        echo "Starting NVIDIA fabric manager daemon..."
+        nv-fabricmanager -c "$fm_config_file"
+    fi
+}
+
+# Capture the GPU PCI address to physical module ID mapping and persist it to a JSON file.
+_capture_gpu_mapping() {
+    local gpu_mapping
+
+    echo "Capturing GPU PCI to Module ID mapping..."
+    if command -v nvidia-smi > /dev/null 2>&1; then
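+        # Keep only the "Module Id" / "Bus Id" lines of `nvidia-smi -q`;
+        # they look roughly like this (illustrative):
+        #     Module Id                             : 2
+        #     Bus Id                                : 00000000:1B:00.0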
+        gpu_mapping=$(nvidia-smi -q | egrep "(Module|Bus).*Id")
+        if [ -n "$gpu_mapping" ]; then
+            echo "$gpu_mapping"
+            # Parse and convert to JSON format
+            json_entries=""
+            module_id=""
+            while IFS= read -r line; do
+                if [[ "$line" =~ Module\ Id.*:\ ([0-9]+) ]]; then
+                    module_id="${BASH_REMATCH[1]}"
+                elif [[ "$line" =~ Bus\ Id.*:\ ([0-9A-Fa-f:\.]+) ]] && [ -n "$module_id" ]; then
+                    pci_id="${BASH_REMATCH[1]}"
+                    if [ -n "$json_entries" ]; then
+                        json_entries="${json_entries},"
+                    fi
+                    json_entries="${json_entries}\"${pci_id}\": \"${module_id}\""
+                    module_id=""
+                fi
+            done <<< "$gpu_mapping"
+
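+            # Persist the pairs as a flat JSON object, e.g. (illustrative):
+            #   {"00000000:1B:00.0": "2", "00000000:43:00.0": "5"}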
+            mkdir -p /run/nvidia-fabricmanager
+            echo "{${json_entries}}" > /run/nvidia-fabricmanager/gpu-pci-module-mapping.json
+            echo "GPU mapping saved to /run/nvidia-fabricmanager/gpu-pci-module-mapping.json"
+        else
+            echo "Warning: Could not retrieve GPU PCI to Module ID mapping"
+        fi
+    else
+        echo "Warning: nvidia-smi not available for GPU mapping"
+    fi
+}
+
 # For each kernel module configuration file mounted into the container,
 # parse the file contents and extract the custom module parameters that
 # are to be passed as input to 'modprobe'.
@@ -380,6 +462,7 @@ _load_driver() {
     local nv_fw_search_path="$RUN_DIR/driver/lib/firmware"
     local set_fw_path="true"
     local fw_path_config_file="/sys/module/firmware_class/parameters/path"
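+    # UNIX socket path handed to _setup_fabric_manager for fabric mode 1.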
+    local fmpm_socket_path="/run/nvidia-fabricmanager/fmpm.sock"
     for param in "${NVIDIA_MODULE_PARAMS[@]}"; do
         if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then
             set_fw_path="false"
@@ -418,8 +501,12 @@ _load_driver() {
         set +o xtrace -o nounset
     fi
 
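+    # In Shared NVSwitch multitenancy mode, persistence mode is presumably
+    # managed outside this container, so the daemon is not started here.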
-    echo "Starting NVIDIA persistence daemon..."
-    nvidia-persistenced --persistence-mode
+    if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
+        echo "Skipping NVIDIA persistence daemon..."
+    else
+        echo "Starting NVIDIA persistence daemon..."
+        nvidia-persistenced --persistence-mode
+    fi
 
     if [ "${DRIVER_TYPE}" = "vgpu" ]; then
         echo "Copying gridd.conf..."
@@ -437,25 +524,9 @@ _load_driver() {
         _start_vgpu_topology_daemon
     fi
 
-    if _assert_nvlink5_system; then
-        _ensure_nvlink5_prerequisites || return 1
-        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
-
-        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
-        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
-        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
-        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
-        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
-            --fm-config-file $fm_config_file \
-            --fm-pid-file $fm_pid_file \
-            --nvlsm-config-file $nvlsm_config_file \
-            --nvlsm-pid-file $nvlsm_pid_file
+    _setup_fabric_manager "${fmpm_socket_path}"
 
-    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
-    elif _assert_nvswitch_system; then
-        echo "Starting NVIDIA fabric manager daemon..."
-        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
-    fi
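+    # Record the GPU PCI-to-module-ID mapping once the driver stack is up.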
+    _capture_gpu_mapping
 }
 
 # Stop persistenced and unload the kernel modules if they are currently loaded.