Skip to content

Commit 8dea1b9

Browse files
author
Michail Resvanis
committed
Add support for fabric manager shared-nvswitch mode
The changes include: - add the `FABRIC_MANAGER_FABRIC_MODE` env var that configures FM with either full-passthrough (0) or shared-nvswitch (1) mode. It defaults to 0. - when fabric manager mode is set to 0 no changes to the flow, i.e. execute the fabric manager daemon with its default configuration. - when fabric manager mode is set to 1: - edit the fabric manager configuration file and set `FABRIC_MODE=1`. - persist mapping of physical GPU module IDs to their PCIe address by creating a JSON file on disk (the physical GPU module IDs are available through nvidia-smi). - disable `nvidia-persistenced`, as the GPU devices should be unbound from the NVIDIA driver and bound to vfio-pci (a step executed by the vfio-manager). Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent e4f05d3 commit 8dea1b9

2 files changed

Lines changed: 108 additions & 21 deletions

File tree

rhel9/Dockerfile

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK
5353
# Avoid dependency of container-toolkit for driver container
5454
ENV NVIDIA_VISIBLE_DEVICES=void
5555

56+
# Fabric manager fabric mode, default is 0 (full-passthrough)
57+
ARG FABRIC_MANAGER_FABRIC_MODE=0
58+
ENV FABRIC_MANAGER_FABRIC_MODE=$FABRIC_MANAGER_FABRIC_MODE
59+
5660
ADD install.sh /tmp/
5761

5862
RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
@@ -74,7 +78,19 @@ RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
7478
cd drivers && \
7579
DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} && \
7680
curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
77-
chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run; fi
81+
chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
82+
versionArray=(${DRIVER_VERSION//./ }); \
83+
DRIVER_BRANCH=${versionArray[0]}; \
84+
dnf install git -y && \
85+
dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
86+
dnf module enable -y nvidia-driver:${DRIVER_BRANCH}-dkms && \
87+
dnf install -y nvidia-fabric-manager-${DRIVER_VERSION}-1 nvidia-fabric-manager-devel-${DRIVER_VERSION}-1 libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION}-1 jsoncpp-devel gcc-c++ make && \
88+
git clone https://github.com/mresvanis/Fabric-Manager-Client.git && \
89+
cd Fabric-Manager-Client && \
90+
git checkout fix-ignoring-unix-socket && \
91+
make fmpm && \
92+
cp fmpm /usr/bin/ && \
93+
chmod +x /usr/bin/fmpm; fi
7894

7995
# Fetch the installer, fabricmanager, libnvidia-nscq, libnvsdm, imex packages
8096
RUN sh /tmp/install.sh extrapkgsinstall

rhel9/nvidia-driver

Lines changed: 91 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ RHEL_MAJOR_VERSION=9
2020
RHEL_MINOR_VERSION=${RHEL_MINOR_VERSION:-""}
2121
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
2222
MODPROBE_CONFIG_DIR="/etc/modprobe.d"
23+
FABRIC_MANAGER_FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE:-0}
2324

2425
DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
2526
echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -305,6 +306,87 @@ _ensure_nvlink5_prerequisites() (
305306
done
306307
)
307308

309+
# Rewrite the fabric manager config for shared-nvswitch mode.
# Globals:
#   FABRIC_MANAGER_FABRIC_MODE (read) - 0 = full-passthrough (no-op here),
#                                       1 = shared-nvswitch
# Arguments:
#   $1 - path to the fabric manager configuration file
#   $2 - UNIX socket path the fabric manager / fmpm should use instead of TCP
# Returns: 0; edits $1 in place only when fabric mode is 1.
_configure_fabric_manager_config() {
  local fm_config_file="$1"
  local fmpm_socket_path="$2"

  if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
    echo "Updating NVIDIA fabric manager configuration to fabric mode ${FABRIC_MANAGER_FABRIC_MODE}..."
    # Anchor to line start (like the socket-path substitutions below) so
    # commented-out '#FABRIC_MODE=...' entries are left untouched; quote the
    # file path so paths with spaces do not word-split.
    sed -i "s/^FABRIC_MODE=.*/FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE}/g" "$fm_config_file"

    echo "Updating NVIDIA fabric manager configuration to use a UNIX socket instead of TCP: ${fmpm_socket_path}"
    sed -i "s|^UNIX_SOCKET_PATH=.*|UNIX_SOCKET_PATH=${fmpm_socket_path}|g" "$fm_config_file"
    sed -i "s|^FM_CMD_UNIX_SOCKET_PATH=.*|FM_CMD_UNIX_SOCKET_PATH=${fmpm_socket_path}|g" "$fm_config_file"
  fi
}
322+
323+
# Configure and launch the NVIDIA fabric manager daemon appropriate for the
# detected NVSwitch generation.
# Globals:
#   (via _configure_fabric_manager_config) FABRIC_MANAGER_FABRIC_MODE (read)
# Arguments:
#   $1 - UNIX socket path to configure for fmpm communication
# Returns: 1 if NVLink5+ prerequisites fail; otherwise the daemon's status,
#          or 0 on systems with no NVSwitch at all (no-op).
_setup_fabric_manager() {
  local fmpm_socket_path="$1"
  # Declare paths local so they do not leak into the caller's scope.
  local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

  if _assert_nvlink5_system; then
    _ensure_nvlink5_prerequisites || return 1

    fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    _configure_fabric_manager_config "$fm_config_file" "$fmpm_socket_path"

    fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid

    echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

    # NVLink5+ systems use the combined start script that also manages the
    # NVLink subnet manager (nvlsm).
    /usr/bin/nvidia-fabricmanager-start.sh --mode start \
      --fm-config-file "$fm_config_file" \
      --fm-pid-file "$fm_pid_file" \
      --nvlsm-config-file "$nvlsm_config_file" \
      --nvlsm-pid-file "$nvlsm_pid_file"

  # If not a NVLink5+ switch, check for the presence of NVLink4 (or below)
  # switches, which are driven by nv-fabricmanager directly.
  elif _assert_nvswitch_system; then
    fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    _configure_fabric_manager_config "$fm_config_file" "$fmpm_socket_path"

    echo "Starting NVIDIA fabric manager daemon..."
    nv-fabricmanager -c "$fm_config_file"
  fi
}
353+
354+
# Capture the GPU PCI address -> physical module ID mapping and persist it as
# a JSON object ({"<pci_bus_id>": "<module_id>", ...}) for consumers such as
# the vfio-manager.
# Globals:  none written (all working variables are local)
# Outputs:  progress/warnings on stdout; JSON file under /run/nvidia-fabricmanager
# Returns:  0 (best-effort; missing nvidia-smi or empty mapping only warns)
_capture_gpu_mapping() {
  local gpu_mapping json_entries module_id pci_id line
  local mapping_file="/run/nvidia-fabricmanager/gpu-pci-module-mapping.json"

  echo "Capturing GPU PCI to Module ID mapping..."
  if command -v nvidia-smi >/dev/null 2>&1; then
    # 'egrep' is deprecated in GNU grep; 'grep -E' is the supported spelling.
    gpu_mapping=$(nvidia-smi -q | grep -E "(Module|Bus).*Id")
    if [ -n "$gpu_mapping" ]; then
      echo "$gpu_mapping"
      # nvidia-smi -q is assumed to print each GPU's "Module Id" before its
      # "Bus Id"; pair each module ID with the next PCI bus address seen.
      json_entries=""
      module_id=""
      while IFS= read -r line; do
        if [[ "$line" =~ Module\ Id.*:\ ([0-9]+) ]]; then
          module_id="${BASH_REMATCH[1]}"
        elif [[ "$line" =~ Bus\ Id.*:\ ([0-9A-Fa-f:\.]+) ]] && [ -n "$module_id" ]; then
          pci_id="${BASH_REMATCH[1]}"
          if [ -n "$json_entries" ]; then
            json_entries="${json_entries},"
          fi
          json_entries="${json_entries}\"${pci_id}\": \"${module_id}\""
          # Reset so a stray extra "Bus Id" line cannot reuse this module ID.
          module_id=""
        fi
      done <<< "$gpu_mapping"

      mkdir -p /run/nvidia-fabricmanager
      echo "{${json_entries}}" > "$mapping_file"
      echo "GPU mapping saved to ${mapping_file}"
    else
      echo "Warning: Could not retrieve GPU PCI to Module ID mapping"
    fi
  else
    echo "Warning: nvidia-smi not available for GPU mapping"
  fi
}
389+
308390
# For each kernel module configuration file mounted into the container,
309391
# parse the file contents and extract the custom module parameters that
310392
# are to be passed as input to 'modprobe'.
@@ -380,6 +462,7 @@ _load_driver() {
380462
local nv_fw_search_path="$RUN_DIR/driver/lib/firmware"
381463
local set_fw_path="true"
382464
local fw_path_config_file="/sys/module/firmware_class/parameters/path"
465+
local fmpm_socket_path="/run/nvidia-fabricmanager/fmpm.sock"
383466
for param in "${NVIDIA_MODULE_PARAMS[@]}"; do
384467
if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then
385468
set_fw_path="false"
@@ -418,8 +501,12 @@ _load_driver() {
418501
set +o xtrace -o nounset
419502
fi
420503

421-
echo "Starting NVIDIA persistence daemon..."
422-
nvidia-persistenced --persistence-mode
504+
if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
505+
echo "Skipping NVIDIA persistence daemon..."
506+
else
507+
echo "Starting NVIDIA persistence daemon..."
508+
nvidia-persistenced --persistence-mode
509+
fi
423510

424511
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
425512
echo "Copying gridd.conf..."
@@ -437,25 +524,9 @@ _load_driver() {
437524
_start_vgpu_topology_daemon
438525
fi
439526

440-
if _assert_nvlink5_system; then
441-
_ensure_nvlink5_prerequisites || return 1
442-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
443-
444-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
445-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
446-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
447-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
448-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
449-
--fm-config-file $fm_config_file \
450-
--fm-pid-file $fm_pid_file \
451-
--nvlsm-config-file $nvlsm_config_file \
452-
--nvlsm-pid-file $nvlsm_pid_file
527+
_setup_fabric_manager "${fmpm_socket_path}"
453528

454-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
455-
elif _assert_nvswitch_system; then
456-
echo "Starting NVIDIA fabric manager daemon..."
457-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
458-
fi
529+
_capture_gpu_mapping
459530
}
460531

461532
# Stop persistenced and unload the kernel modules if they are currently loaded.

0 commit comments

Comments
 (0)