Skip to content

Commit 5e40053

Browse files
authored
Merge pull request #1063 from Qubitol/1060-default-gpu-architecture
Detect Nvidia arch automatically also for multiple GPUs
2 parents 49c6c33 + 125b386 commit 5e40053

1 file changed

Lines changed: 23 additions & 6 deletions

File tree

  • epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -190,15 +190,32 @@ ifeq ($(BACKEND),cuda)
190190
# NVidia CUDA architecture flags
191191
# See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
192192
# See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
193-
# Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
194-
# This will embed device code for 70, and PTX for 70+.
193+
# Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), de-duplicated and sorted from lowest to highest;
194+
# then embed device code for each compute capability, plus PTX for the highest one (forward-compatible)
195+
# use nvidia-smi and validate output with grep before going forward
196+
# Pipeline stages: query the compute capability of the installed GPUs (stderr is discarded, so a missing or failing
# nvidia-smi simply yields an empty value); keep only lines of the form <digits>.<digits> ($$ is an escaped $ for the shell);
# drop the dot ("8.6" -> "86"); finally sort numerically and remove duplicates.
DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un)
195197
# One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
196198
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
197-
MADGRAPH_CUDA_ARCHITECTURE ?= 70
198-
###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
199-
###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
200199
# Holds a literal comma: a bare ',' written inside a make function call would be parsed as an argument separator
comma:=,
201-
GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
200+
# Default value (only if not already set by the user): the detected compute capabilities joined by commas, e.g. "80,86,90"
# The join uses commas only (no embedded spaces, no trailing comma); "$() $()" expands to a single space for subst to replace
MADGRAPH_CUDA_ARCHITECTURE ?= $(subst $() $(),$(comma),$(strip $(DETECTED_CC)))
201+
# Convert to space-separated list for looping
202+
# ?= keeps any value of MADGRAPH_CUDA_ARCH_LIST that is already set; otherwise every comma is replaced by a space
MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE))
203+
204+
# Fallback if detection failed (box has CUDA selected but probe failed)
205+
# Taken when the architecture list is empty after stripping whitespace (nothing detected and no usable value supplied)
ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),)
206+
# Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster)
207+
# This will embed device code for 70, and PTX for 70+
208+
# Both variables are assigned so that the comma-separated and space-separated forms stay in sync
MADGRAPH_CUDA_ARCHITECTURE := 70
209+
MADGRAPH_CUDA_ARCH_LIST := 70
210+
# $(info) prints while the makefile is being parsed: report the chosen default and how to override it
$(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE))
211+
$(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=<comma-separated list of architectures>)
212+
endif
213+
214+
# Build for every detected SM, and add one PTX for the highest SM (forward-compatibility)
215+
# NOTE: lastword is the highest value only when the list is in ascending order
# (the auto-detected list is sorted by 'sort -un'; a user-supplied list is taken in the order given)
HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST))
216+
# One '-gencode arch=compute_XX,code=sm_XX' option per architecture in the list
GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch))
217+
# A single '-gencode arch=compute_XX,code=compute_XX' option, for the last architecture in the list only
GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
218+
# Per-architecture options first, followed by the single PTX option
GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX)
202219
# Append the architecture options to the GPU compiler flags accumulated so far
GPUFLAGS += $(GPUARCHFLAGS)
203220

204221
# Other NVidia-specific flags

0 commit comments

Comments
 (0)