diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index bfd34580eb..7fac9eda15 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit bfd34580eb59c2a027a502c89995e682a70a95b9 +Subproject commit 7fac9eda15ce8f4c8e9eb01704bfb3c0c3b558b8 diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index f8930a863f..7e54de7d8d 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0051648616790771484  +DEBUG: model prefixing takes 0.0028493404388427734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,7 +146,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -158,10 +157,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -173,22 +172,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -Wrote files for 8 helas calls in 0.285 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +Wrote files for 8 helas calls in 0.063 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.122 s +ALOHA: aloha creates 3 routines in 0.123 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.152 s +ALOHA: aloha creates 7 routines in 0.122 s FFV1 FFV1 FFV2 @@ -197,32 +196,32 @@ ALOHA: aloha creates 7 
routines in 0.152 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.542s -user 0m1.246s -sys 0m0.587s -Code generation completed in 5 seconds +real 0m1.894s +user 0m1.598s +sys 0m0.266s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -243,9 +242,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -273,9 +272,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 7aed5df7db..c3d8b1e1bd 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.resolved-backend b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture 
(x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", 
"detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype 
fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 
--- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index bdea67b952..50cf4c47eb 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,5 +1,5 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode +('WARNING: loading of madgraph too slow!!!', 1.129610300064087) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. It has been validated for the last time with version: 3.6.5 @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +46,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +55,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005957365036010742  +DEBUG: model prefixing takes 0.0025908946990966797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,13 +147,13 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -162,17 +162,17 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= 
me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.002 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.171 s +ALOHA: aloha creates 4 routines in 0.137 s FFV1 FFV1 FFV2 @@ -181,17 +181,17 @@ ALOHA: aloha creates 4 routines in 0.171 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m1.151s -user 0m0.372s -sys 0m0.155s -Code generation completed in 1 seconds +real 0m1.595s +user 0m1.385s +sys 0m0.131s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 
) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 
dbae24afe0..086655a6c3 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005540609359741211  +DEBUG: model prefixing takes 0.0028896331787109375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +158,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_ INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -174,49 +173,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: 
diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.266 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 0.050 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.089 s +ALOHA: aloha creates 2 routines in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.093 s +ALOHA: aloha creates 4 routines in 0.066 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.687s -user 0m1.163s -sys 0m0.619s -Code generation completed in 5 seconds +real 0m1.717s +user 0m1.422s +sys 0m0.279s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * @@ -237,9 +236,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -267,9 +266,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 8b331b055f..af833f8d84 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT 
r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See 
https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute 
capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq 
(,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 
'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git 
a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * 
sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 20cc72fd46..84563e6016 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00434565544128418  +DEBUG: model prefixing takes 0.0028569698333740234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -163,30 +162,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 
'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.091 s +ALOHA: aloha creates 2 routines in 0.076 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.992s -user 0m0.334s -sys 0m0.123s -Code generation completed in 1 seconds +real 0m0.418s +user 0m0.368s +sys 0m0.045s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( 
fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection 
UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 332a0806f1..e02a13bee6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005877494812011719  +DEBUG: model prefixing takes 0.0028417110443115234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -156,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.010 s +1 processes with 16 diagrams generated in 0.013 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -167,10 +166,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -184,9 +183,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 
5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -195,25 +194,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s -Wrote files for 46 helas calls in 0.502 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s +Wrote files for 46 helas calls in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.190 s +ALOHA: aloha 
creates 5 routines in 0.164 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.187 s +ALOHA: aloha creates 10 routines in 0.147 s VVV1 VVV1 FFV1 @@ -223,32 +222,32 @@ ALOHA: aloha creates 10 routines in 0.187 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.233s -user 0m1.496s -sys 0m0.718s -Code generation completed in 5 seconds +real 0m2.211s +user 0m1.861s +sys 0m0.323s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -269,9 +268,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -299,9 +298,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index 30bd3794c3..509884e6e3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/.resolved-backend b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/.resolved-backend b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = 
self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files 
a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v 
__attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index b836987bc5..655f6496a9 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551295280456543  +DEBUG: model prefixing takes 0.002906322479248047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.014 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +158,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -174,25 +173,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 
9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.046 s -Wrote files for 36 helas calls in 0.368 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.027 s +Wrote files for 36 helas calls in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.190 s +ALOHA: aloha creates 5 routines in 0.166 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.184 s VVV1 VVV1 FFV1 @@ -202,32 +201,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m4.945s -user 0m1.513s -sys 0m0.678s -Code generation completed in 5 seconds +real 0m2.048s +user 0m1.772s +sys 0m0.254s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -248,9 +247,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -278,9 +277,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 0fe3df08d4..4f43fc7e75 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 
3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/.resolved-backend b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See 
https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute 
capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq 
(,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): 
cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git 
a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( 
neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ba99f30bdf..a58fe51240 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005433082580566406  +DEBUG: model prefixing takes 0.0028769969940185547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.015 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -163,18 +162,18 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.045 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.026 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.206 s +ALOHA: aloha creates 5 routines in 0.167 s VVV1 VVV1 FFV1 @@ -184,17 +183,17 @@ ALOHA: aloha creates 5 routines in 0.206 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m1.176s -user 0m0.468s -sys 0m0.131s -Code generation completed in 1 seconds +real 0m0.546s +user 0m0.501s +sys 0m0.038s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * 
sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host 
detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index ea9db152a3..89fd13be72 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004921674728393555  +DEBUG: model prefixing takes 0.002771615982055664  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.080 s +1 processes with 123 diagrams generated in 0.105 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +158,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -174,25 +173,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 
36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.223 s -Wrote files for 222 helas calls in 0.654 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 
36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.306 s +Wrote files for 222 helas calls in 0.423 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.219 s +ALOHA: aloha creates 5 routines in 0.151 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.197 s +ALOHA: aloha creates 10 routines in 0.166 s VVV1 VVV1 FFV1 @@ -205,32 +204,32 @@ ALOHA: aloha creates 10 routines in 0.197 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m5.675s -user 0m2.118s -sys 0m0.681s -Code generation completed in 6 seconds +real 0m3.147s +user 0m2.809s +sys 0m0.313s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -251,9 +250,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -281,9 +280,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 5fe0cb01be..ffcd35ce8c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT 
r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/.resolved-backend b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ 
-38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl 
and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV 
* sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7ff994126b..4465db9974 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003983020782470703  +DEBUG: model prefixing takes 0.002856731414794922  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.081 s +1 processes with 123 diagrams generated in 0.167 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -163,18 +162,18 @@ INFO: Processing color information for process: g g > t t~ g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.216 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.271 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.153 s VVV1 VVV1 FFV1 @@ -187,17 +186,17 @@ ALOHA: aloha creates 5 routines in 0.204 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.544s -user 0m0.774s -sys 0m0.144s -Code generation completed in 2 seconds +real 0m1.043s +user 0m0.977s +sys 0m0.055s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( 
neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword 
$(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ebb525b6f1..708b742d29 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0036034584045410156  +DEBUG: model prefixing takes 0.0028352737426757812  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.963 s +1 processes with 1240 diagrams generated in 1.434 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,16 +158,16 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 5s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -176,25 +175,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 
113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 
337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 
581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 
810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 
893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 
178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 
413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 
648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 
884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} 
[model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 3.355 s -Wrote files for 2281 helas calls in 9.598 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 
195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 
395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 
595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 
795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 
56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 
253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 
453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 
653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 
1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.232 s +Wrote files for 2281 helas calls in 10.524 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.271 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.231 s +ALOHA: aloha creates 10 routines in 0.148 s VVV1 VVV1 FFV1 @@ -207,32 +206,32 @@ ALOHA: aloha creates 10 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m20.546s -user 0m16.458s -sys 0m0.884s -Code generation completed in 20 seconds +real 0m21.162s +user 0m20.720s +sys 0m0.378s +Code generation completed in 21 seconds ************************************************************ * * * W E L C O M E to * @@ -253,9 +252,9 @@ Code generation completed in 20 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -283,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 08a07273bc..a97972097e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/.resolved-backend b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect 
architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' 
| sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend 
has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", 
f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( 
ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk 
b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 66cd67a19b..0b8e756576 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0029516220092773438  +DEBUG: model prefixing takes 0.0029799938201904297  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.953 s +1 processes with 1240 diagrams generated in 1.367 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -163,18 +162,18 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(fortran_model)= [output.py at 
line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 3.379 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.242 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.214 s +ALOHA: aloha creates 5 routines in 0.190 s VVV1 VVV1 FFV1 @@ -187,17 +186,17 @@ ALOHA: aloha creates 5 routines in 0.214 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m7.419s -user 0m6.626s -sys 0m0.185s -Code generation completed in 7 seconds +real 0m8.313s +user 0m8.187s +sys 0m0.095s +Code generation completed in 8 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( 
fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 
248fa16d65..cc9ea4ac2d 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0052111148834228516  +DEBUG: model prefixing takes 0.003007173538208008  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +162,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.041 s +8 processes with 40 diagrams generated in 0.053 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -174,10 +173,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -197,9 +196,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} 
[model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -208,50 +207,50 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.017 s -Wrote files for 32 helas calls in 0.625 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +Wrote files for 32 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.094 s +ALOHA: aloha creates 2 routines in 0.086 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.080 s +ALOHA: aloha creates 4 routines in 0.065 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m5.076s -user 0m1.391s -sys 0m0.672s -Code generation completed in 5 seconds +real 0m1.975s +user 0m1.676s +sys 0m0.274s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -272,9 +271,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -302,9 +301,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index aba2f10b06..38476653c7 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 
3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/.resolved-backend b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/.resolved-backend b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], 
**opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( 
vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- 
a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index e76b814911..1518ee06b2 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757570266723633  +DEBUG: model prefixing takes 0.0035910606384277344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,13 +162,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.040 s +8 processes with 40 diagrams generated in 0.054 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -186,40 +185,40 @@ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ DEBUG: type(fortran_model)= 
[output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  DEBUG: type(subproc_group)= [output.py at line 223]  DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=1 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 
'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.016 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.021 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.090 s +ALOHA: aloha creates 2 routines in 0.078 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m1.337s -user 0m0.375s -sys 0m0.160s -Code generation completed in 2 seconds +real 0m0.562s +user 0m0.491s +sys 0m0.063s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) 
); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index 
f374f8f313..d0de7d3966 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,14 +45,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -120,7 +119,7 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.005 s Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -131,10 +130,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -146,55 +145,55 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group 
gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s -Wrote files for 12 helas calls in 0.268 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 0.079 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.221 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.152 s +ALOHA: aloha creates 8 routines in 0.131 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m4.654s -user 0m1.223s -sys 0m0.605s -Code generation completed in 5 seconds +real 0m2.373s +user 0m1.936s +sys 0m0.410s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -215,9 +214,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -245,9 +244,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 84c16b4cf4..f3c02d2ec5 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/.resolved-backend b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", 
f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( 
vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git 
a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index e04a2da479..37bd64eb68 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,26 +45,26 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:38:21-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz -Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 -Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. +INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/models  +--2026-03-24 14:16:55-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz +Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.2.143 +Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.2.143|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 50876 (50K) [application/x-gzip] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... ......... 100% 2.92M=0.02s + 0K .......... .......... .......... .......... ......... 
100% 848K=0.06s -2026-03-10 10:38:22 (2.92 MB/s) - ‘tmp.tgz’ saved [50876/50876] +2026-03-24 14:16:55 (848 KB/s) - ‘tmp.tgz’ saved [50876/50876] heft/ heft/write_param_card.py @@ -102,7 +101,7 @@ INFO: load particles INFO: load vertices WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.007684946060180664  +DEBUG: model prefixing takes 0.0035004615783691406  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -168,13 +167,13 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.004 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -183,34 +182,34 @@ INFO: Processing color 
information for process: g g > b b~ HIG<=1 HIW<=1 @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
-Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.145 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m1.669s -user 0m0.522s -sys 0m0.180s -Code generation completed in 2 seconds +real 0m0.816s +user 0m0.527s +sys 0m0.083s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( 
neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt 
b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 5067c06ff1..eb52b08042 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003014802932739258  +DEBUG: model prefixing takes 0.0029153823852539062  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +177,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.056 s +4 processes with 8 diagrams generated in 0.072 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -220,7 +219,7 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.331 s +12 processes with 144 diagrams generated in 0.421 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -231,10 +230,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --v INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -268,9 +267,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -279,9 +278,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -290,9 +289,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -301,9 +300,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -312,9 +311,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -323,9 +322,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -334,9 +333,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -345,21 +344,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.104 s -Wrote files for 212 helas calls in 2.138 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.129 s +Wrote files for 212 helas calls in 0.520 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.123 s +ALOHA: aloha creates 3 routines in 0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.122 s +ALOHA: aloha creates 6 routines in 0.125 s FFV1 FFV1 FFV1 @@ -367,32 +366,32 @@ ALOHA: aloha creates 6 routines in 0.122 s FFV2 FFV2 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m8.122s -user 0m2.522s -sys 0m1.075s -Code generation completed in 8 seconds +real 0m3.838s +user 0m3.253s +sys 0m0.542s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -413,9 +412,9 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -443,9 +442,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 3f652ded8d..df2a5aba90 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/.resolved-backend 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): 
cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 
Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + 
typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index a8e3a6d67a..276b354ed7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004999399185180664  +DEBUG: model prefixing takes 0.0027861595153808594  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,7 +164,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.015 s +5 processes with 7 diagrams generated in 0.020 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
@@ -205,7 +204,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.070 s +13 processes with 76 diagrams generated in 0.088 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -371,7 +370,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 0.941 s +65 processes with 1119 diagrams generated in 1.258 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -382,10 +381,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vec INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -496,9 +495,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 
52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 
67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -507,9 +506,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 
25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -518,9 +517,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -529,9 +528,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -540,9 +539,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -551,9 +550,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -562,9 +561,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -573,9 +572,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -584,9 +583,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -595,9 +594,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -606,9 +605,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -617,9 +616,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -628,9 +627,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -639,9 +638,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -650,9 +649,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -661,9 +660,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -672,9 +671,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -683,25 +682,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1748]  -Generated helas calls for 18 subprocesses (372 diagrams) in 0.671 s -Wrote files for 810 helas calls in 5.590 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1749]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.851 s +Wrote files for 810 helas calls in 1.837 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.216 s +ALOHA: aloha creates 5 routines in 0.160 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates 
VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.178 s VVV1 VVV1 FFV1 @@ -714,32 +713,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m15.089s -user 0m5.988s -sys 0m1.827s -Code generation completed in 16 seconds +real 0m8.487s +user 0m7.467s +sys 0m0.937s +Code generation completed in 8 seconds ************************************************************ * * * W E L C O M E to * @@ -760,9 +759,9 @@ Code generation completed in 16 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -790,9 +789,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index fa1bcf88f4..25caee75ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", 
"detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else 
- typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index 
977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 9a1af87664..23144d1091 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,14 +45,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -70,7 +69,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06830000877380371  +DEBUG: model prefixing takes 0.04164624214172363  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -85,7 +84,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.021 s +1 processes with 72 diagrams generated in 2.315 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -96,10 +95,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False - INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File 
exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -111,25 +110,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 
43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.097 s -Wrote files for 119 helas calls in 0.474 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.145 s +Wrote files for 119 helas calls in 0.278 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines 
-ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.150 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.193 s +ALOHA: aloha creates 10 routines in 0.182 s VVV5 VVV5 FFV1 @@ -139,32 +138,32 @@ ALOHA: aloha creates 10 routines in 0.193 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.520s -user 0m3.917s -sys 0m0.620s -Code generation completed in 8 seconds +real 0m5.261s +user 0m4.932s +sys 0m0.298s +Code generation completed in 6 seconds ************************************************************ * * * W E L C O M E to * @@ -185,9 +184,9 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -215,9 +214,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 5e08560167..ee882d1d1f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/.resolved-backend b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", 
f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef 
fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 
-mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index c1a6a8c137..2fe2185d07 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,27 +45,27 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:39:42-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz +INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/models  +--2026-03-24 14:18:17-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. HTTP request sent, awaiting response... 200 Ok Length: 80562 (79K) [application/x-tar] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... .......... 63% 832K 0s - 50K .......... .......... ........ 100% 70.5M=0.06s + 0K .......... .......... .......... .......... .......... 63% 809K 0s + 50K .......... .......... ........ 
100% 1.51M=0.08s -2026-03-10 10:39:43 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] +2026-03-24 14:18:17 (979 KB/s) - ‘tmp.tgz’ saved [80562/80562] SMEFTsim_topU3l_MwScheme_UFO/ SMEFTsim_topU3l_MwScheme_UFO/__init__.py @@ -87,7 +86,7 @@ SMEFTsim_topU3l_MwScheme_UFO/lorentz.py SMEFTsim_topU3l_MwScheme_UFO/vertices.py SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO +convert model /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO retry the load of the model import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles @@ -105,7 +104,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06466126441955566  +DEBUG: model prefixing takes 0.043245792388916016  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -123,13 +122,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.072 s +1 processes with 72 diagrams generated in 2.536 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ 
WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -138,18 +137,18 @@ INFO: Processing color information for process: g g > t t~ t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and 
CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.094 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.115 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.156 s VVV5 VVV5 FFV1 @@ -159,17 +158,17 @@ ALOHA: aloha creates 5 routines in 0.194 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
quit -real 0m4.177s -user 0m2.874s -sys 0m0.228s -Code generation completed in 4 seconds +real 0m6.410s +user 0m3.528s +sys 0m0.119s +Code generation completed in 6 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( 
vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index e0e58acbf4..c6eeee6890 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +546,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.109 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +557,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False -- INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -573,52 +572,52 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s -Wrote files for 16 helas calls in 0.279 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s +Wrote files for 16 helas calls in 0.058 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.114 s +ALOHA: aloha creates 3 routines in 0.106 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.120 s +ALOHA: aloha creates 6 routines in 0.102 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m5.502s -user 0m1.722s -sys 0m0.643s -Code generation completed in 6 seconds +real 0m2.553s +user 0m2.218s +sys 0m0.313s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -639,9 +638,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -669,9 +668,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index ee7d1277ff..fbc0a0e18e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/.resolved-backend b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", 
f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v 
__attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git 
a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0ee162c616..07ce582df4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,13 +546,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.091 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -562,32 +561,32 @@ INFO: Processing color information for process: g g > t1 t1~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. -Generated helas calls for 1 subprocesses (6 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
+Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.113 s +ALOHA: aloha creates 3 routines in 0.121 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
quit -real 0m1.441s -user 0m0.724s -sys 0m0.134s -Code generation completed in 1 seconds +real 0m1.426s +user 0m1.340s +sys 0m0.075s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( 
vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk 
@@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 88e01c7e57..522666832f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +546,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.052 s +1 processes with 3 diagrams generated in 0.078 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +557,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: 
Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -573,49 +572,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.273 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 0.051 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.079 s +ALOHA: aloha creates 4 routines in 0.067 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m5.086s -user 0m1.635s -sys 0m0.704s -Code generation completed in 5 seconds +real 0m2.737s +user 0m2.397s +sys 0m0.315s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -636,9 +635,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -666,9 +665,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 3a6928f635..3a0296b94e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", 
f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( 
vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 7142d5e27a..07f495f92f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F import model MSSM_SLHA2 INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.4310164451599121  +DEBUG: model prefixing takes 0.37195301055908203  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,13 +549,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.054 s +1 processes with 3 diagrams generated in 0.070 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -565,30 +564,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 
'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.082 s +ALOHA: aloha creates 2 routines in 0.072 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m2.103s -user 0m1.223s -sys 0m0.178s +real 0m1.878s +user 0m1.736s +sys 0m0.127s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * 
sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = 
$(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma"