diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index bfd34580eb..7fac9eda15 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit bfd34580eb59c2a027a502c89995e682a70a95b9 +Subproject commit 7fac9eda15ce8f4c8e9eb01704bfb3c0c3b558b8 diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index f8930a863f..7e54de7d8d 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0051648616790771484  +DEBUG: model prefixing takes 0.0028493404388427734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,7 +146,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -158,10 +157,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -173,22 +172,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -Wrote files for 8 helas calls in 0.285 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +Wrote files for 8 helas calls in 0.063 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.122 s +ALOHA: aloha creates 3 routines in 0.123 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.152 s +ALOHA: aloha creates 7 routines in 0.122 s FFV1 FFV1 FFV2 @@ -197,32 +196,32 @@ ALOHA: aloha creates 7 
routines in 0.152 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.542s -user 0m1.246s -sys 0m0.587s -Code generation completed in 5 seconds +real 0m1.894s +user 0m1.598s +sys 0m0.266s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -243,9 +242,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -273,9 +272,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 7aed5df7db..c3d8b1e1bd 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.resolved-backend b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture 
(x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", 
"detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype 
fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 
--- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index bdea67b952..50cf4c47eb 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,5 +1,5 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode +('WARNING: loading of madgraph too slow!!!', 1.129610300064087) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. It has been validated for the last time with version: 3.6.5 @@ -16,7 +16,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +46,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +55,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005957365036010742  +DEBUG: model prefixing takes 0.0025908946990966797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -147,13 +147,13 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.002 s +1 processes with 2 diagrams generated in 0.003 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -162,17 +162,17 @@ INFO: Processing color information for process: e+ e- > mu+ mu- @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= 
me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.002 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.171 s +ALOHA: aloha creates 4 routines in 0.137 s FFV1 FFV1 FFV2 @@ -181,17 +181,17 @@ ALOHA: aloha creates 4 routines in 0.171 s FFV4 FFV2_4 FFV2_4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m1.151s -user 0m0.372s -sys 0m0.155s -Code generation completed in 1 seconds +real 0m1.595s +user 0m1.385s +sys 0m0.131s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 
) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 
dbae24afe0..086655a6c3 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005540609359741211  +DEBUG: model prefixing takes 0.0028896331787109375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +158,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_ INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -174,49 +173,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: 
diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.266 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 0.050 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.089 s +ALOHA: aloha creates 2 routines in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.093 s +ALOHA: aloha creates 4 routines in 0.066 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.687s -user 0m1.163s -sys 0m0.619s -Code generation completed in 5 seconds +real 0m1.717s +user 0m1.422s +sys 0m0.279s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * @@ -237,9 +236,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -267,9 +266,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 8b331b055f..af833f8d84 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT 
r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See 
https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute 
capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq 
(,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 
'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git 
a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * 
sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 20cc72fd46..84563e6016 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00434565544128418  +DEBUG: model prefixing takes 0.0028569698333740234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -163,30 +162,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 
'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.091 s +ALOHA: aloha creates 2 routines in 0.076 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.992s -user 0m0.334s -sys 0m0.123s -Code generation completed in 1 seconds +real 0m0.418s +user 0m0.368s +sys 0m0.045s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( 
fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection 
UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 332a0806f1..e02a13bee6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005877494812011719  +DEBUG: model prefixing takes 0.0028417110443115234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.004 s +1 processes with 3 diagrams generated in 0.006 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -156,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.010 s +1 processes with 16 diagrams generated in 0.013 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -167,10 +166,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -184,9 +183,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 
5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -195,25 +194,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s -Wrote files for 46 helas calls in 0.502 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s +Wrote files for 46 helas calls in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.190 s +ALOHA: aloha 
creates 5 routines in 0.164 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.187 s +ALOHA: aloha creates 10 routines in 0.147 s VVV1 VVV1 FFV1 @@ -223,32 +222,32 @@ ALOHA: aloha creates 10 routines in 0.187 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.233s -user 0m1.496s -sys 0m0.718s -Code generation completed in 5 seconds +real 0m2.211s +user 0m1.861s +sys 0m0.323s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -269,9 +268,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -299,9 +298,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index 30bd3794c3..509884e6e3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/.resolved-backend b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/.resolved-backend b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = 
self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files 
a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v 
__attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index b836987bc5..655f6496a9 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551295280456543  +DEBUG: model prefixing takes 0.002906322479248047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.014 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +158,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -174,25 +173,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 
9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.046 s -Wrote files for 36 helas calls in 0.368 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.027 s +Wrote files for 36 helas calls in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.190 s +ALOHA: aloha creates 5 routines in 0.166 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.184 s VVV1 VVV1 FFV1 @@ -202,32 +201,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m4.945s -user 0m1.513s -sys 0m0.678s -Code generation completed in 5 seconds +real 0m2.048s +user 0m1.772s +sys 0m0.254s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -248,9 +247,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -278,9 +277,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 0fe3df08d4..4f43fc7e75 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 
3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/.resolved-backend b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See 
https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute 
capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq 
(,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): 
cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git 
a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( 
neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ba99f30bdf..a58fe51240 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005433082580566406  +DEBUG: model prefixing takes 0.0028769969940185547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.012 s +1 processes with 16 diagrams generated in 0.015 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -163,18 +162,18 @@ INFO: Processing color information for process: g g > t t~ g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.045 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.026 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.206 s +ALOHA: aloha creates 5 routines in 0.167 s VVV1 VVV1 FFV1 @@ -184,17 +183,17 @@ ALOHA: aloha creates 5 routines in 0.206 s VVVV1 VVVV3 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m1.176s -user 0m0.468s -sys 0m0.131s -Code generation completed in 1 seconds +real 0m0.546s +user 0m0.501s +sys 0m0.038s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * 
sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host 
detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index ea9db152a3..89fd13be72 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004921674728393555  +DEBUG: model prefixing takes 0.002771615982055664  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.080 s +1 processes with 123 diagrams generated in 0.105 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,10 +158,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vecto INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -174,25 +173,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 
36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.223 s -Wrote files for 222 helas calls in 0.654 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 
36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.306 s +Wrote files for 222 helas calls in 0.423 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.219 s +ALOHA: aloha creates 5 routines in 0.151 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.197 s +ALOHA: aloha creates 10 routines in 0.166 s VVV1 VVV1 FFV1 @@ -205,32 +204,32 @@ ALOHA: aloha creates 10 routines in 0.197 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m5.675s -user 0m2.118s -sys 0m0.681s -Code generation completed in 6 seconds +real 0m3.147s +user 0m2.809s +sys 0m0.313s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -251,9 +250,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -281,9 +280,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 5fe0cb01be..ffcd35ce8c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT 
r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/.resolved-backend b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See 
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) 
+ ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone 
bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ 
-38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_FDG.pkl 
and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV 
* sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7ff994126b..4465db9974 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003983020782470703  +DEBUG: model prefixing takes 0.002856731414794922  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.081 s +1 processes with 123 diagrams generated in 0.167 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -163,18 +162,18 @@ INFO: Processing color information for process: g g > t t~ g g @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.216 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.271 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.153 s VVV1 VVV1 FFV1 @@ -187,17 +186,17 @@ ALOHA: aloha creates 5 routines in 0.204 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.544s -user 0m0.774s -sys 0m0.144s -Code generation completed in 2 seconds +real 0m1.043s +user 0m0.977s +sys 0m0.055s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( 
neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword 
$(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ebb525b6f1..708b742d29 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0036034584045410156  +DEBUG: model prefixing takes 0.0028352737426757812  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,7 +147,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.963 s +1 processes with 1240 diagrams generated in 1.434 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -159,16 +158,16 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 5s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -176,25 +175,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 
113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 
337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 
581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 
810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 
893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 
178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 
413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 
648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 
884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} 
[model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 3.355 s -Wrote files for 2281 helas calls in 9.598 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 
195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 
395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 
595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 
795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 
56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 
253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 
453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 
653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 
1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.232 s +Wrote files for 2281 helas calls in 10.524 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.271 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.231 s +ALOHA: aloha creates 10 routines in 0.148 s VVV1 VVV1 FFV1 @@ -207,32 +206,32 @@ ALOHA: aloha creates 10 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for 
/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m20.546s -user 0m16.458s -sys 0m0.884s -Code generation completed in 20 seconds +real 0m21.162s +user 0m20.720s +sys 0m0.378s +Code generation completed in 21 seconds ************************************************************ * * * W E L C O M E to * @@ -253,9 +252,9 @@ Code generation completed in 20 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -283,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 08a07273bc..a97972097e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/.resolved-backend b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect 
architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' 
| sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend 
has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", 
f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( 
ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk 
b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 66cd67a19b..0b8e756576 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -55,7 +54,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0029516220092773438  +DEBUG: model prefixing takes 0.0029799938201904297  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -148,13 +147,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 0.953 s +1 processes with 1240 diagrams generated in 1.367 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 @@ -163,18 +162,18 @@ INFO: Processing color information for process: g g > t t~ g g g @1 DEBUG: type(fortran_model)= [output.py at 
line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 3.379 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.242 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.214 s +ALOHA: aloha creates 5 routines in 0.190 s VVV1 VVV1 FFV1 @@ -187,17 +186,17 @@ ALOHA: aloha creates 5 routines in 0.214 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m7.419s -user 0m6.626s -sys 0m0.185s -Code generation completed in 7 seconds +real 0m8.313s +user 0m8.187s +sys 0m0.095s +Code generation completed in 8 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( 
fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 
248fa16d65..cc9ea4ac2d 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0052111148834228516  +DEBUG: model prefixing takes 0.003007173538208008  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +162,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.041 s +8 processes with 40 diagrams generated in 0.053 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -174,10 +173,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -197,9 +196,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} 
[model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -208,50 +207,50 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.017 s -Wrote files for 32 helas calls in 0.625 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +Wrote files for 32 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.094 s +ALOHA: aloha creates 2 routines in 0.086 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.080 s +ALOHA: aloha creates 4 routines in 0.065 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m5.076s -user 0m1.391s -sys 0m0.672s -Code generation completed in 5 seconds +real 0m1.975s +user 0m1.676s +sys 0m0.274s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -272,9 +271,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -302,9 +301,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index aba2f10b06..38476653c7 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 
3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/.resolved-backend b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/.resolved-backend b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], 
**opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( 
vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- 
a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index e76b814911..1518ee06b2 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757570266723633  +DEBUG: model prefixing takes 0.0035910606384277344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,13 +162,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.040 s +8 processes with 40 diagrams generated in 0.054 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -186,40 +185,40 @@ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ DEBUG: type(fortran_model)= 
[output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  DEBUG: type(subproc_group)= [output.py at line 223]  DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=1 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 
'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.016 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.021 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.090 s +ALOHA: aloha creates 2 routines in 0.078 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m1.337s -user 0m0.375s -sys 0m0.160s -Code generation completed in 2 seconds +real 0m0.562s +user 0m0.491s +sys 0m0.063s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) 
); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index 
f374f8f313..d0de7d3966 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,14 +45,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -120,7 +119,7 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.005 s Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -131,10 +130,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -146,55 +145,55 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group 
gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s -Wrote files for 12 helas calls in 0.268 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 0.079 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.221 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.152 s +ALOHA: aloha creates 8 routines in 0.131 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m4.654s -user 0m1.223s -sys 0m0.605s -Code generation completed in 5 seconds +real 0m2.373s +user 0m1.936s +sys 0m0.410s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -215,9 +214,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -245,9 +244,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 84c16b4cf4..f3c02d2ec5 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/.resolved-backend b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", 
f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( 
vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git 
a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index e04a2da479..37bd64eb68 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,26 +45,26 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:38:21-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz -Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 -Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. +INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/models  +--2026-03-24 14:16:55-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz +Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.2.143 +Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.2.143|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 50876 (50K) [application/x-gzip] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... ......... 100% 2.92M=0.02s + 0K .......... .......... .......... .......... ......... 
100% 848K=0.06s -2026-03-10 10:38:22 (2.92 MB/s) - ‘tmp.tgz’ saved [50876/50876] +2026-03-24 14:16:55 (848 KB/s) - ‘tmp.tgz’ saved [50876/50876] heft/ heft/write_param_card.py @@ -102,7 +101,7 @@ INFO: load particles INFO: load vertices WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.007684946060180664  +DEBUG: model prefixing takes 0.0035004615783691406  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -168,13 +167,13 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.003 s +1 processes with 4 diagrams generated in 0.004 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -183,34 +182,34 @@ INFO: Processing color 
information for process: g g > b b~ HIG<=1 HIW<=1 @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
-Generated helas calls for 1 subprocesses (4 diagrams) in 0.005 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.145 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m1.669s -user 0m0.522s -sys 0m0.180s -Code generation completed in 2 seconds +real 0m0.816s +user 0m0.527s +sys 0m0.083s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( 
neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt 
b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 5067c06ff1..eb52b08042 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003014802932739258  +DEBUG: model prefixing takes 0.0029153823852539062  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +177,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.056 s +4 processes with 8 diagrams generated in 0.072 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -220,7 +219,7 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.331 s +12 processes with 144 diagrams generated in 0.421 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -231,10 +230,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --v INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -268,9 +267,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -279,9 +278,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -290,9 +289,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -301,9 +300,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -312,9 +311,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -323,9 +322,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1749]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -334,9 +333,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -345,21 +344,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1748]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.104 s -Wrote files for 212 helas calls in 2.138 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1749]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.129 s +Wrote files for 212 helas calls in 0.520 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.123 s +ALOHA: aloha creates 3 routines in 0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.122 s +ALOHA: aloha creates 6 routines in 0.125 s FFV1 FFV1 FFV1 @@ -367,32 +366,32 @@ ALOHA: aloha creates 6 routines in 0.122 s FFV2 FFV2 VVV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m8.122s -user 0m2.522s -sys 0m1.075s -Code generation completed in 8 seconds +real 0m3.838s +user 0m3.253s +sys 0m0.542s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -413,9 +412,9 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -443,9 +442,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 3f652ded8d..df2a5aba90 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/.resolved-backend 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/.resolved-backend b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): 
cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 
Binary files a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + 
typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index a8e3a6d67a..276b354ed7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. 
@@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004999399185180664  +DEBUG: model prefixing takes 0.0027861595153808594  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,7 +164,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.015 s +5 processes with 7 diagrams generated in 0.020 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
@@ -205,7 +204,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.070 s +13 processes with 76 diagrams generated in 0.088 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -371,7 +370,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 0.941 s +65 processes with 1119 diagrams generated in 1.258 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -382,10 +381,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vec INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -496,9 +495,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 
52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 
67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -507,9 +506,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 
25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -518,9 +517,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -529,9 +528,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -540,9 +539,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -551,9 +550,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -562,9 +561,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -573,9 +572,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -584,9 +583,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -595,9 +594,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -606,9 +605,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -617,9 +616,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -628,9 +627,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -639,9 +638,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -650,9 +649,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -661,9 +660,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1749]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . 
@@ -672,9 +671,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1314]  INFO: Creating files in directory . @@ -683,25 +682,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1748]  -Generated helas calls for 18 subprocesses (372 diagrams) in 0.671 s -Wrote files for 810 helas calls in 5.590 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1749]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.851 s +Wrote files for 810 helas calls in 1.837 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.216 s +ALOHA: aloha creates 5 routines in 0.160 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates 
VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.194 s +ALOHA: aloha creates 10 routines in 0.178 s VVV1 VVV1 FFV1 @@ -714,32 +713,32 @@ ALOHA: aloha creates 10 routines in 0.194 s VVVV3 VVVV4 VVVV4 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m15.089s -user 0m5.988s -sys 0m1.827s -Code generation completed in 16 seconds +real 0m8.487s +user 0m7.467s +sys 0m0.937s +Code generation completed in 8 seconds ************************************************************ * * * W E L C O M E to * @@ -760,9 +759,9 @@ Code generation completed in 16 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -790,9 +789,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index fa1bcf88f4..25caee75ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/.resolved-backend b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", 
"detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl deleted file mode 100644 index bf5a732979..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_FDG.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl deleted file mode 100644 index 3e55c479e2..0000000000 Binary files a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/py3_model_Feynman.pkl and /dev/null differ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else 
- typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index 
977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 9a1af87664..23144d1091 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,14 +45,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -70,7 +69,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06830000877380371  +DEBUG: model prefixing takes 0.04164624214172363  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -85,7 +84,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.021 s +1 processes with 72 diagrams generated in 2.315 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -96,10 +95,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False - INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File 
exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -111,25 +110,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 
43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.097 s -Wrote files for 119 helas calls in 0.474 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.145 s +Wrote files for 119 helas calls in 0.278 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines 
-ALOHA: aloha creates 5 routines in 0.204 s +ALOHA: aloha creates 5 routines in 0.150 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.193 s +ALOHA: aloha creates 10 routines in 0.182 s VVV5 VVV5 FFV1 @@ -139,32 +138,32 @@ ALOHA: aloha creates 10 routines in 0.193 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. 
and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.520s -user 0m3.917s -sys 0m0.620s -Code generation completed in 8 seconds +real 0m5.261s +user 0m4.932s +sys 0m0.298s +Code generation completed in 6 seconds ************************************************************ * * * W E L C O M E to * @@ -185,9 +184,9 @@ Code generation completed in 8 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -215,9 +214,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 5e08560167..ee882d1d1f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/.resolved-backend b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", 
f"BACKEND=cppauto", f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef 
fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 
-mfma" diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index c1a6a8c137..2fe2185d07 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,27 +45,27 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models  ---2026-03-10 10:39:42-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz +INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/models  +--2026-03-24 14:18:17-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. HTTP request sent, awaiting response... 200 Ok Length: 80562 (79K) [application/x-tar] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... .......... 63% 832K 0s - 50K .......... .......... ........ 100% 70.5M=0.06s + 0K .......... .......... .......... .......... .......... 63% 809K 0s + 50K .......... .......... ........ 
100% 1.51M=0.08s -2026-03-10 10:39:43 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] +2026-03-24 14:18:17 (979 KB/s) - ‘tmp.tgz’ saved [80562/80562] SMEFTsim_topU3l_MwScheme_UFO/ SMEFTsim_topU3l_MwScheme_UFO/__init__.py @@ -87,7 +86,7 @@ SMEFTsim_topU3l_MwScheme_UFO/lorentz.py SMEFTsim_topU3l_MwScheme_UFO/vertices.py SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO +convert model /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO retry the load of the model import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles @@ -105,7 +104,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.06466126441955566  +DEBUG: model prefixing takes 0.043245792388916016  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -123,13 +122,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.072 s +1 processes with 72 diagrams generated in 2.536 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ 
WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -138,18 +137,18 @@ INFO: Processing color information for process: g g > t t~ t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and 
CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.094 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.115 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.194 s +ALOHA: aloha creates 5 routines in 0.156 s VVV5 VVV5 FFV1 @@ -159,17 +158,17 @@ ALOHA: aloha creates 5 routines in 0.194 s VVVV1 VVVV9 VVVV10 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
quit -real 0m4.177s -user 0m2.874s -sys 0m0.228s -Code generation completed in 4 seconds +real 0m6.410s +user 0m3.528s +sys 0m0.119s +Code generation completed in 6 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( 
vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index e0e58acbf4..c6eeee6890 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +546,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.109 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +557,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False -- INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists 
/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -573,52 +572,52 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s -Wrote files for 16 helas calls in 0.279 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s +Wrote files for 16 helas calls in 0.058 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.114 s +ALOHA: aloha creates 3 routines in 0.106 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.120 s +ALOHA: aloha creates 6 routines in 0.102 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. 
+FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m5.502s -user 0m1.722s -sys 0m0.643s -Code generation completed in 6 seconds +real 0m2.553s +user 0m2.218s +sys 0m0.313s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -639,9 +638,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -669,9 +668,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index ee7d1277ff..fbc0a0e18e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/.resolved-backend b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capabilities (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to highest + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", 
f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v 
__attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git 
a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0ee162c616..07ce582df4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,13 +546,13 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.055 s +1 processes with 6 diagrams generated in 0.091 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -562,32 +561,32 @@ INFO: Processing color information for process: g g > t1 t1~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 
'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. -Generated helas calls for 1 subprocesses (6 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
+Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.113 s +ALOHA: aloha creates 3 routines in 0.121 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
quit -real 0m1.441s -user 0m0.724s -sys 0m0.134s -Code generation completed in 1 seconds +real 0m1.426s +user 0m1.340s +sys 0m0.075s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( 
vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk 
@@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 88e01c7e57..522666832f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -547,7 +546,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.052 s +1 processes with 3 diagrams generated in 0.078 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -558,10 +557,10 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --ve INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: 
Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -573,49 +572,49 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1723]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1747]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s -Wrote files for 10 helas calls in 0.273 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1724]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1748]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1749]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +Wrote files for 10 helas calls in 0.051 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.079 s +ALOHA: aloha creates 4 routines in 0.067 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +Output to directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. 
Type "launch" to generate events from this process, or see -/shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m5.086s -user 0m1.635s -sys 0m0.704s -Code generation completed in 5 seconds +real 0m2.737s +user 0m2.397s +sys 0m0.315s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -636,9 +635,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt @@ -666,9 +665,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 712b1897aa..7795e7e382 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -255,7 +255,7 @@ # pineappl = pineappl -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /shared/git/madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 3a6928f635..3a0296b94e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -9,7 +9,7 @@ #* * #* * #* VERSION 3.7.0 2026-01-05 * -#* GIT r991-14-g6dba8f068 3.7.1 * +#* GIT r991-19-g7fac9eda1 3.7.1 * #* * #* The MadGraph5_aMC@NLO Development Team - Find us at * #* https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend new file mode 100644 index 0000000000..f26d33068e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/.resolved-backend @@ -0,0 +1 @@ +cppavx2 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) 
UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. + # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). 
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ 
-531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." 
+ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 262d39a736..3bd0c281fc 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -38,11 +38,26 @@ def compile(self, *args, **opts): cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py - logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in ['cpp', 'cppauto']: + backend_log = pjoin(opts["cwd"], ".resolved-backend") + # try to remove old file if present + try: + os.remove(backend_log) + except FileNotFoundError: + pass + misc.compile(["-f", "cudacpp.mk", f"BACKEND=cppauto", 
f"BACKEND_LOG={backend_log}", "detect-backend"], **opts) + try: + with open(backend_log, "r") as f: + resolved_backend = f.read().strip() + logger.info(f"Backend '{cudacpp_backend}' resolved as '{resolved_backend}'") + cudacpp_backend = resolved_backend + except FileNotFoundError: + raise RuntimeError("Could not resolve cudacpp_backend=cppauto|cpp; ensure Makefile detection runs properly.") + logger.info(f"Building madevent in madevent_interface.py with '{cudacpp_backend}' matrix elements") if cudacpp_backend in cudacpp_supported_backends : args[0][0] = 'madevent_' + cudacpp_backend + '_link' else: - raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + raise Exception(f"Invalid cudacpp_backend='{cudacpp_backend}': supported backends are [ '" + "', '".join(cudacpp_supported_backends) + "' ]") return misc.compile(nb_core=self.options['nb_core'], *args, **opts) else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( 
vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 7142d5e27a..07f495f92f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,4 +1,3 @@ -WARNING:root:Support for Python3.9 (and below) has been dropped since end of 2025. Please consider update your version of Python. Continue at your own risk  Running MG5 in debug mode Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT Plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.7.0. @@ -16,7 +15,7 @@ It has been validated for the last time with version: 3.6.5 * * * * * * * VERSION 3.7.0 2026-01-05 * -* GIT r991-14-g6dba8f068 3.7.1 * +* GIT r991-19-g7fac9eda1 3.7.1 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * http://madgraph.phys.ucl.ac.be/ * @@ -46,7 +45,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt Using default gzip "pigz". Set another one in ./input/mg5_configuration.txt -import /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -54,7 +53,7 @@ set zerowidth_tchannel F import model MSSM_SLHA2 INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.4310164451599121  +DEBUG: model prefixing takes 0.37195301055908203  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,13 +549,13 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.054 s +1 processes with 3 diagrams generated in 0.070 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -565,30 +564,30 @@ INFO: Processing color information for process: g g > t t~ @1 DEBUG: type(fortran_model)= [output.py at line 224]  DEBUG: type(me)= me=0 [output.py at line 225]  DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 
'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. -Generated helas calls for 1 subprocesses (3 diagrams) in 0.004 s +INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.082 s +ALOHA: aloha creates 2 routines in 0.072 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/git/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/ci_3.7.1/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m2.103s -user 0m1.223s -sys 0m0.178s +real 0m1.878s +user 0m1.736s +sys 0m0.127s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..7969c42777 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -41,6 +41,7 @@ UNAME_S := $(shell uname -s) # Detect architecture (x86_64, ppc64le...) UNAME_P := $(shell uname -p) ###$(info UNAME_P='$(UNAME_P)') +UNAME_M := $(shell uname -m) #------------------------------------------------------------------------------- @@ -57,10 +58,11 @@ endif #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +BACKEND_ORIG := $(BACKEND) ifeq ($(BACKEND),cppauto) ifeq ($(UNAME_P),ppc64le) override BACKEND = cppsse4 - else ifneq (,$(filter $(UNAME_P),arm aarch64)) + else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) override BACKEND = cppsse4 else ifeq ($(wildcard /proc/cpuinfo),) override BACKEND = cppnone @@ -84,6 +86,11 @@ else $(info BACKEND='$(BACKEND)') endif +# Create file with the resolved backend in case user chooses 'cppauto' +BACKEND_LOG ?= .resolved-backend +ifneq ($(BACKEND_ORIG),$(BACKEND)) + $(file >$(BACKEND_LOG),$(BACKEND)) +endif #------------------------------------------------------------------------------- #=== Configure the C++ compiler @@ -184,15 +191,32 @@ ifeq ($(BACKEND),cuda) # NVidia CUDA architecture flags # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # This will embed device code for 70, and PTX for 70+. 
+ # Default: detect all compute capability (e.g., "8.0", "8.6", "9.0"), unique and sorted from lowest to higherst + # then we embed device code for each compute capability, and for the highest PTX (forward-compatible) + # use nvidia-smi and validate output with grep before going forward + DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un) # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, - GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + MADGRAPH_CUDA_ARCHITECTURE ?= $(foreach arch,$(DETECTED_CC),$(arch)$(comma)) + # Convert to space-separated list for looping + MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)) + + # Fallback if detection failed (box has CUDA selected but probe failed) + ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),) + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster) + # This will embed device code for 70, and PTX for 70+ + MADGRAPH_CUDA_ARCHITECTURE := 70 + MADGRAPH_CUDA_ARCH_LIST := 70 + $(info Automatic compute capability 
detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE)) + $(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=) + endif + + # Build for every detected SM, and add one PTX for the highest SM (forward-compatibility) + HIGHEST_SM := $(lastword $(MADGRAPH_CUDA_ARCH_LIST)) + GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch)) + GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) + GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX) GPUFLAGS += $(GPUARCHFLAGS) # Other NVidia-specific flags @@ -531,7 +555,7 @@ ifeq ($(UNAME_P),ppc64le) else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) endif -else ifeq ($(UNAME_P),arm) # ARM on Apple silicon +else ifeq ($(UNAME_M),arm64) # ARM on Apple silicon ifeq ($(BACKEND),cppnone) # this internally undefines __ARM_NEON override AVXFLAGS = -DMGONGPU_NOARMNEON else ifeq ($(BACKEND),cppsse4) # __ARM_NEON is always defined on Apple silicon @@ -543,7 +567,7 @@ else ifeq ($(UNAME_P),arm) # ARM on Apple silicon else ifeq ($(BACKEND),cpp512z) $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) endif -else ifeq ($(UNAME_P),aarch64) # ARM on Linux +else ifeq ($(UNAME_M),aarch64) # ARM on Linux ifeq ($(BACKEND),cppnone) # +nosimd ensures __ARM_NEON is absent override AVXFLAGS = -march=armv8-a+nosimd else ifeq ($(BACKEND),cppsse4) # +simd ensures __ARM_NEON is present (128 width Q/quadword registers) @@ -1111,7 +1135,7 @@ bld512z: ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 else @@ -1254,4 +1278,9 @@ endif cuda-memcheck: all.$(TAG) $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck 
--check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +# Detect backend (to be used in case of 'cppauto' to give info to the user) +.PHONY: detect-backend +detect-backend: + @echo "Resolved backend has already been written to $(BACKEND_LOG) at parse time." + #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk index d2c3b0c747..b9d17f0e38 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp_overlay.mk @@ -16,6 +16,7 @@ endif # Basic uname helpers (if not already set) UNAME_S ?= $(shell uname -s) UNAME_P ?= $(shell uname -p) +UNAME_M ?= $(shell uname -m) # Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html FFLAGS+= -cpp @@ -225,7 +226,7 @@ madevent_%_link: # Cudacpp bldall targets ifeq ($(UNAME_P),ppc64le) bldavxs: bldnone bldsse4 -else ifneq (,$(filter $(UNAME_P),arm aarch64)) +else ifneq (,$(filter $(UNAME_M),arm64 aarch64)) bldavxs: bldnone bldsse4 else bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..73719032b3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -54,7 +54,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR #else - typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR + typedef fptype fptype_v __attribute__( ( vector_size( neppV * 
sizeof( fptype ) ), aligned( neppV * sizeof( fptype ) ) ) ); // RRRR #endif // Mixed fptypes #537: float for color algebra and double elsewhere @@ -65,7 +65,7 @@ namespace mg5amcCpu #ifdef __clang__ typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR #else - typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ), aligned( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR #endif #else typedef fptype_v fptype2_v; @@ -123,14 +123,14 @@ namespace mg5amcCpu #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc - typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ), aligned( neppV * sizeof( unsigned int ) ) ) ); #if defined MGONGPU_FPTYPE_DOUBLE - typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ), aligned( neppV * sizeof( long int ) ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ), aligned( neppV * sizeof( int ) ) ) ); // bbbb #endif #endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index 977c75fc48..73dce678ef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -8,11 +8,12 @@ THISDIR = 
$(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Host detection UNAME_S := $(shell uname -s) UNAME_P := $(shell uname -p) +UNAME_M := $(shell uname -m) # Only add AVX2/FMA on non-mac and non-ARM hosts ifeq ($(UNAME_S),Darwin) GTEST_CMAKE_FLAGS := -else ifeq ($(UNAME_P),aarch64) +else ifeq ($(UNAME_M),aarch64) GTEST_CMAKE_FLAGS := else GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma"