From b12db5d3c7fe15e7d95a3c62642cabbdea27665a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 15 Jan 2026 11:58:25 -0500 Subject: [PATCH 01/10] mpi4py CI with ASAN in separate workflow Address sanitizer helps us catch memory bugs even if they don't manifest into faults right away. The instrumentation incurs some overhead so this is run on a reduced set of mpi4py runs. Also tests `ompi_info` and `mpicc`. Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 159 ++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 .github/workflows/ompi_mpi4py_asan.yaml diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml new file mode 100644 index 00000000000..95a519604b4 --- /dev/null +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -0,0 +1,159 @@ +name: mpi4py (ASAN) + +on: + pull_request: + workflow_dispatch: + inputs: + repository: + description: 'mpi4py repository' + default: 'mpi4py/mpi4py' + required: false + type: string + ref: + description: 'mpi4py branch/tag/SHA' + default: 'master' + required: false + type: string + +permissions: + contents: read + +jobs: + test: + # We need Unbuntu 24.04 (over 22.04) due to a kernel bug, + # see https://github.com/google/sanitizers/issues/856. 
+ runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + MPI4PY_TEST_SPAWN: true + # disable ASAN while building + ASAN_OPTIONS: verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0 + # disable leak detection + LSAN_OPTIONS: detect_leaks=0,exitcode=0 + + steps: + - name: Configure hostname + run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null + if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }} + + - name: Install depencencies + run: sudo apt-get install -y -q + libnuma-dev libasan8 + if: ${{ runner.os == 'Linux' }} + + - name: Checkout Open MPI + uses: actions/checkout@v4 + with: + path: mpi-build + submodules: recursive + + - name: Bootstrap Open MPI + run: ./autogen.pl + working-directory: mpi-build + + # Install into a separate directory (/opt/openmpi) so that we can + # bundle up that tree into an artifact to share with other jobs in + # this github action. Specifically don't use /usr/local, because + # there's a bunch of other stuff already installed in /usr/local, + # and we don't need to include that in our artifact. 
+ - name: Configure Open MPI + run: ./configure + --disable-dependency-tracking + --disable-sphinx + --disable-mpi-fortran + --disable-oshmem + --disable-silent-rules + --prefix=/opt/openmpi + CFLAGS="-O2 -fno-omit-frame-pointer -g -fsanitize=address" + LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address" + working-directory: mpi-build + + - name: Build MPI + run: make -j $(nproc) + working-directory: mpi-build + + - name: Install MPI + run: sudo make install + working-directory: mpi-build + + - name: Add Open MPI to PATH + run: echo /opt/openmpi/bin >> $GITHUB_PATH + + - name: Tweak MPI + run: | + # Tweak MPI + mca_params="$HOME/.openmpi/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo mpi_param_check = true >> "$mca_params" + echo mpi_show_handle_leaks = true >> "$mca_params" + mca_params="$HOME/.prte/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params" + + - name: Show MPI + run: ompi_info + + - name: Show MPICC + run: mpicc -show + + - name: Use Python + uses: actions/setup-python@v5 + with: + python-version: 3 + architecture: x64 + + - name: Install Python packages (build) + run: python -m pip install --upgrade + setuptools pip wheel + + - name: Install Python packages (test) + run: python -m pip install --upgrade + numpy cffi pyyaml + + - name: Checkout mpi4py + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repository || 'mpi4py/mpi4py' }} + ref: ${{ inputs.ref }} + + - name: Install mpi4py + run: python -m pip install . + env: + CFLAGS: "-O0" + + - name: Setting up ASAN environment + # LD_PRELOAD is needed to make sure ASAN is the first thing loaded + # as it will otherwise complain. + # Leak detection is currently disabled because of the size of the report. + # The patcher is disabled because ASAN fails if code mmaps data at fixed + # memory addresses, see https://github.com/open-mpi/ompi/issues/12819. 
+ # ODR violation detection is disabled until #13469 is fixed + # Disabling stack use after return detection to reduce slowdown, per + # https://github.com/llvm/llvm-project/issues/64190. + run: | + echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=0 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV + + - name: Test mpi4py (singleton) + run: python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=1) + run: mpiexec -n 1 python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=4) + run: mpiexec -n 4 python test/main.py -v -f -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Show MPI (ASAN) + run: ompi_info + + - name: Show MPICC (ASAN) + run: mpicc -show + From d4ff6ff0cc62d53ed3f900b7eab89b407684893a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 15 Jan 2026 12:37:58 -0500 Subject: [PATCH 02/10] Reduce optimization levels and enable ASAN when building mpi4py Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 28 ++++++++----------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml index 95a519604b4..ad859fb1e71 100644 --- a/.github/workflows/ompi_mpi4py_asan.yaml +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -64,7 +64,7 @@ jobs: --disable-oshmem --disable-silent-rules --prefix=/opt/openmpi - CFLAGS="-O2 -fno-omit-frame-pointer -g -fsanitize=address" + CFLAGS="-O1 -fno-omit-frame-pointer -g -fsanitize=address" LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address" working-directory: mpi-build @@ -90,12 +90,6 @@ jobs: mkdir -p "$(dirname "$mca_params")" echo rmaps_default_mapping_policy = :oversubscribe 
>> "$mca_params" - - name: Show MPI - run: ompi_info - - - name: Show MPICC - run: mpicc -show - - name: Use Python uses: actions/setup-python@v5 with: @@ -116,11 +110,6 @@ jobs: repository: ${{ inputs.repository || 'mpi4py/mpi4py' }} ref: ${{ inputs.ref }} - - name: Install mpi4py - run: python -m pip install . - env: - CFLAGS: "-O0" - - name: Setting up ASAN environment # LD_PRELOAD is needed to make sure ASAN is the first thing loaded # as it will otherwise complain. @@ -136,6 +125,14 @@ jobs: echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV + - name: Show MPI + run: ompi_info + + - name: Install mpi4py + run: python -m pip install . + env: + CFLAGS: "-O0" + - name: Test mpi4py (singleton) run: python test/main.py -v -x TestExcErrhandlerNull if: ${{ true }} @@ -150,10 +147,3 @@ jobs: run: mpiexec -n 4 python test/main.py -v -f -x TestExcErrhandlerNull if: ${{ true }} timeout-minutes: 10 - - - name: Show MPI (ASAN) - run: ompi_info - - - name: Show MPICC (ASAN) - run: mpicc -show - From 6b8dfeb0f2e67aedc9a68b63988ce5709aaa3076 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 15 Jan 2026 12:51:49 -0500 Subject: [PATCH 03/10] Enable leak check, for sanity checking Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml index ad859fb1e71..e5bfc17e970 100644 --- a/.github/workflows/ompi_mpi4py_asan.yaml +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -122,7 +122,7 @@ jobs: run: | echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=0 >> $GITHUB_ENV - echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=1,exitcode=0 >> $GITHUB_ENV echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV - name: Show MPI From 
3fbedbf31cf8e10e796be83cdea7ade39afa3799 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 17 Jan 2026 08:45:51 +0900 Subject: [PATCH 04/10] Disable LSAN and enable stack-use-after-return checks Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml index e5bfc17e970..5204dba825e 100644 --- a/.github/workflows/ompi_mpi4py_asan.yaml +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -121,8 +121,8 @@ jobs: # https://github.com/llvm/llvm-project/issues/64190. run: | echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV - echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=0 >> $GITHUB_ENV - echo LSAN_OPTIONS=detect_leaks=1,exitcode=0 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV - name: Show MPI From e12e230968e47fbf338f359a663c532fb5f961b0 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 17 Jan 2026 09:10:47 +0900 Subject: [PATCH 05/10] ASAN: Configure Open MPI with --enable-debug Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml index 5204dba825e..3f20cd675df 100644 --- a/.github/workflows/ompi_mpi4py_asan.yaml +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -58,6 +58,7 @@ jobs: # and we don't need to include that in our artifact. - name: Configure Open MPI run: ./configure + --enable-debug --disable-dependency-tracking --disable-sphinx --disable-mpi-fortran @@ -121,7 +122,7 @@ jobs: # https://github.com/llvm/llvm-project/issues/64190. 
run: | echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV - echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=1 >> $GITHUB_ENV echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV From f71172c966f371ea6cfaa77e2c351cd7670ae3c3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 17 Jan 2026 10:30:41 +0900 Subject: [PATCH 06/10] Have ompi_info print all variables Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml index 3f20cd675df..a14cb084b0b 100644 --- a/.github/workflows/ompi_mpi4py_asan.yaml +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -127,7 +127,7 @@ jobs: echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV - name: Show MPI - run: ompi_info + run: ompi_info --all --all - name: Install mpi4py run: python -m pip install . From 09a2d300a02ab7c9238c696ce264c1557be65d8a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 27 Jan 2026 16:37:39 +0900 Subject: [PATCH 07/10] Fix local mca variable reference leakage in ompi_mpi_register_params The variable will go out of scope and ASAN flags this in ompi_info. 
Signed-off-by: Joseph Schuchart --- ompi/runtime/ompi_mpi_params.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c747d55ee7d..7b5d1f3c55e 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -104,11 +104,12 @@ bool ompi_ftmpi_enabled = false; #endif /* OPAL_ENABLE_FT_MPI */ static int ompi_stream_buffering_mode = -1; +static int ompi_mpi_ft_verbose = 0; int ompi_comm_verbose_level = 0; int ompi_mpi_register_params(void) { - int value; + int value = 0; #if OPAL_ENABLE_FT_MPI mca_base_var_scope_t ftscope = MCA_BASE_VAR_SCOPE_READONLY; @@ -121,15 +122,14 @@ int ompi_mpi_register_params(void) "Enable UFLM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_4, ftscope, &ompi_ftmpi_enabled); - value = 0; (void) mca_base_var_register ("ompi", "mpi", "ft", "verbose", "Verbosity level of the ULFM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &value); + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_mpi_ft_verbose); #if OPAL_ENABLE_FT_MPI - if( 0 < value ) { + if( 0 < ompi_mpi_ft_verbose ) { ompi_ftmpi_output_handle = opal_output_open(NULL); - opal_output_set_verbosity(ompi_ftmpi_output_handle, value); + opal_output_set_verbosity(ompi_ftmpi_output_handle, ompi_mpi_ft_verbose); } (void) ompi_comm_rbcast_register_params(); From b0f20e1d4385ae1a0f5339fac1227118c3e20ad0 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 27 Jan 2026 16:41:03 +0900 Subject: [PATCH 08/10] Fixes suggested by copilot Signed-off-by: Joseph Schuchart --- .github/workflows/ompi_mpi4py_asan.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml index a14cb084b0b..240e3d2f101 100644 --- a/.github/workflows/ompi_mpi4py_asan.yaml +++ 
b/.github/workflows/ompi_mpi4py_asan.yaml @@ -20,7 +20,7 @@ permissions: jobs: test: - # We need Unbuntu 24.04 (over 22.04) due to a kernel bug, + # We need Ubuntu 24.04 (over 22.04) due to a kernel bug, # see https://github.com/google/sanitizers/issues/856. runs-on: ubuntu-24.04 timeout-minutes: 30 @@ -36,7 +36,7 @@ jobs: run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }} - - name: Install depencencies + - name: Install dependencies run: sudo apt-get install -y -q libnuma-dev libasan8 if: ${{ runner.os == 'Linux' }} @@ -117,9 +117,7 @@ jobs: # Leak detection is currently disabled because of the size of the report. # The patcher is disabled because ASAN fails if code mmaps data at fixed # memory addresses, see https://github.com/open-mpi/ompi/issues/12819. - # ODR violation detection is disabled until #13469 is fixed - # Disabling stack use after return detection to reduce slowdown, per - # https://github.com/llvm/llvm-project/issues/64190. + # ODR violation detection is disabled until #13469 is fixed. 
run: | echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=1 >> $GITHUB_ENV From 019c605ef9455e15d09cb1e78f155b8dc9eeb383 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jan 2026 10:39:54 +0900 Subject: [PATCH 09/10] Fix local variable reference leak in coll/tuned Signed-off-by: Joseph Schuchart --- ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c | 3 ++- ompi/mca/coll/tuned/coll_tuned_component.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index e3482116c84..9dca14bcc55 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -34,6 +34,8 @@ static int coll_tuned_alltoall_segment_size = 0; static int coll_tuned_alltoall_tree_fanout; static int coll_tuned_alltoall_chain_fanout; +static int deprecated_mca_params = -1; + /* valid values for coll_tuned_alltoall_forced_algorithm */ static const mca_base_var_enum_value_t alltoall_algorithms[] = { {0, "ignore"}, @@ -119,7 +121,6 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoall_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_large_msg", "use pairwise exchange algorithm for messages larger than this value", diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index d8dbb7959e4..6f5a8c57987 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -71,6 +71,8 @@ int ompi_coll_tuned_scatter_large_msg = 0; int ompi_coll_tuned_scatter_min_procs = 0; int ompi_coll_tuned_scatter_blocking_send_ratio = 0; +static int deprecated_mca_params = -1; + /* 
forced algorithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; @@ -161,7 +163,6 @@ static int tuned_register(void) MCA_BASE_VAR_SCOPE_ALL, &ompi_coll_tuned_init_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_small_msg", "threshold (if supported) to decide if small MSGs alltoall algorithm will be used", From 97f7a936b556f1970f2f3ca48f60886b36c66419 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jan 2026 12:10:41 +0900 Subject: [PATCH 10/10] Fix local variable reference leak in coll/ft Signed-off-by: Joseph Schuchart --- ompi/mca/coll/ftagree/coll_ftagree_component.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ompi/mca/coll/ftagree/coll_ftagree_component.c b/ompi/mca/coll/ftagree/coll_ftagree_component.c index 97e9ca4cee7..8a733ad3357 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_component.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_component.c @@ -38,6 +38,8 @@ int mca_coll_ftagree_era_rebuild = 0; double mca_coll_ftagree_debug_inject_proba = 0.0; #endif +static int mca_coll_ft_agreement; + /* * Local function */ @@ -92,8 +94,6 @@ ftagree_close(void) static int ftagree_register(void) { - int value; - /* Use a low priority, but allow other components to be lower */ mca_coll_ftagree_priority = 30; (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, @@ -103,15 +103,15 @@ ftagree_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_ftagree_priority); - if( ompi_ftmpi_enabled ) value = 1; - else value = 0; /* NOFT: do not initialize ERA */ + if( ompi_ftmpi_enabled ) mca_coll_ft_agreement = 1; + else mca_coll_ft_agreement = 0; /* NOFT: do not initialize ERA */ (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, "agreement", "Agreement algorithm 0: Allreduce (NOT 
FAULT TOLERANT); 1: Early Returning Consensus (era); 2: Early Terminating Consensus (eta)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, - &value); - switch(value) { + &mca_coll_ft_agreement); + switch(mca_coll_ft_agreement) { case 0: mca_coll_ftagree_algorithm = COLL_FTAGREE_NOFT; opal_output_verbose(6, ompi_ftmpi_output_handle,