diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml new file mode 100644 index 00000000000..240e3d2f101 --- /dev/null +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -0,0 +1,148 @@ +name: mpi4py (ASAN) + +on: + pull_request: + workflow_dispatch: + inputs: + repository: + description: 'mpi4py repository' + default: 'mpi4py/mpi4py' + required: false + type: string + ref: + description: 'mpi4py branch/tag/SHA' + default: 'master' + required: false + type: string + +permissions: + contents: read + +jobs: + test: + # We need Ubuntu 24.04 (over 22.04) due to a kernel bug, + # see https://github.com/google/sanitizers/issues/856. + runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + MPI4PY_TEST_SPAWN: true + # disable ASAN while building + ASAN_OPTIONS: verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0 + # disable leak detection + LSAN_OPTIONS: detect_leaks=0,exitcode=0 + + steps: + - name: Configure hostname + run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null + if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }} + + - name: Install dependencies + run: sudo apt-get install -y -q + libnuma-dev libasan8 + if: ${{ runner.os == 'Linux' }} + + - name: Checkout Open MPI + uses: actions/checkout@v4 + with: + path: mpi-build + submodules: recursive + + - name: Bootstrap Open MPI + run: ./autogen.pl + working-directory: mpi-build + + # Install into a separate directory (/opt/openmpi) so that we can + # bundle up that tree into an artifact to share with other jobs in + # this github action. Specifically don't use /usr/local, because + # there's a bunch of other stuff already installed in /usr/local, + # and we don't need to include that in our artifact. + - name: Configure Open MPI + run: ./configure + --enable-debug + --disable-dependency-tracking + --disable-sphinx + --disable-mpi-fortran + --disable-oshmem + --disable-silent-rules + --prefix=/opt/openmpi + CFLAGS="-O1 -fno-omit-frame-pointer -g -fsanitize=address" + LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address" + working-directory: mpi-build + + - name: Build MPI + run: make -j $(nproc) + working-directory: mpi-build + + - name: Install MPI + run: sudo make install + working-directory: mpi-build + + - name: Add Open MPI to PATH + run: echo /opt/openmpi/bin >> $GITHUB_PATH + + - name: Tweak MPI + run: | + # Tweak MPI + mca_params="$HOME/.openmpi/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo mpi_param_check = true >> "$mca_params" + echo mpi_show_handle_leaks = true >> "$mca_params" + mca_params="$HOME/.prte/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params" + + - name: Use Python + uses: actions/setup-python@v5 + with: + python-version: 3 + architecture: x64 + + - name: Install Python packages (build) + run: python -m pip install --upgrade + setuptools pip wheel + + - name: Install Python packages (test) + run: python -m pip install --upgrade + numpy cffi pyyaml + + - name: Checkout mpi4py + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repository || 'mpi4py/mpi4py' }} + ref: ${{ inputs.ref }} + + - name: Setting up ASAN environment + # LD_PRELOAD is needed to make sure ASAN is the first thing loaded + # as it will otherwise complain. + # Leak detection is currently disabled because of the size of the report. + # The patcher is disabled because ASAN fails if code mmaps data at fixed + # memory addresses, see https://github.com/open-mpi/ompi/issues/12819. + # ODR violation detection is disabled until #13469 is fixed. + run: | + echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=1 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV + + - name: Show MPI + run: ompi_info --all --all + + - name: Install mpi4py + run: python -m pip install . + env: + CFLAGS: "-O0" + + - name: Test mpi4py (singleton) + run: python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=1) + run: mpiexec -n 1 python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=4) + run: mpiexec -n 4 python test/main.py -v -f -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 diff --git a/ompi/mca/coll/ftagree/coll_ftagree_component.c b/ompi/mca/coll/ftagree/coll_ftagree_component.c index 97e9ca4cee7..8a733ad3357 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_component.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_component.c @@ -38,6 +38,8 @@ int mca_coll_ftagree_era_rebuild = 0; double mca_coll_ftagree_debug_inject_proba = 0.0; #endif +static int mca_coll_ft_agreement; + /* * Local function */ @@ -92,8 +94,6 @@ ftagree_close(void) static int ftagree_register(void) { - int value; - /* Use a low priority, but allow other components to be lower */ mca_coll_ftagree_priority = 30; (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, @@ -103,15 +103,15 @@ ftagree_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_ftagree_priority); - if( ompi_ftmpi_enabled ) value = 1; - else value = 0; /* NOFT: do not initialize ERA */ + if( ompi_ftmpi_enabled ) mca_coll_ft_agreement = 1; + else mca_coll_ft_agreement = 0; /* NOFT: do not initialize ERA */ (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, "agreement", "Agreement algorithm 0: Allreduce (NOT FAULT TOLERANT); 1: Early Returning Consensus (era); 2: Early Terminating Consensus (eta)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, - &value); - switch(value) { + &mca_coll_ft_agreement); + switch(mca_coll_ft_agreement) { case 0: mca_coll_ftagree_algorithm = COLL_FTAGREE_NOFT; opal_output_verbose(6, ompi_ftmpi_output_handle, diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index e3482116c84..9dca14bcc55 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -34,6 +34,8 @@ static int coll_tuned_alltoall_segment_size = 0; static int coll_tuned_alltoall_tree_fanout; static int coll_tuned_alltoall_chain_fanout; +static int deprecated_mca_params = -1; + /* valid values for coll_tuned_alltoall_forced_algorithm */ static const mca_base_var_enum_value_t alltoall_algorithms[] = { {0, "ignore"}, @@ -119,7 +121,6 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoall_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_large_msg", "use pairwise exchange algorithm for messages larger than this value", diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index d8dbb7959e4..6f5a8c57987 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -71,6 +71,8 @@ int ompi_coll_tuned_scatter_large_msg = 0; int ompi_coll_tuned_scatter_min_procs = 0; int ompi_coll_tuned_scatter_blocking_send_ratio = 0; +static int deprecated_mca_params = -1; + /* forced algorithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; @@ -161,7 +163,6 @@ static int tuned_register(void) MCA_BASE_VAR_SCOPE_ALL, &ompi_coll_tuned_init_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_small_msg", "threshold (if supported) to decide if small MSGs alltoall algorithm will be used", diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c747d55ee7d..7b5d1f3c55e 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -104,11 +104,12 @@ bool ompi_ftmpi_enabled = false; #endif /* OPAL_ENABLE_FT_MPI */ static int ompi_stream_buffering_mode = -1; +static int ompi_mpi_ft_verbose = 0; int ompi_comm_verbose_level = 0; int ompi_mpi_register_params(void) { - int value; + int value = 0; #if OPAL_ENABLE_FT_MPI mca_base_var_scope_t ftscope = MCA_BASE_VAR_SCOPE_READONLY; @@ -121,15 +122,14 @@ int ompi_mpi_register_params(void) "Enable UFLM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_4, ftscope, &ompi_ftmpi_enabled); - value = 0; (void) mca_base_var_register ("ompi", "mpi", "ft", "verbose", "Verbosity level of the ULFM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &value); + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_mpi_ft_verbose); #if OPAL_ENABLE_FT_MPI - if( 0 < value ) { + if( 0 < ompi_mpi_ft_verbose ) { ompi_ftmpi_output_handle = opal_output_open(NULL); - opal_output_set_verbosity(ompi_ftmpi_output_handle, value); + opal_output_set_verbosity(ompi_ftmpi_output_handle, ompi_mpi_ft_verbose); } (void) ompi_comm_rbcast_register_params();