diff --git a/.ci/community-jenkins/Jenkinsfile b/.ci/community-jenkins/Jenkinsfile index f4b305f1d66..2c20d630ac1 100644 --- a/.ci/community-jenkins/Jenkinsfile +++ b/.ci/community-jenkins/Jenkinsfile @@ -14,7 +14,6 @@ // // // WORKSPACE Layout: -// autotools-install/ Autotools install for the builder // ompi/ Open MPI source tree // We if we push changes to a PR, we don't need to keep old jobs running, so @@ -56,9 +55,26 @@ println('Tests Completed') // although currently we only support the one stage of "everything", where each // build stage is a map of different configurations to test. def prepare_check_stages() { - def configure_options = ["--disable-dlopen", "--disable-oshmem", "--enable-builtin-atomic", "--enable-ipv6"] - def compilers = ["clang10", "gcc7", "gcc8", "gcc9", "gcc10"] - def platforms = ["amazon_linux_2", "amazon_linux_2-arm64", "rhel8"] + def configure_options = [ + "--disable-dlopen", + "--disable-oshmem", + "--enable-builtin-atomic", + "--enable-ipv6" + ] + def compilers = [ + "gcc14", + "clang18" + ] + def platforms = [ + "amazon_linux_2", + "amazon_linux_2-arm64", + "rhel8", + "amazon_linux_2023-arm64", + "amazon_linux_2023-x86_64", + "ubuntu_20.04", + "ubuntu_24.04-arm64", + "ubuntu_24.04-x86_64" + ] def check_stages_list = [] // Build everything stage @@ -79,6 +95,7 @@ def prepare_check_stages() { } build_parallel_map.put("distcheck", prepare_build("distcheck", "tarball_build", "--distcheck")) + build_parallel_map.put("vpath", prepare_build("vpath", "", "--build-dir ompi-build")) check_stages_list.add(build_parallel_map) @@ -89,14 +106,20 @@ def prepare_build(build_name, label, build_arg) { return { stage("${build_name}") { node(label) { - checkout(changelog: false, poll: false, scm: scm) + // Checkout into ompi-source instead of the top of the + // workspace, so that we have room in the workspace to setup a + // vpath build. 
+ dir ('ompi-source') { + checkout(changelog: false, poll: false, scm: scm) + } + // If pr-builder.sh fails, the sh step will throw an exception, // which we catch so that the job doesn't abort and continues on // to other steps - such as cleanup. Because we catch the // exception, we need to tell Jenkins the overall job has // failed. try { - sh "/bin/bash -x .ci/community-jenkins/pr-builder.sh ${build_arg} ompi" + sh "/bin/bash -x ompi-source/.ci/community-jenkins/pr-builder.sh ${build_arg} --source-dir ompi-source" } catch (Exception e) { currentBuild.result = "FAILURE" } diff --git a/.ci/community-jenkins/pr-builder.sh b/.ci/community-jenkins/pr-builder.sh index eb88b4c1538..88426859bf0 100755 --- a/.ci/community-jenkins/pr-builder.sh +++ b/.ci/community-jenkins/pr-builder.sh @@ -21,6 +21,8 @@ MAKE_ARGS= MAKE_J="-j 8" PREFIX="${WORKSPACE}/install" MPIRUN_MODE=${MPIRUN_MODE:-runall} +SOURCE_DIR= +BUILD_DIR= # # Options Parsing @@ -77,6 +79,24 @@ while (( "$#" )); do exit 1 fi ;; + --source-dir) + if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then + SOURCE_DIR=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; + --build-dir) + if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then + BUILD_DIR=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; -*|--*=) # Unsupported flags echo "Error: Unsupported flag $1" >&2 exit 1 @@ -105,93 +125,43 @@ fi echo "--> platform: $PLATFORM_ID" echo "--> version: $VERSION_ID" +if test "${SOURCE_DIR}" = "" ; then + echo "SOURCE_DIR is unset. Cannot continue." + exit 1 +fi + +echo "--> Workspace: ${WORKSPACE}" +echo "--> Source Dir: ${SOURCE_DIR}" +echo "--> Build Dir: ${BUILD_DIR}" +echo "--> Install Dir: ${PREFIX}" + # # See if builder provided a compiler we should use, and translate it to # CONFIGURE_ARGS. # -case ${PLATFORM_ID} in - rhel) - case "$COMPILER" in - gcc48|"") - echo "--> Using default compilers" - ;; - *) - echo "Unsupported compiler ${COMPILER}. 
Aborting" - exit 1 - ;; - esac - ;; - amzn) - case "$COMPILER" in - "") - echo "--> Using default compilers" - ;; - gcc44) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc44 CXX=g++44 FC=gfortran44" - ;; - gcc48) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc48 CXX=g++48 FC=gfortran48" - ;; - clang36) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=clang CXX=clang++ --disable-mpi-fortran" - ;; - *) - echo "Unsupported compiler ${COMPILER}. Aborting" - exit 1 - ;; - esac - ;; - ubuntu) - case "$COMPILER" in - "") - echo "--> Using default compilers" - ;; - gcc4*) - version=`echo "$COMPILER" | sed -e 's/gcc4\([0-9]*\)/4.\1/'` - CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}" - ;; - gcc*) - version=`echo "$COMPILER" | sed -e 's/gcc\([0-9]*\)/\1/'` - CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}" - ;; - clang3*|clang4*|clang5*|clang6*) - version=`echo "$COMPILER" | sed -e 's/clang\([0-9]\)\([0-9]*\)/\1.\2/'` - CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran" - ;; +if test "${COMPILER}" != "" ; then + if test ! -r ${HOME}/ompi-compiler-setup.sh ; then + echo "Could not find compiler setup script ompi-compiler-setup.sh. Aborting." + exit 1 + fi + + . ${HOME}/ompi-compiler-setup.sh + activate_compiler ${COMPILER} + + CONFIGURE_ARGS="${CONFIGURE_ARGS} CC=${CC} CPP=${CPP} CXX=${CXX} FC=${FC}" + if test "$FC" = "" ; then + CONFIGURE_ARGS="${CONFIGURE_ARGS} --disable-mpi-fortran" + else + # Flang doesn't seem good enough (yet) to compile our Fortran bindings, + # so skip for now. + case "${COMPILER}" in clang*) - version=`echo "$COMPILER" | sed -e 's/clang\([0-9]*\)/\1/'` - CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran" - ;; - *) - echo "Unsupported compiler ${COMPILER}. 
Aborting" - exit 1 + CONFIGURE_ARGS="${CONFIGURE_ARGS} --disable-mpi-fortran" ;; esac - ;; - sles) - case "$COMPILER" in - "") - echo "--> Using default compilers" - ;; - gcc48) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-48 CXX=g++-48 FC=gfortran-48" - ;; - gcc5) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-5 CXX=g++-5 FC=gfortran-5" - ;; - gcc6) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-6 CXX=g++-6 FC=gfortran-6" - ;; - *) - echo "Unsupported compiler ${COMPILER}. Aborting" - exit 1 - ;; - esac - ;; - FreeBSD) - CONFIGURE_ARGS="$CONFIGURE_ARGS LDFLAGS=-Wl,-rpath,/usr/local/lib/gcc5 --with-wrapper-ldflags=-Wl,-rpath,/usr/local/lib/gcc5" - ;; -esac + fi +fi + CONFIGURE_ARGS="$CONFIGURE_ARGS --disable-silent-rules" echo "--> Compiler setup: $CONFIGURE_ARGS" @@ -210,10 +180,20 @@ fi echo "--> Autogen arguments: $AUTOGEN_ARGS" echo "--> Configure arguments: $CONFIGURE_ARGS" +cd "${WORKSPACE}/${SOURCE_DIR}" + # Build sha1=`git rev-parse HEAD` echo "--> Building commit ${sha1}" +if test -f "${HOME}/ompi-setup-python.sh" ; then + echo "--> Initializing Python environment" + . ${HOME}/ompi-setup-python.sh + find . -name "requirements.txt" -exec ${PIP_CMD} install -r {} \; +else + echo "--> No Python environment found, hoping for the best." +fi + if test -f autogen.pl; then echo "--> running ./autogen.pl ${AUTOGEN_ARGS}" ./autogen.pl ${AUTOGEN_ARGS} @@ -227,9 +207,20 @@ else fi fi -echo "--> running ./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}" -if ! ./configure --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then - echo "./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !" +if test "${BUILD_DIR}" != "" ; then + cd "${WORKSPACE}" + rm -rf "${BUILD_DIR}" + mkdir "${BUILD_DIR}" + cd "${WORKSPACE}/${BUILD_DIR}" + CONFIGURE=../${SOURCE_DIR}/configure +else + # already in ${WORKSPACE}/${SOURCE_DIR} + CONFIGURE=./configure +fi + +echo "--> running ${CONFIGURE} --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}" +if ! 
${CONFIGURE} --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then + echo "${CONFIGURE} --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !" if test -f config.log; then echo "config.log content :" cat config.log @@ -268,7 +259,7 @@ echo "--> running ompi_info" ompi_info echo "--> running make all in examples" -cd "examples" +cd "${WORKSPACE}/${SOURCE_DIR}/examples" make ${MAKE_ARGS} all cd .. diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml new file mode 100644 index 00000000000..240e3d2f101 --- /dev/null +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -0,0 +1,148 @@ +name: mpi4py (ASAN) + +on: + pull_request: + workflow_dispatch: + inputs: + repository: + description: 'mpi4py repository' + default: 'mpi4py/mpi4py' + required: false + type: string + ref: + description: 'mpi4py branch/tag/SHA' + default: 'master' + required: false + type: string + +permissions: + contents: read + +jobs: + test: + # We need Ubuntu 24.04 (over 22.04) due to a kernel bug, + # see https://github.com/google/sanitizers/issues/856. + runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + MPI4PY_TEST_SPAWN: true + # disable ASAN while building + ASAN_OPTIONS: verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0 + # disable leak detection + LSAN_OPTIONS: detect_leaks=0,exitcode=0 + + steps: + - name: Configure hostname + run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null + if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }} + + - name: Install dependencies + run: sudo apt-get install -y -q + libnuma-dev libasan8 + if: ${{ runner.os == 'Linux' }} + + - name: Checkout Open MPI + uses: actions/checkout@v4 + with: + path: mpi-build + submodules: recursive + + - name: Bootstrap Open MPI + run: ./autogen.pl + working-directory: mpi-build + + # Install into a separate directory (/opt/openmpi) so that we can + # bundle up that tree into an artifact to share with other jobs in + # this github action. 
Specifically don't use /usr/local, because + # there's a bunch of other stuff already installed in /usr/local, + # and we don't need to include that in our artifact. + - name: Configure Open MPI + run: ./configure + --enable-debug + --disable-dependency-tracking + --disable-sphinx + --disable-mpi-fortran + --disable-oshmem + --disable-silent-rules + --prefix=/opt/openmpi + CFLAGS="-O1 -fno-omit-frame-pointer -g -fsanitize=address" + LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address" + working-directory: mpi-build + + - name: Build MPI + run: make -j $(nproc) + working-directory: mpi-build + + - name: Install MPI + run: sudo make install + working-directory: mpi-build + + - name: Add Open MPI to PATH + run: echo /opt/openmpi/bin >> $GITHUB_PATH + + - name: Tweak MPI + run: | + # Tweak MPI + mca_params="$HOME/.openmpi/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo mpi_param_check = true >> "$mca_params" + echo mpi_show_handle_leaks = true >> "$mca_params" + mca_params="$HOME/.prte/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params" + + - name: Use Python + uses: actions/setup-python@v5 + with: + python-version: 3 + architecture: x64 + + - name: Install Python packages (build) + run: python -m pip install --upgrade + setuptools pip wheel + + - name: Install Python packages (test) + run: python -m pip install --upgrade + numpy cffi pyyaml + + - name: Checkout mpi4py + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repository || 'mpi4py/mpi4py' }} + ref: ${{ inputs.ref }} + + - name: Setting up ASAN environment + # LD_PRELOAD is needed to make sure ASAN is the first thing loaded + # as it will otherwise complain. + # Leak detection is currently disabled because of the size of the report. + # The patcher is disabled because ASAN fails if code mmaps data at fixed + # memory addresses, see https://github.com/open-mpi/ompi/issues/12819. 
+ # ODR violation detection is disabled until #13469 is fixed. + run: | + echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=1 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV + + - name: Show MPI + run: ompi_info --all --all + + - name: Install mpi4py + run: python -m pip install . + env: + CFLAGS: "-O0" + + - name: Test mpi4py (singleton) + run: python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=1) + run: mpiexec -n 1 python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=4) + run: mpiexec -n 4 python test/main.py -v -f -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 diff --git a/.gitignore b/.gitignore index 7ab0b99af7d..b30321da7ca 100644 --- a/.gitignore +++ b/.gitignore @@ -542,3 +542,5 @@ ompi/mpi/fortran/use-mpi-f08/base/*_generated.c ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces-generated.h ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces-generated.h ompi/mpi/fortran/use-mpi-ignore-tkr/*_generated.F90 + +.vscode/ diff --git a/3rd-party/openpmix b/3rd-party/openpmix index 7704efaf865..53fce423d5d 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit 7704efaf865328234e3cb1f77ff393adc971c9fe +Subproject commit 53fce423d5d6b25798ed1f32837671dc55d0230d diff --git a/3rd-party/prrte b/3rd-party/prrte index 91544b8d2c5..2d9b0aaaeea 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 91544b8d2c5ac84585022d0edad68e38f375a917 +Subproject commit 2d9b0aaaeea49a0e7850aed95e5ace9340c7d847 diff --git a/config/ompi_fortran_check.m4 b/config/ompi_fortran_check.m4 index e479a87ac64..7fd2a790353 100644 --- a/config/ompi_fortran_check.m4 +++ b/config/ompi_fortran_check.m4 @@ 
-137,8 +137,8 @@ AC_DEFUN([OMPI_FORTRAN_CHECK], [ long*double*_Complex) ofc_type_kind=C_LONG_DOUBLE_COMPLEX ;; opal_short_float_t) ofc_type_kind=C_SHORT_FLOAT ;; opal_short_float_complex_t) ofc_type_kind=C_SHORT_FLOAT_COMPLEX ;; - _Float128) ofc_type_kind=C__FLOAT128 ;; - __float128) ofc_type_kind=C___FLOAT128 ;; + _Float128) ofc_type_kind=C_FLOAT128 ;; + __float128) ofc_type_kind=C_FLOAT128 ;; *) # Skip types like "DOUBLE PRECISION" ;; diff --git a/config/ompi_fortran_check_real16_c_equiv.m4 b/config/ompi_fortran_check_real16_c_equiv.m4 index 85141c798b6..b9e67d9606e 100644 --- a/config/ompi_fortran_check_real16_c_equiv.m4 +++ b/config/ompi_fortran_check_real16_c_equiv.m4 @@ -61,19 +61,26 @@ AC_DEFUN([OMPI_FORTRAN_CHECK_REAL16_C_EQUIV],[ AC_MSG_RESULT([works!])], [AC_MSG_RESULT([does not work])]) ]) - # As recent Intel compilers identify as GNU we will always test for Quad support if no other tests were succesfull + # As recent Intel compilers identify as GNU we will always test for Quad + # support if no other tests were successful AS_IF([test "$fortran_real16_happy" = "no"], - [AC_CHECK_TYPES(_Quad) - AS_IF([test "$ac_cv_type__Quad" = "yes"], - [AC_MSG_CHECKING([if the compiler _Quad == REAL*16]) - CFLAGS_save="$CFLAGS" + [AC_CHECK_TYPES([_Quad]) + AS_IF([test "$ac_cv_type__Quad" != "yes"], + [CFLAGS_save="$CFLAGS" OPAL_FLAGS_APPEND_UNIQ([CFLAGS], ["-Qoption,cpp,--extended_float_types"]) + # force the check as we have updated CFLAGS + unset ac_cv_type__Quad + AC_CHECK_TYPES([_Quad]) + AS_IF([test "$ac_cv_type__Quad" != "yes"], + [CFLAGS="$CFLAGS_save"]) + ]) + AS_IF([test "$ac_cv_type__Quad" = "yes"], + [AC_MSG_CHECKING([if the compiler _Quad == REAL*16]) OMPI_FORTRAN_CHECK_REAL16_EQUIV_TYPE([_Quad], [q]) AS_IF([test "$fortran_real16_happy" = "yes"], [OMPI_FORTRAN_REAL16_C_TYPE="_Quad" AC_MSG_RESULT([works!])], - [CFLAGS="$CFLAGS_save" - AC_MSG_RESULT([does not work])]) + [AC_MSG_RESULT([does not work])]) ]) ]) # We have to [re-]print a new message here, 
because diff --git a/config/ompi_setup_mpi_fortran.m4 b/config/ompi_setup_mpi_fortran.m4 index 3474276e661..c396a2efab6 100644 --- a/config/ompi_setup_mpi_fortran.m4 +++ b/config/ompi_setup_mpi_fortran.m4 @@ -226,7 +226,7 @@ AC_DEFUN([OMPI_SETUP_MPI_FORTRAN],[ [long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t], [16], [no]) OMPI_FORTRAN_CHECK([COMPLEX*32], [no], - [_Float128 _Complex, long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t], + [_Float128 _Complex, __float128 _Complex, long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t], [32], [no]) # Double precision complex types are not standard, but many # compilers support it. Code should be wrapped with #ifdef diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index a6bf80a1b2a..ed3a51a26e8 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -154,6 +154,7 @@ AC_MSG_CHECKING([if have cuda support]) if test "$opal_check_cuda_happy" = "yes"; then AC_MSG_RESULT([yes (-I$opal_cuda_incdir)]) CUDA_SUPPORT=1 + OMPI_HAVE_ACCELERATOR_SUPPORT=1 common_cuda_CPPFLAGS="-I$opal_cuda_incdir" AC_SUBST([common_cuda_CPPFLAGS]) else diff --git a/config/opal_check_rocm.m4 b/config/opal_check_rocm.m4 index 25ac54e438e..0d1e6053469 100644 --- a/config/opal_check_rocm.m4 +++ b/config/opal_check_rocm.m4 @@ -57,7 +57,8 @@ AC_DEFUN([OPAL_CHECK_ROCM],[ AS_IF([ test "$opal_check_rocm_happy" = "yes" ], [ OPAL_APPEND([$1_CPPFLAGS], [$rocm_CPPFLAGS]) AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support]) - ROCM_SUPPORT=1 ], + ROCM_SUPPORT=1 + OMPI_HAVE_ACCELERATOR_SUPPORT=1 ], [ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [0], [Disable ROCm support]) ROCM_SUPPORT=0 ]) diff --git a/config/opal_check_ze.m4 b/config/opal_check_ze.m4 index d1d47bb67c1..84c8dacd2df 100644 --- a/config/opal_check_ze.m4 +++ b/config/opal_check_ze.m4 @@ -56,7 +56,8 @@ 
AC_DEFUN([OPAL_CHECK_ZE],[ AS_IF([ test "$opal_check_ze_happy" = "yes" ], [ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [1], [Enable Intel ZE support]) - ZE_SUPPORT=1 ], + ZE_SUPPORT=1 + OMPI_HAVE_ACCELERATOR_SUPPORT=1 ], [ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [0], [Disable Intel ZE support]) ZE_SUPPORT=0 ]) diff --git a/config/opal_mca.m4 b/config/opal_mca.m4 index cdeb935a3a3..bb51d3bc5f1 100644 --- a/config/opal_mca.m4 +++ b/config/opal_mca.m4 @@ -186,7 +186,7 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1]) else msg= if test -z "$enable_mca_dso"; then - enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,btl-smcuda,rcache-gpusm,rcache-rgpusm" + enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze" msg="(default)" fi DSO_all=0 diff --git a/configure.ac b/configure.ac index 928f41b0415..d4276b23284 100644 --- a/configure.ac +++ b/configure.ac @@ -276,6 +276,7 @@ m4_ifdef([project_oshmem], ############################################################################ # Configuration options ############################################################################ +OMPI_HAVE_ACCELERATOR_SUPPORT=0 OPAL_CONFIGURE_OPTIONS diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf index 6a7be025a66..b1316c4b67d 100644 --- a/contrib/platform/mellanox/optimized.conf +++ b/contrib/platform/mellanox/optimized.conf @@ -85,8 +85,6 @@ opal_warn_on_missing_libcuda = 0 bml_r2_show_unreach_errors = 0 # alltoall algorithm selection settings for tuned coll mca -coll_tuned_alltoall_large_msg = 250000 -coll_tuned_alltoall_min_procs = 2048 coll_tuned_alltoall_algorithm_max_requests = 8 coll_tuned_scatter_intermediate_msg = 8192 coll_tuned_scatter_large_msg = 250000 diff --git a/docs/Makefile.am b/docs/Makefile.am index 871184eb01d..a6edc6ae045 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -38,7 +38,8 @@ TEXT_SOURCE_FILES = \ $(srcdir)/license/*.txt IMAGE_SOURCE_FILES = \ 
$(srcdir)/openmpi_logo.png \ - $(srcdir)/installing-open-mpi/required-support-libraries-dependency-graph.png + $(srcdir)/installing-open-mpi/required-support-libraries-dependency-graph.png \ + $(srcdir)/tuning-apps/collectives/images/xhc-hierarchy.svg RST_SOURCE_FILES = \ $(srcdir)/*.rst \ $(srcdir)/release-notes/*.rst \ diff --git a/docs/tuning-apps/collectives/components.rst b/docs/tuning-apps/collectives/components.rst index f29c202e358..921f7e12036 100644 --- a/docs/tuning-apps/collectives/components.rst +++ b/docs/tuning-apps/collectives/components.rst @@ -28,7 +28,9 @@ The following provides a list of components and their primary target scenario: more details. - ``ucc``: component using the `UCC library `_ for collective operations. - - ``xhc``: shared memory collective component using XPMEM for data transfers. + - ``xhc``: shared memory collective component, employing hierarchical & + topology-aware algorithms, with XPMEM for data transfers. See :doc:`xhc` for + more details. - ``acoll``: collective component tuned for AMD Zen architectures. See :doc:`acoll` for more details. 
- ``accelerator``: component providing host-proxy algorithms for some diff --git a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg b/docs/tuning-apps/collectives/images/xhc-hierarchy.svg similarity index 86% rename from ompi/mca/coll/xhc/resources/xhc-hierarchy.svg rename to docs/tuning-apps/collectives/images/xhc-hierarchy.svg index c8f6d8a2da3..b4ae62a6c4f 100644 --- a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg +++ b/docs/tuning-apps/collectives/images/xhc-hierarchy.svg @@ -7,7 +7,7 @@ viewBox="0 0 169.571 119.89402" version="1.1" id="svg5" - inkscape:version="1.2.1 (9c6d41e410, 2022-07-14, custom)" + inkscape:version="1.4.3 (0d15f75042, 2025-12-25)" sodipodi:docname="xhc-hierarchy.svg" inkscape:export-filename="../xhc-hierarchy.png" inkscape:export-xdpi="300" @@ -26,11 +26,11 @@ inkscape:pagecheckerboard="0" inkscape:document-units="mm" showgrid="false" - inkscape:zoom="0.75290071" - inkscape:cx="286.22632" - inkscape:cy="274.93665" + inkscape:zoom="1.4452058" + inkscape:cx="278.16108" + inkscape:cy="266.39805" inkscape:window-width="1920" - inkscape:window-height="1018" + inkscape:window-height="1136" inkscape:window-x="1920" inkscape:window-y="0" inkscape:window-maximized="1" @@ -78,25 +78,6 @@ id="path-effect556" is_visible="true" lpeversion="1" /> - - - + transform="translate(-430.99854,-193.98109)"> + y="193.98109" /> NUMA Level + y="296.00598">NUMA Level Socket Level + y="259.80359">Socket Level + transform="translate(28.708569,27.920669)"> System Level + style="font-size:5.64444px;text-align:center;text-anchor:middle;stroke-width:0.264583" + x="524.14557" + y="204.60033">Node Level + inkscape:original-d="m 561.29231,236.42783 c -10.38789,-6.52565 -20.67275,-12.94489 -31.00982,-19.41762" + transform="translate(0,-1.0583333)" /> + y="241.07695" /> + y="241.07695" /> + transform="translate(76.684113,23.158255)"> + y="240.44742" /> + y="240.44742" /> + transform="translate(118.68254,23.158255)"> + transform="translate(150.43255,23.158255)"> + 
inkscape:original-d="m 487.56018,236.95873 c 10.17386,-6.63057 20.2468,-13.15301 30.3709,-19.72977" + transform="translate(0,-1.0583333)" /> Cores + style="font-size:4.93889px;stroke-width:0.264583" + x="-163.80605" + y="497.17615">Cores @@ -768,23 +751,23 @@ + transform="matrix(-1,0,0,1,924.11737,0.52916667)"> NUMA 0Leader + transform="translate(44.916471,23.158255)"> P0 + y="279.12918">P0 P1 + y="279.12741">P1 P2 + y="279.12741">P2 P3 + y="279.12741">P3 P4 + y="279.12918">P4 P5 + y="279.12741">P5 P6 + y="279.12741">P6 P7 + y="279.12741">P7 P8 + y="279.12921">P8 P9 + y="279.12744">P9 10 + y="279.12744">10 11 + y="279.12744">11 12 + y="279.12921">12 13 + y="279.12744">13 14 + y="279.12744">14 15 + y="279.12744">15 + transform="matrix(-1,0,0,1,944.89717,-1.0583333)"> + transform="translate(84.666671)"> + id="g8800"> + transform="translate(-83.60834,-0.52916667)"> + transform="rotate(180,501.4769,222.70799)"> + transform="matrix(1,0,0,-1,45.394312,445.41596)"> + transform="matrix(-1,0,0,1,955.86739,0.52916667)"> NUMA 1Leader + transform="matrix(-1,0,0,1,1029.9509,0.52916667)"> NUMA 3Leader [...] + +Main Features +------------- + +Hierarchy +~~~~~~~~~ + +XHC constructs an *n*-level hierarchy (i.e. no limitation on number of levels), +based on intra-node topological features. Rank/process locality information +originates from Hwloc, and is obtained through Open MPI's internal structures. + +The following topological features can currently be defined: + + * NUMA node + * CPU Socket + * L1/L2/L3 cache + * Hwthread/core + * Node (all ranks *are* in same node -> flat hierarchy) + +An example of a 3-level XHC hierarchy (``numa,socket`` configuration): + +.. image:: images/xhc-hierarchy.svg + :width: 450px + +Furthermore, support for virtual/user-defined hierarchies is available, to +allow for even finer control and custom experiments. 
+ +**Pipelining** is seamlessly applied across all levels of the hierarchy, to +minimize hierarchy-induced overheads, and to allow for interleaving of +operations in certain collectives (e.g. reduce+bcast in allreduce). + +Single-copy data transfers +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +XHC supports data transfers between MPI ranks using a single copy, through Open +MPI's ``opal/smsc`` (shared-memory-single-copy) framework. Despite the +component's name, XHC actually also supports additional single-copy mechanisms +in some collectives, though XPMEM is highly recommended. + + * Bcast: XPMEM, CMA, KNEM + * Allreduce/Reduce: XPMEM + * Barrier: *(irrelevant)* + +In XPMEM mode, application buffers are attached on the fly the first time they +appear, and are saved in ``smsc/xpmem``'s internal registration cache for +future uses. + +Shared-memory data transfers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +XHC also supports data transfers using copy-in-copy-out (CICO) over shared +memory. Copy-in-copy-out is always used for small messages, with automatic +switching to single-copy for large ones. All primitives support this mode, +regardless of XPMEM or SMSC presence, as long as the size of the message is +below the threshold. + +Inline data transfers +~~~~~~~~~~~~~~~~~~~~~ + +For especially small messages, the payload data is inlined in the same cache +line as the control data. This achieves exceptionally low latency in such +messages. Supported in all primitives, regardless of XPMEM or SMSC presence. + +Synchronization +~~~~~~~~~~~~~~~ + +XHC uses **lock-free** synchronization, using the single-writer paradigm and lightweight *read* or *write* memory barriers wherever appropriate. + +Multi-node with HAN +------------------- + +Even though ``xhc`` only works over shared memory, it may also be utilized in +multi-node environments, through ``coll/han``. 
HAN is already the default +component in multi-node runs, so all that's needed is to define ``xhc`` as the +component to be used for the intra-node phase: + +.. code-block:: sh + + $ mpirun --mca coll_han_bcast_low_module 2 --mca coll_han_reduce_low_module 2 \ + --mca coll_han_allreduce_low_module 2 + +.. _mca-params: + +MCA Parameters +-------------- + +Basic +~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 10 70 + + * - Parameter + - Default + - Description + + * - coll_xhc_priority + - 0 + - The priority of the component. Set it to a value higher than other + components to enable xhc. + +Main +~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Default + - Description + + * - coll_xhc_hierarchy + - *unset* + - A comma separated list of topological features to which XHC's hierarchy + should be sensitive. This is a hint -- xhc will automatically: disregard + features that don't exist in the system, or that don't further segment + the ranks (e.g. ``numa`` was specified, but all ranks are in the same + NUMA node); re-order the list to match the system's hierarchy; add an + extra top level that's common to all ranks. This parameter applies to + all primitives, and is mutually exclusive with the primitive-specific + ones below. + + This parameter also supports the use of special modifiers for *virtual + hierarchies*. Check ``xhc_component_parse_hierarchy()`` for further + explanation and syntax. + + * - coll_xhc_chunk_size + - *unset* + - The chunk size for the pipelining. Data is processed in this-much sized + pieces at once. Applies to all primitives -- mutually exclusive with + primitive-specific parameters. + + * - coll_xhc_cico_max + - *unset* + - The max size up to which to use copy-in-copy-out. Single copy will be + used for messages above this size. Applies to all primitives -- mutually + exclusive with primitive-specific parameters. 
+ + * - coll_xhc__hierarchy + - bcast/barrier: ``numa,socket`` + (all)reduce: ``l3,numa,socket`` + - Topological features to consider for XHC's hierarchy, specifically for + this primitive. Mutually exclusive with the respective non-specific + parameter. + + * - coll_xhc__chunk_size + - 16K + - Pipeline chunk size, specifically for this primitive. Mutually exclusive + with the non-specific parameter. + + * - coll_xhc__cico_max + - bcast: ``256`` + (all)reduce: ``4K`` + - Max size for copy-in-copy-out transfers, specifically for this + primitive. Mutually exclusive with the non-specific parameter. + +Advanced +~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Default + - Description + + * - coll_xhc__root + - 0 + - Internal root rank, for either of these operations. + + * - coll_xhc_uniform_chunks + - true + - Whether to dynamically adjust (decrease) the chunk size in reduction + primitives, so that all ranks will perform equal work, depending on + the message size. + + * - coll_xhc_uniform_chunks_min + - 4K + - Minimum allowed value for the automatically decreased chunk size in + reduction primitives. + + * - coll_xhc_reduce_load_balance + - top,first + - Controls load balancing features in reduction primitives. With no such + features enabled, leader ranks don't perform any reduction work, on the + levels on which they are leaders. Add ``top`` to have the root perform + reductions on the top-most level of the hierarchy, as if a common rank. + Add ``first``, to have all leaders reduce a single chunk, at the + beginning of the operation as if they weren't leaders. Add ``all`` to + have leaders always perform reductions, even on the levels on which they + are leaders (not recommended). + + * - coll_xhc_dynamic_reduce + - non-float + - Controls support for out-of-order reduction (rank wise), which allows + temporarily skipping a peer that's not yet ready. 
The default value only + enables the feature for non-float types, to avoid reproducibility issues + with floats. Set to ``disabled`` or ``all`` to turn off or on, + respectively, for all types. + + * - coll_xhc_dynamic_leader + - false + - Dynamically elect the first rank from each hierarchy group to join the + collective as its leader, in broadcast. Introduces an atomic + compare-exchange per each call, when enabled. + +Other +~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Default + - Description + + * - coll_xhc_shmem_backing + - /dev/shm + - Backing directory for shmem files. + + * - coll_xhc_memcpy_chunk_size + - 256K + - Break up large memcpy calls to smaller ones, using this chunk size. + Will actually attempt to mirror the value of ``smsc/xpmem``'s respective + parameter at run-time. + +Debug +~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 25 15 60 + + * - Parameter + - Default + - Description + + * - coll_xhc_print_info + - *none* + - Print information about the component's configuration, and its + constructed hierarchies. Takes a comma delimited list of: the name of + the collective primitive about which to print information; ``config`` + to print the configuration; ``all`` to print everything; ``dot`` along + with the name of a collective primitive to print its hierarchy in DOT + format. + +Limitations +----------- + +* **Heterogeneity**: XHC does not support nodes with non-uniform + datatype representations across ranks (Open MPI's ``proc_arch``). + +* **Non-commutative** operators are not currently supported in + reduction collectives. + +* **Derived datatypes** are not yet supported. + +* The Reduce implementation only supports rank 0 as the root, and will + automatically fall back to another component in other scenarios. Work in + progress. + +Other resources +--------------- + +All things XHC landing page: https://github.com/CARV-ICS-FORTH/XHC-OpenMPI + +Publications +~~~~~~~~~~~~ + +.. 
**Publications** + +| **A framework for hierarchical single-copy MPI collectives on multicore nodes** +| *George Katevenis, Manolis Ploumidis, and Manolis Marazakis* +| Cluster 2022, Heidelberg, Germany +| https://ieeexplore.ieee.org/document/9912729 + +| **Impact of Cache Coherence on the Performance of Shared-Memory based MPI Primitives: A Case Study for Broadcast on Intel Xeon Scalable Processors** +| *George Katevenis, Manolis Ploumidis, and Manolis Marazakis* +| ICPP 2023, Salt Lake City, Utah, USA +| https://dl.acm.org/doi/10.1145/3605573.3605616 diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index be99de913ab..ddf1657b9ab 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -24,7 +24,7 @@ * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * Copyright (c) 2020-2025 Triad National Security, LLC. All rights + * Copyright (c) 2020-2026 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -1094,7 +1094,7 @@ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uin } if (val->type != PMIX_SIZE) { - OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch")); + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch - %s", PMIx_Value_string(val))); rc = OMPI_ERR_TYPE_MISMATCH; goto done; } diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index e06865b182f..1422695ea37 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -764,6 +764,7 @@ enum { #define MPI_ERR_SESSION 78 #define MPI_ERR_VALUE_TOO_LARGE 79 #define MPI_ERR_ERRHANDLER 80 +#define MPI_ERR_NOTIFY_IDX 81 /* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined MPI_ERR_ code. 
Set the last code to allow some room for adding @@ -1917,6 +1918,14 @@ OMPI_DECLSPEC int MPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_notify(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_notify_c(void *origin_addr, MPI_Count origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -2180,6 +2189,12 @@ OMPI_DECLSPEC int MPI_Put(const void *origin_addr, int origin_count, MPI_Dataty OMPI_DECLSPEC int MPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int MPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int MPI_Query_thread(int *provided); OMPI_DECLSPEC int MPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int 
target_count, @@ -2604,6 +2619,8 @@ OMPI_DECLSPEC int MPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandle OMPI_DECLSPEC int MPI_Win_get_group(MPI_Win win, MPI_Group *group); OMPI_DECLSPEC int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used); OMPI_DECLSPEC int MPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); +OMPI_DECLSPEC int MPI_Win_get_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); +OMPI_DECLSPEC int MPI_Win_reset_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); OMPI_DECLSPEC int MPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win); OMPI_DECLSPEC int MPI_Win_lock_all(int mpi_assert, MPI_Win win); OMPI_DECLSPEC int MPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win); @@ -3091,6 +3108,14 @@ OMPI_DECLSPEC int PMPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_notify(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_notify_c(void *origin_addr, MPI_Count origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -3354,6 +3379,12 @@ OMPI_DECLSPEC int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datat OMPI_DECLSPEC int PMPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype 
target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int PMPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int PMPI_Query_thread(int *provided); OMPI_DECLSPEC int PMPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -3778,6 +3809,8 @@ OMPI_DECLSPEC int PMPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandl OMPI_DECLSPEC int PMPI_Win_get_group(MPI_Win win, MPI_Group *group); OMPI_DECLSPEC int PMPI_Win_get_info(MPI_Win win, MPI_Info *info_used); OMPI_DECLSPEC int PMPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); +OMPI_DECLSPEC int PMPI_Win_get_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); +OMPI_DECLSPEC int PMPI_Win_reset_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); OMPI_DECLSPEC int PMPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win); OMPI_DECLSPEC int PMPI_Win_lock_all(int mpi_assert, MPI_Win win); OMPI_DECLSPEC int PMPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win); diff --git a/ompi/include/mpif-values.py b/ompi/include/mpif-values.py index 53159d5d8dd..b74fbcbaf1f 100755 --- a/ompi/include/mpif-values.py +++ b/ompi/include/mpif-values.py @@ -301,6 +301,7 @@ 'MPI_ERR_SESSION': 78, 'MPI_ERR_VALUE_TOO_LARGE': 79, 'MPI_ERR_ERRHANDLER': 80, + 'MPI_ERR_NOTIFY_IDX': 81, 'MPI_ERR_LASTCODE': 92, 'MPI_IDENT': 0, 'MPI_CONGRUENT': 1, diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index bd686d2bab2..6d50d32ffb2 100644 --- a/ompi/instance/instance.c +++ 
b/ompi/instance/instance.c @@ -8,6 +8,7 @@ * reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -586,11 +587,16 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + } } } } else { @@ -602,12 +608,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); - if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence() failed", ret); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + return ompi_instance_print_error ("PMIx_Fence() failed", ret); + } + } else { + /* cannot just wait on thread as we need to call opal_progress */ + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } 
} @@ -748,7 +761,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) * we have to wait here for it to complete. However, there * is no reason to do two barriers! */ if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); + if (active) { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + } } else if (!ompi_async_mpi_init) { /* wait for everyone to reach this point - this is a hard * barrier requirement at this time, though we hope to relax @@ -757,12 +772,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + } + } else { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } diff --git a/ompi/mca/coll/accelerator/configure.m4 b/ompi/mca/coll/accelerator/configure.m4 new file mode 100644 index 00000000000..057db874435 --- /dev/null +++ b/ompi/mca/coll/accelerator/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator collective component. +# +AC_DEFUN([MCA_ompi_coll_accelerator_CONFIG],[ + + AC_CONFIG_FILES([ompi/mca/coll/accelerator/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed.
This discovery is part of the OPAL accelerator framework. + AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c index 69da3cb49cf..28fc3c62c6a 100644 --- a/ompi/mca/coll/acoll/coll_acoll_reduce.c +++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c @@ -66,7 +66,7 @@ static inline int coll_acoll_reduce_topo(const void *sbuf, void *rbuf, size_t co int use_socket = (0 == acoll_module->use_socket) ? 1 : acoll_module->use_socket; tmp_sbuf = (char *) sbuf; - if ((MPI_IN_PLACE == sbuf) && (rank == root)) { + if (MPI_IN_PLACE == sbuf) { tmp_sbuf = (char *) rbuf; } diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 15bd586901a..07616285616 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -48,7 +48,7 @@ int ompi_coll_adapt_ireduce_register(void) mca_coll_adapt_component.adapt_ireduce_algorithm = 1; } - mca_coll_adapt_component.adapt_ireduce_segment_size = 163740; + mca_coll_adapt_component.adapt_ireduce_segment_size = 524288; mca_base_component_var_register(c, "reduce_segment_size", "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 
0 bytes means no segmentation.", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, diff --git a/ompi/mca/coll/ftagree/coll_ftagree_component.c b/ompi/mca/coll/ftagree/coll_ftagree_component.c index 97e9ca4cee7..8a733ad3357 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_component.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_component.c @@ -38,6 +38,8 @@ int mca_coll_ftagree_era_rebuild = 0; double mca_coll_ftagree_debug_inject_proba = 0.0; #endif +static int mca_coll_ft_agreement; + /* * Local function */ @@ -92,8 +94,6 @@ ftagree_close(void) static int ftagree_register(void) { - int value; - /* Use a low priority, but allow other components to be lower */ mca_coll_ftagree_priority = 30; (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, @@ -103,15 +103,15 @@ ftagree_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_ftagree_priority); - if( ompi_ftmpi_enabled ) value = 1; - else value = 0; /* NOFT: do not initialize ERA */ + if( ompi_ftmpi_enabled ) mca_coll_ft_agreement = 1; + else mca_coll_ft_agreement = 0; /* NOFT: do not initialize ERA */ (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, "agreement", "Agreement algorithm 0: Allreduce (NOT FAULT TOLERANT); 1: Early Returning Consensus (era); 2: Early Terminating Consensus (eta)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, - &value); - switch(value) { + &mca_coll_ft_agreement); + switch(mca_coll_ft_agreement) { case 0: mca_coll_ftagree_algorithm = COLL_FTAGREE_NOFT; opal_output_verbose(6, ompi_ftmpi_output_handle, diff --git a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c index 9450c443349..f28c36a3d16 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c @@ -2956,6 +2956,15 @@ int mca_coll_ftagree_era_finalize(void) "%s ftagree:agreement (ERA) GC: %lu passed agreements 
remain in the passed agreements hash table\n", OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), opal_hash_table_get_size(&era_passed_agreements))); + /* Some agreements can remain in the era_passed_agreements table until + * finalize; notably, the last agreement in a communicator that has been + * freed. + * + * The commit that added this comment also removed the (unused) function + * mca_coll_ftagree_era_free_comm that could enforce purging that table + * during comm_free, at the cost of making comm_free hard synchronizing; + * this was deemed too disruptive for the small memory usage gain. + */ for( rc = opal_hash_table_get_first_key_uint64(&era_passed_agreements, &key64, &value, &node); OPAL_SUCCESS == rc; rc = opal_hash_table_get_next_key_uint64(&era_passed_agreements, &key64, &value, node, &node) ) { @@ -3368,46 +3377,3 @@ int mca_coll_ftagree_iera_intra(void *contrib, return OMPI_SUCCESS; } -#if 0 -// Per @bosilca and @jsquyres discussion 29 Apr 2021: there is -// probably a memory leak in MPI_FINALIZE right now, because this -// function does not appear to be being called from anywhere. -// @bosilca's team is looking into it. 
-int mca_coll_ftagree_era_free_comm(ompi_communicator_t* comm, - mca_coll_base_module_t *module) -{ - ompi_group_t* acked; - era_identifier_t aid; - int rc; - - OPAL_OUTPUT_VERBOSE((4, ompi_ftmpi_output_handle, - "%s ftagree:agreement (ERA) Freeing Communicator (%d.%d).\n", - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), - comm->c_contextid, - comm->c_epoch)); - - opal_mutex_lock(&ompi_group_afp_mutex); - ompi_group_intersection(comm->c_remote_group, ompi_group_all_failed_procs, &acked); - opal_mutex_unlock(&ompi_group_afp_mutex); - do { - rc = mca_coll_ftagree_era_intra(NULL, - 0, - &ompi_mpi_int.dt, - &ompi_mpi_op_band.op, - &acked, true, - comm, - comm->c_coll->coll_agree_module); - } while(rc != MPI_SUCCESS); - OBJ_RELEASE(acked); - - aid.ERAID_FIELDS.contextid = comm->c_contextid.cid_sub.u64; - aid.ERAID_FIELDS.epoch = comm->c_epoch; - - opal_mutex_lock(&era_mutex); - /** We don't need to set aid.ERAID_FIELDS.agreementid to collect all of them */ - era_collect_passed_agreements(aid, 0, (uint16_t)-1); - opal_mutex_unlock(&era_mutex); - - return OMPI_SUCCESS; -} -#endif diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 1d78bf87158..7ae17b9e4f8 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -301,7 +301,7 @@ static int han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_output_verbose); - cs->han_bcast_segsize = 65536; + cs->han_bcast_segsize = 524288; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, @@ -321,7 +321,7 @@ static int han_register(void) &cs->han_bcast_low_module, &cs->han_op_module_name.bcast.han_op_low_module_name); - cs->han_reduce_segsize = 65536; + cs->han_reduce_segsize = 524288; (void) mca_base_component_var_register(c, "reduce_segsize", "segment size for reduce", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, @@ -340,7 +340,7 @@ 
static int han_register(void) OPAL_INFO_LVL_9, &cs->han_reduce_low_module, &cs->han_op_module_name.reduce.han_op_low_module_name); - cs->han_allreduce_segsize = 65536; + cs->han_allreduce_segsize = 524288; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index e3482116c84..9dca14bcc55 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -34,6 +34,8 @@ static int coll_tuned_alltoall_segment_size = 0; static int coll_tuned_alltoall_tree_fanout; static int coll_tuned_alltoall_chain_fanout; +static int deprecated_mca_params = -1; + /* valid values for coll_tuned_alltoall_forced_algorithm */ static const mca_base_var_enum_value_t alltoall_algorithms[] = { {0, "ignore"}, @@ -119,7 +121,6 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoall_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_large_msg", "use pairwise exchange algorithm for messages larger than this value", diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index d8dbb7959e4..6f5a8c57987 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -71,6 +71,8 @@ int ompi_coll_tuned_scatter_large_msg = 0; int ompi_coll_tuned_scatter_min_procs = 0; int ompi_coll_tuned_scatter_blocking_send_ratio = 0; +static int deprecated_mca_params = -1; + /* forced algorithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; @@ -161,7 +163,6 @@ static int tuned_register(void) 
MCA_BASE_VAR_SCOPE_ALL, &ompi_coll_tuned_init_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_small_msg", "threshold (if supported) to decide if small MSGs alltoall algorithm will be used", diff --git a/ompi/mca/coll/xhc/README.md b/ompi/mca/coll/xhc/README.md index 213062a5edc..438fd712507 100644 --- a/ompi/mca/coll/xhc/README.md +++ b/ompi/mca/coll/xhc/README.md @@ -1,371 +1,8 @@ -# XHC: XPMEM-based Hierarchical Collectives +# XPMEM Hierarchical Collectives (XHC) XHC implements hierarchical & topology-aware intra-node MPI collectives, -utilizing XPMEM for efficient shared address space memory access between -processes. +(mainly) utilizing XPMEM for efficient shared address space data transfers +between MPI ranks. -## Main features - -* XHC constructs an **n-level hierarchy** (i.e. no algorithmic limitation on -level count), based on intra-node topological features. Rank/process locality -information is known thanks to Hwloc, and is obtained from Open MPI's -integrated book-keeping. - - Topological features that can currently be defined: - - - NUMA node - - CPU Socket - - L1/L2/L3 cache - - Hwthread/core - - Node (all ranks *are* in same node --> flat, no hierarchy at all) - - Example of a 3-level XHC hierarchy (numa+socket+node configuration): - - ![Example of 3-level XHC hierarchy](resources/xhc-hierarchy.svg) - - Furthermore, support for custom virtual user-defined hierarchies is - available, to allow fine-grained control over the communication pattern. - -* **Single-copy** transportation - - - Supported through integration with Open MPI's `opal/smsc` - (shared-memory-single-copy) framework. Selecting `smsc/xpmem` is highly - recommended. 
- - - Bcast support: XPMEM, CMA, KNEM - - Allreduce/Reduce support: XPMEM - - Barrier support: *(irrelevant)* - - - Application buffers are attached on the fly the first time they appear, - saved on and recovered from the registration cache in subsequent - appearances. (assuming smsc/xpmem) - -* **Copy-in-copy-out (CICO)** transportation - - - Through shared memory buffers that remain active throughout the - component's lifetime. - - - Switchover with single-copy at configurable message size. - - - Supported in all ops, regardless of smsc support or XPMEM presence (up to - maximum allowed message size). - -* **Inline** transportation - - - For especially small messages, payload data is inlined in the same cache - line as the control data. - - - Supported in all ops, regardless of smsc support or XPMEM presence (up to - maximum allowed message size). - -* Data-wise **pipelining** across all levels of the hierarchy. Allows for -lowering hierarchy-induced start-up overheads, and interleaving of operations -in applicable operations (e.g. reduce+bcast in allreduce). - -* **Lock-free** single-writer synchronization, with appropriate cache-line -separation where necessary. Consistency ensured via lightweight *read* or -*write* memory barriers. - -## Configuration options -- MCA params - -XHC can be customized via a number of standard Open MPI MCA parameters, though -defaults that should satisfy a wide number of systems are in place. - -The available parameters (also found in `coll_xhc_component.c`): - -#### *(prepend with "coll_xhc_")* - -* **priority** (default `0`): The priority of the coll/xhc component, used -during the component selection process. - -* **print_info** (default `false`): Print information about XHC's generated -hierarchy and its configuration. - -* **shmem_backing** (default `/dev/shm`): Backing directory for shmem files -used for XHC's synchronization fields and CICO buffers. 
- -* **dynamic_leader** (default `false`): Enables the feature that dynamically -elects an XHC-communicator leader at each collective (currently only applicable -for bcast). - -* **dynamic_reduce** (default `1`=`non-float`): Enables support for -out-of-order reduction. Ranks fetch data to reduce from multiple peers; -out-of-order reduction allows them to temporarily skip a peer when the expected -data is not yet prepared, instead of stalling. The default value auto-enables -it when the data is of non-float type; setting to `2`=`enabled for all types`, -might/will harm reproducibility of reductions with float types. - -* **reduce_load_balance** (default `0`=`non-leader`): Controls the -leader-to-member load balancing mode in reductions. Under `non-leader`, the -members, and not the leaders, perform reductions. With `top-level`, all members -as well as the leader of the top-most level perform reductions. With -`first-chunk`, leaders perform a single reduction on each level for a single -chunk at the beginning of the operation. `top+first` combines `top-level` and -`first-chunk`. Finally, with `all`, all ranks perform reductions equally. - -* **hierarchy** (default `"numa,socket"`): A comma separated list of -topological feature to which XHC's hierarchy-building algorithm should be -sensitive. `ompi_info` reports the possible values for the parameter. - - - In some ways, this is "just" a suggestion. The resulting hierarchy may - not exactly match the requested one. Reasons that this will occur: - - - A requested topological feature does not effectively segment the set - of ranks. (eg. `numa` was specified, but all ranks reside in the same - NUMA node) - - - No feature that all ranks have in common was provided. This a more - intrinsic detail, that you probably don't need to be aware of, but you - might come across if eg. you investigate the output of `print_info`. An - additional level will automatically be added in this case, no need to - worry about it. 
- - For all intents and purposes, a hierarchy of `numa,socket` is - interpreted as "segment the ranks according to NUMA node locality, - and then further segment them according to CPU socket locality". - - - The provided features will automatically be re-ordered when their - order does not match their order in the physical system. (unless a - virtual hierarchy is present in the list) - - - *Virtual Hierarchies*: The string may alternatively also contain "rank - lists" which specify exactly which ranks to group together, as well as some - other special modifiers. See - `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further - explanation as well as syntax information. - -* **chunk_size** (default `16K`): The chunk size for the pipelining process. -Data is processed (eg broadcast, reduced) in this-much sized pieces at once. - - - It's possible to have a different chunk size for each level of the - hierarchy, achieved via providing a comma-separated list of sizes (eg. - `"16K,16K,128K"`) instead of single one. The sizes in this list's *DO NOT* - correspond to the items on hierarchy list; the hierarchy keys might be - re-ordered or reduced to match the system, but the chunk sizes will be - consumed in the order they are given, left-to-right -> bottom-to-top. - -* **uniform_chunks** (default `true`): Automatically optimize the chunk size -in reduction collectives, according to the message size, so that all members -will perform equal work. - -* **uniform_chunks_min** (default `1K`): The lowest allowed value for the chunk -size when uniform chunks are enabled. - -* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will -be used for messages of *cico_max* or less bytes. - -*(Removed Parameters)* - -* **rcache_max**, **rcache_max_global** *(REMOVED with shift to opal/smsc)*: -Limit to number of attachments that the registration cache should hold. - - - A case can be made about their usefulness. 
If desired, shall be - re-implemented at smsc-level. - -## Limitations - -- *Intra-node support only* - - Define XHC as `coll/HAN`'s intra-node component to reap its benefits in - multi-node runs. - -- **Heterogeneity**: XHC does not support nodes with non-uniform (rank-wise) -datatype representations. (determined according to Open MPI's `proc_arch`) - -- **Non-commutative** operators are not supported by XHC's reduction -collectives. In past versions, they were, but only with a flat hierarchy; this -could make a return at some point. - -- **Derived Datatypes** are currently not supported. - -- XHC's Reduce currently only supports rank 0 as the root, and will -automatically fall back to another component for other cases. - -## Building - -This section describes how to compile the XHC component. - -XPMEM support in Open MPI is required to reap the full benefits of XHC. - -- The XHC component will build and work without XPMEM support, but for large -messages (i.e. ones above the CICO threshold) Allreduce/Reduce will be -disabled, and Broadcast will fall-back to less efficient mechanisms. - -- XPMEM can be obtained from , and then -compiled like a common kernel module. You might need to manually point Open -MPI's configure script to XPMEM's installation location, via the -`--with-xpmem=` parameter. - -- At run-time, you will need to insert the kernel module and obtain proper -access rights to `/dev/xpmem`. - -Apart from instructing Open MPI to include XPMEM support, the rest of the build -process is standard. General information on building Open MPI can be found in -its documentation. - - - - - -## Running - -General information on running Open MPI jobs can be found here: - - - -`mpirun`'s man page will also be useful: - - -In order for the XHC component to be chosen, its priority must be manually set -higher than other collectives components that implement the same primitives, -via the `coll_xhc_priority` MCA param. 
- - - Example: `--mca coll_xhc_priority 100` - -* Most likely, you will also want the `--bind-to core` param. Otherwise, the -reported process localities might be too general, preventing XHC from correctly -segmenting the system. (MCA `coll_xhc_print_info` will report the generated -hierarchy if you wish to experiment) - -### Tuning - -* Optional: You might wish to manually specify the topological features that -XHC's hierarchy should conform to. The default is `numa,socket`, which will -group the processes according to NUMA locality and then further group them -according to socket locality. See the `coll_xhc_hierarchy` param. - - - Example: `--mca coll_xhc_hierarchy numa,socket` - - Example: `--mca coll_xhc_hierarchy numa` - - Example: `--mca coll_xhc_hierarchy flat` - - In some systems, small-message Broadcast or the Barrier operation might - perform better with a flat tree instead of a hierarchical one. Currently, - manual benchmarking is required to accurately determine this. - -* Optional: You might wish to tune XHC's chunk size (default `16K`). Use the -`coll_xhc_chunk_size` param, and try values close to the default and see if -improvements are observed. - - - Example: `--mca coll_xhc_chunk_size 16K` - -* Optional: If you wish to focus on latencies of small/medium size messages, -you can try altering the cico-to-zcopy switchover point (MCA -`coll_xhc_cico_max`, default `1K`). - - - Example: `--mca coll_xhc_cico_max 1K` - -* Optional: If your application is heavy in Broadcast calls and you suspect -that specific ranks might be joining the collective with delay and causing -others to stall waiting for them, try enabling dynamic leadership (MCA -`coll_xhc_dynamic_leader`), and seeing if it makes an improvement. Please let -us know if it does :-). 
- - - Example: `--mca coll_xhc_dynamic_leader 1` - -### Example command lines - -*Assuming `PATH` and `LD_LIBRARY_PATH` have been set appropriately.* - -Default XHC configuration: -`$ mpirun --mca coll_xhc_priority 100 --bind-to core ` - -XHC w/ numa-sensitive hierarchy, chunk size @ 16K: -`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core ` - -XHC with flat hierarchy (ie. none at all): -`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] ` - -## Benchmarking - -This section outlines some tips for benchmarking XHC and intra-node MPI -collectives in general. - -### Micro-Benchmarks - -For our micro-benchmarking purposes, we have been using [OSU's microbenchmark -suite](https://mvapich.cse.ohio-state.edu/benchmarks/). However, when -micro-benchmarking intra-node collectives, there are some important details -that one needs to look out for. - -**CPU Cache** An issue with the OSU micro-benchmarks is that they use the same -buffer for each iteration without altering it. Since modern processors -implicitly cache data, this can lead to false/unrealistic/unrepresentative -results, given that actual real-world applications do not (usually/optimally!) -perform duplicate operations. - -Availability of collective operation source data on a processor's local cache -hierarchy will cause certain phenomenons (e.g. slow path memory transactions) -and their effects to remain hidden and undetected in the micro-benchmarking -process, even though they *will* negatively impact performance in actual -applications, - -We have created "data-varying" (`_dv` suffix) benchmarks to counter this -problem, which will alter the data before each iteration. - -**Microbenchmark's pre-op Barrier** One also needs to be aware how the barrier -that appears before each iteration in the OSU micro-benchmarks affects the -result, especially so when latencies of small messages are concerned. 
The -underlying implementation of this barrier and the speed/efficiency of its -"release stage" will affect how fast and how synchronized ranks will exit the -barrier, and therefore how fast/synchronized they will enter the benchmarked -collective operation. - -For as accurate/clean performance reporting as possible, use a barrier -implementation that has as low a latency as possible. Furthermore, ideally, -all ranks should exit the barrier at the exact same time -- this is more -complex to measure, but can make a difference. In order to have a common -baseline when benchmarking and comparing multiple collectives implementation, -use this same barrier implementation for all benchmark scenarios. - -In the environments we tested, XHC's barrier was the best performing one. To -make using this barrier easier, we have put together a small new collective -component, `XB` (= xhc barrier). - -XB creates a new nested (duplicate) communicator with a hint to prioritize XHC, -and delegates barrier operations to it. A slightly inconvenient side-effect is -that XHC needs to be on the coll list (MCA `--mca coll`); it doesn't need to -have a high priority, though it can't be less than 0. - -* To benchmark Open MPI's `coll/tuned` with XB: `--mca coll basic,libnbc,tuned,xb,xhc --mca coll_xhc_priority 0 --mca coll_xb_priority 95 --mca coll_tuned_priority 90` - -* Or XHC itself, with XB: `--mca coll basic,libnbc,xb,xhc --mca coll_xhc_priority 90 --mca coll_xb_priority 95` - -It is also possible to specify the hierarchy to be used for XB's barrier (the -request will be passed in string form to XHC, only for the nested communicator) -via the `coll_xb_hierarchy` MCA parameter. - -In our fork of the OSU micro-benchmarks, you will also find -"integrity-checking" variants (`_integrity` suffix). These can help verify that -collective operations complete successfully without data corruption. 
- -Our OSU micro-benchmarks fork: - - -The XB component: - - -### Applications - -We expect to see any meaningful performance improvement with XHC in actual -applications, only if they spend a non-insignificant percentage of their -runtime in the collective operations that XHC implements: Broadcast, Barrier, -Allreduce, Reduce. - -One known such application is [miniAMR](https://github.com/Mantevo/miniAMR). -The application parameters (e.g. the refine count and frequency) will affect -the amount of time spent in the Allreduce primitive. - -Another one is Microsoft's [CNTK](https://github.com/microsoft/CNTK), also -heavy in Allreduce, though it actually makes use of the non-blocking -`Iallreduce` variant. However, it can easily be converted to use the blocking -variant instead (contact for patch). Comparing the performance of the -unmodified CNTK with OpenMPI's `coll/libnbc`, versus that of the patched CNTK -with XHC reveals that this modification is sensible and beneficial. - -Finally, while we have not yet rigorously evaluated it, -[PiSvM](http://pisvm.sourceforge.net/) is another candidate, with intense use -of MPI Broadcast. 
- ---- - -Contact: George Katevenis (gkatev@ics.forth.gr), Manolis Ploumidis (ploumid@ics.forth.gr) -Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth +For additional info and resources about XHC, check the Open MPI docs: +https://docs.open-mpi.org/ diff --git a/ompi/mca/osc/osc.h b/ompi/mca/osc/osc.h index c8f77404c1c..b43f34ac3c5 100644 --- a/ompi/mca/osc/osc.h +++ b/ompi/mca/osc/osc.h @@ -216,6 +216,15 @@ typedef int (*ompi_osc_base_module_put_fn_t)(const void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_put_notify_fn_t)(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr, size_t origin_count, @@ -226,6 +235,23 @@ typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_get_notify_fn_t)(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_win_get_notify_value_fn_t)(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); + +typedef int (*ompi_osc_base_module_win_reset_notify_value_fn_t)(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); typedef int (*ompi_osc_base_module_accumulate_fn_t)(const void *origin_addr, size_t origin_count, @@ -276,6 +302,17 @@ typedef int (*ompi_osc_base_module_rput_fn_t)(const void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +typedef int (*ompi_osc_base_module_rput_notify_fn_t)(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + 
int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); + typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -286,6 +323,16 @@ typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +typedef int (*ompi_osc_base_module_rget_notify_fn_t)(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); typedef int (*ompi_osc_base_module_raccumulate_fn_t)(const void *origin_addr, size_t origin_count, @@ -371,7 +418,6 @@ typedef int (*ompi_osc_base_module_flush_local_all_fn_t)(struct ompi_win_t *win) * module structure. */ - // TODO: extend the struct and add pointers to put/get_with_notify functions struct ompi_osc_base_module_4_0_0_t { ompi_osc_base_module_win_shared_query_fn_t osc_win_shared_query; @@ -409,6 +455,12 @@ struct ompi_osc_base_module_4_0_0_t { ompi_osc_base_module_flush_all_fn_t osc_flush_all; ompi_osc_base_module_flush_local_fn_t osc_flush_local; ompi_osc_base_module_flush_local_all_fn_t osc_flush_local_all; + ompi_osc_base_module_put_notify_fn_t osc_put_notify; + ompi_osc_base_module_get_notify_fn_t osc_get_notify; + ompi_osc_base_module_win_get_notify_value_fn_t osc_win_get_notify_value; + ompi_osc_base_module_win_reset_notify_value_fn_t osc_win_reset_notify_value; + ompi_osc_base_module_rput_notify_fn_t osc_rput_notify; + ompi_osc_base_module_rget_notify_fn_t osc_rget_notify; }; typedef struct ompi_osc_base_module_4_0_0_t ompi_osc_base_module_4_0_0_t; typedef ompi_osc_base_module_4_0_0_t ompi_osc_base_module_t; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 
cc34c109683..14eeb928e40 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -1649,37 +1649,39 @@ int ompi_osc_rdma_shared_query( ptrdiff_t *disp_unit, void *baseptr) { int rc = OMPI_ERR_NOT_SUPPORTED; - ompi_osc_rdma_peer_t *peer; - int actual_rank = rank; + ompi_osc_rdma_peer_t *peer = NULL; ompi_osc_rdma_module_t *module = GET_MODULE(win); - peer = ompi_osc_module_get_peer (module, actual_rank); - if (NULL == peer) { - return OMPI_ERR_NOT_SUPPORTED; - } - /* currently only supported for allocated windows */ if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) { return OMPI_ERR_NOT_SUPPORTED; } - if (!ompi_osc_rdma_peer_local_base(peer)) { - return OMPI_ERR_NOT_SUPPORTED; - } - if (MPI_PROC_NULL == rank) { /* iterate until we find a rank that has a non-zero size */ for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { peer = ompi_osc_module_get_peer (module, i); - ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; - if (!ompi_osc_rdma_peer_local_base(peer)) { + if (NULL == peer) { + /* peer object not cached yet (typically non-local here since local peers are added eagerly) */ continue; - } else if (module->same_size && ex_peer->super.base) { - break; - } else if (ex_peer->size > 0) { - break; } + ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; + if (ompi_osc_rdma_peer_local_base(peer)) { + if (module->same_size && ex_peer->super.base) { + break; + } else if (ex_peer->size > 0) { + break; + } + } + // reset so we don't mistakenly use a peer without memory + peer = NULL; } + } else { + peer = ompi_osc_module_get_peer (module, rank); + } + + if (NULL == peer || !ompi_osc_rdma_peer_local_base(peer)) { + return OMPI_ERR_NOT_SUPPORTED; } if (module->same_size && module->same_disp_unit) { diff --git a/ompi/mca/osc/sm/osc_sm.h b/ompi/mca/osc/sm/osc_sm.h index 23afacd7d49..85d250bfa18 100644 --- a/ompi/mca/osc/sm/osc_sm.h +++ b/ompi/mca/osc/sm/osc_sm.h @@ -22,6 
+22,7 @@ typedef uint64_t osc_sm_post_type_t; typedef opal_atomic_uint64_t osc_sm_post_atomic_type_t; #define OSC_SM_POST_BITS 6 #define OSC_SM_POST_MASK 0x3f +#define OSC_SM_MAX_NOTIFY_COUNTERS 16 /* data shared across all peers */ struct ompi_osc_sm_global_state_t { @@ -47,6 +48,9 @@ struct ompi_osc_sm_node_state_t { opal_atomic_int32_t complete_count; ompi_osc_sm_lock_t lock; opal_atomic_lock_t accumulate_lock; + uint32_t notify_counter_count; + uint64_t notify_counter_offset; /* offset from segment_base, not raw pointer */ + }; typedef struct ompi_osc_sm_node_state_t ompi_osc_sm_node_state_t; @@ -79,7 +83,7 @@ struct ompi_osc_sm_module_t { size_t *sizes; void **bases; ptrdiff_t *disp_units; - uint64_t **notify_counters; + uint64_t *notify_counters; ompi_group_t *start_group; @@ -107,7 +111,6 @@ int ompi_osc_sm_detach(struct ompi_win_t *win, const void *base); int ompi_osc_sm_free(struct ompi_win_t *win); -// TODO: add put/get_with_notify prototypes int ompi_osc_sm_put(const void *origin_addr, size_t origin_count, @@ -118,6 +121,16 @@ int ompi_osc_sm_put(const void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + int ompi_osc_sm_put_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); + int ompi_osc_sm_get(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -127,6 +140,24 @@ int ompi_osc_sm_get(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); +int ompi_osc_sm_get_notify(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); + +int ompi_osc_sm_win_get_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); + +int 
ompi_osc_sm_win_reset_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); + int ompi_osc_sm_accumulate(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -176,6 +207,17 @@ int ompi_osc_sm_rput(const void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +int ompi_osc_sm_rput_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); + int ompi_osc_sm_rget(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -186,6 +228,17 @@ int ompi_osc_sm_rget(void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +int ompi_osc_sm_rget_notify(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); + int ompi_osc_sm_raccumulate(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, diff --git a/ompi/mca/osc/sm/osc_sm_comm.c b/ompi/mca/osc/sm/osc_sm_comm.c index f9bae370870..fbd4f17856c 100644 --- a/ompi/mca/osc/sm/osc_sm_comm.c +++ b/ompi/mca/osc/sm/osc_sm_comm.c @@ -17,9 +17,58 @@ #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "ompi/communicator/communicator.h" #include "osc_sm.h" +static inline uint64_t *osc_sm_target_notify_base(ompi_osc_sm_module_t *module, int target) +{ + if (NULL == module->segment_base) { + /* single-rank path: notify_counters is a regular local allocation */ + return module->notify_counters; + } + + return (uint64_t *) ((char *) module->segment_base + + module->node_states[target].notify_counter_offset); +} + +int 
+ompi_osc_sm_win_get_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value) +{ + ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t *) win->w_osc_module; + int rank = ompi_comm_rank(module->comm); + + if (notify < 0 || (uint32_t) notify >= module->node_states[rank].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + *value = (OMPI_MPI_COUNT_TYPE) osc_sm_target_notify_base(module, rank)[notify]; + opal_atomic_rmb(); + + return OMPI_SUCCESS; +} + +int +ompi_osc_sm_win_reset_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value) +{ + ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t *) win->w_osc_module; + int rank = ompi_comm_rank(module->comm); + + if (notify < 0 || (uint32_t) notify >= module->node_states[rank].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + /* Atomically swap the counter to 0 and return the previous value */ + *value = (OMPI_MPI_COUNT_TYPE) opal_atomic_swap_64( + &osc_sm_target_notify_base(module, rank)[notify], 0); + + return OMPI_SUCCESS; +} + int ompi_osc_sm_rput(const void *origin_addr, size_t origin_count, @@ -59,6 +108,53 @@ ompi_osc_sm_rput(const void *origin_addr, return OMPI_SUCCESS; } +int +ompi_osc_sm_rput_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rput_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + notify, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * 
target_disp; + + ret = ompi_datatype_sndrcv((void *)origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + /* the only valid field of RMA request status is the MPI_ERROR field. + * ompi_request_empty has status MPI_SUCCESS and indicates the request is + * complete. */ + *ompi_req = &ompi_request_empty; + + if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + opal_atomic_wmb(); + opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1); + + return OMPI_SUCCESS; +} int ompi_osc_sm_rget(void *origin_addr, @@ -99,6 +195,53 @@ ompi_osc_sm_rget(void *origin_addr, return OMPI_SUCCESS; } +int +ompi_osc_sm_rget_notify(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + notify, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, + origin_addr, origin_count, origin_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + /* the only valid field of RMA request status is the MPI_ERROR field. + * ompi_request_empty has status MPI_SUCCESS and indicates the request is + * complete. 
*/ + *ompi_req = &ompi_request_empty; + + if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + opal_atomic_rmb(); + opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1); + + return OMPI_SUCCESS; +} int ompi_osc_sm_raccumulate(const void *origin_addr, @@ -236,6 +379,48 @@ ompi_osc_sm_put(const void *origin_addr, } +int +ompi_osc_sm_put_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "put_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + notify, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv((void *)origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + opal_atomic_wmb(); + opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1); + + return ret; +} + int ompi_osc_sm_get(void *origin_addr, size_t origin_count, @@ -268,7 +453,7 @@ ompi_osc_sm_get(void *origin_addr, int -ompi_osc_sm_get_with_notify(void *origin_addr, +ompi_osc_sm_get_notify(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -294,9 +479,15 @@ ompi_osc_sm_get_with_notify(void *origin_addr, ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, 
origin_addr, origin_count, origin_dt);
-    // TODO: do the same for put_with_notify
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+    if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) {
+        return MPI_ERR_NOTIFY_IDX;
+    }
+
     opal_atomic_rmb();
-    opal_atomic_add(&module->notify_counters[target][notify], 1);
+    opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1);
 
     return ret;
 }
@@ -472,5 +663,5 @@ ompi_osc_sm_fetch_and_op(const void *origin_addr,
 done:
     opal_atomic_unlock(&module->node_states[target].accumulate_lock);
 
-    return OMPI_SUCCESS;;
+    return OMPI_SUCCESS;
 }
diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c
index 1ad9a48cfd2..259c0826017 100644
--- a/ompi/mca/osc/sm/osc_sm_component.c
+++ b/ompi/mca/osc/sm/osc_sm_component.c
@@ -70,8 +70,6 @@ ompi_osc_sm_component_t mca_osc_sm_component = {
 
 MCA_BASE_COMPONENT_INIT(ompi, osc, sm)
 
-// TODO: extend the struct and add pointers to put/get_with_notify functions
-// TODO: extend it to rput/rget_with_notify as well
 ompi_osc_sm_module_t ompi_osc_sm_module_template = {
     {
         .osc_win_shared_query = ompi_osc_sm_shared_query,
@@ -81,14 +79,20 @@ ompi_osc_sm_module_t ompi_osc_sm_module_template = {
         .osc_free = ompi_osc_sm_free,
 
         .osc_put = ompi_osc_sm_put,
+        .osc_put_notify = ompi_osc_sm_put_notify,
         .osc_get = ompi_osc_sm_get,
+        .osc_get_notify = ompi_osc_sm_get_notify,
+        .osc_win_get_notify_value = ompi_osc_sm_win_get_notify_value,
+        .osc_win_reset_notify_value = ompi_osc_sm_win_reset_notify_value,
         .osc_accumulate = ompi_osc_sm_accumulate,
         .osc_compare_and_swap = ompi_osc_sm_compare_and_swap,
         .osc_fetch_and_op = ompi_osc_sm_fetch_and_op,
         .osc_get_accumulate = ompi_osc_sm_get_accumulate,
 
         .osc_rput = ompi_osc_sm_rput,
+        .osc_rput_notify = ompi_osc_sm_rput_notify,
         .osc_rget = ompi_osc_sm_rget,
+        .osc_rget_notify = ompi_osc_sm_rget_notify,
         .osc_raccumulate = ompi_osc_sm_raccumulate,
         .osc_rget_accumulate = ompi_osc_sm_rget_accumulate,
 
@@ -253,12
+257,19 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis module->posts = calloc (1, sizeof(module->posts[0]) + sizeof (module->posts[0][0])); if (NULL == module->posts) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; module->posts[0] = (osc_sm_post_atomic_type_t *) (module->posts + 1); + + /* allocate notify counters for single process case */ + module->notify_counters = calloc(OSC_SM_MAX_NOTIFY_COUNTERS, sizeof(uint64_t)); + if (NULL == module->notify_counters) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + module->node_states[0].notify_counter_count = OSC_SM_MAX_NOTIFY_COUNTERS; + module->node_states[0].notify_counter_offset = 0; } else { - unsigned long total, *rbuf; + unsigned long total, total_counters, gather_values[2], *rbuf; int i, flag; size_t pagesize; size_t state_size; size_t posts_size, post_size = (comm_size + OSC_SM_POST_MASK) / (OSC_SM_POST_MASK + 1); + size_t notify_counters_size; size_t data_base_size; opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, @@ -267,7 +278,7 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis /* get the pagesize */ pagesize = opal_getpagesize(); - rbuf = malloc(sizeof(unsigned long) * comm_size); + rbuf = malloc(sizeof(unsigned long) * comm_size * 2 ); if (NULL == rbuf) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; /* Note that the alloc_shared_noncontig info key only has @@ -291,9 +302,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis "allocating window using contiguous strategy"); } - total = size; - ret = module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG, - rbuf, 1, MPI_UNSIGNED_LONG, + gather_values[0] = size; + gather_values[1] = OSC_SM_MAX_NOTIFY_COUNTERS; + ret = module->comm->c_coll->coll_allgather(gather_values, 2, MPI_UNSIGNED_LONG, + rbuf, 2, MPI_UNSIGNED_LONG, module->comm, module->comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { @@ -302,8 +314,10 @@ 
component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis } total = 0; + total_counters = 0; for (i = 0 ; i < comm_size ; ++i) { - total += rbuf[i]; + total += rbuf[2 * i]; + total_counters += rbuf[2 * i + 1]; if (module->noncontig) { total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize); } @@ -314,7 +328,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis state_size += OPAL_ALIGN_PAD_AMOUNT(state_size, 64); posts_size = comm_size * post_size * sizeof (module->posts[0][0]); posts_size += OPAL_ALIGN_PAD_AMOUNT(posts_size, 64); - data_base_size = state_size + posts_size; + notify_counters_size = total_counters * sizeof(uint64_t); + notify_counters_size += OPAL_ALIGN_PAD_AMOUNT(notify_counters_size, 64); + data_base_size = state_size + posts_size + notify_counters_size; data_base_size += OPAL_ALIGN_PAD_AMOUNT(data_base_size, pagesize); if (0 == ompi_comm_rank (module->comm)) { char *data_file; @@ -375,15 +391,27 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis module->global_state = (ompi_osc_sm_global_state_t *) (module->posts[0] + comm_size * post_size); module->node_states = (ompi_osc_sm_node_state_t *) (module->global_state + 1); - for (i = 0, total = data_base_size ; i < comm_size ; ++i) { + /* set up notify counters in shared memory after node_states */ + module->notify_counters = (uint64_t *) ((char *)(module->node_states + comm_size) + + OPAL_ALIGN_PAD_AMOUNT((uintptr_t)(module->node_states + comm_size), 64)); + /* zero out notify counters */ + memset(module->notify_counters, 0, total_counters * sizeof(uint64_t)); + + for (i = 0, total = data_base_size, total_counters = 0 ; i < comm_size ; ++i) { if (i > 0) { module->posts[i] = module->posts[i - 1] + post_size; } - module->sizes[i] = rbuf[i]; + module->node_states[i].notify_counter_count = (uint32_t) rbuf[2 * i + 1]; + module->node_states[i].notify_counter_offset = + (uint64_t) ((char *) (module->notify_counters + 
total_counters) - + (char *) module->segment_base); + total_counters += rbuf[2 * i + 1]; + + module->sizes[i] = rbuf[2 * i]; if (module->sizes[i] || !module->noncontig) { module->bases[i] = ((char *) module->segment_base) + total; - total += rbuf[i]; + total += rbuf[2 * i]; if (module->noncontig) { total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize); } @@ -397,7 +425,8 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis /* initialize my state shared */ module->my_node_state = &module->node_states[ompi_comm_rank(module->comm)]; - memset (module->my_node_state, 0, sizeof(*module->my_node_state)); + module->my_node_state->complete_count = 0; + memset (&module->my_node_state->lock, 0, sizeof(module->my_node_state->lock)); *base = module->bases[ompi_comm_rank(module->comm)]; @@ -553,6 +582,7 @@ ompi_osc_sm_free(struct ompi_win_t *win) module->comm->c_coll->coll_barrier_module); opal_shmem_segment_detach (&module->seg_ds); + /* notify_counters points into shared memory segment, no separate free needed */ } else { free(module->node_states); free(module->global_state); @@ -560,6 +590,8 @@ ompi_osc_sm_free(struct ompi_win_t *win) mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->bases[0]); } + /* free notify_counters for single process case */ + free(module->notify_counters); } free(module->disp_units); free(module->outstanding_locks); diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c index ab122e67263..0354edb71c0 100644 --- a/ompi/mca/osc/ucx/osc_ucx_comm.c +++ b/ompi/mca/osc/ucx/osc_ucx_comm.c @@ -944,7 +944,7 @@ static inline int ompi_osc_ucx_check_ops_and_flush (ompi_osc_ucx_module_t *modul uint64_t base_tmp, tail_tmp; int ret = OMPI_SUCCESS; - if (module->ctx->num_incomplete_req_ops > ompi_osc_ucx_outstanding_ops_flush_threshold) { + if ((size_t)module->ctx->num_incomplete_req_ops > ompi_osc_ucx_outstanding_ops_flush_threshold) { ret = opal_common_ucx_ctx_flush(module->ctx, 
OPAL_COMMON_UCX_SCOPE_WORKER, 0); if (ret != OPAL_SUCCESS) { ret = OMPI_ERROR; diff --git a/ompi/mca/part/persist/part_persist.h b/ompi/mca/part/persist/part_persist.h index ccc8f8f1971..86fb9bac42d 100644 --- a/ompi/mca/part/persist/part_persist.h +++ b/ompi/mca/part/persist/part_persist.h @@ -490,7 +490,7 @@ mca_part_persist_psend_init(const void* buf, return err; } -__opal_attribute_always_inline__ static inline int +static inline int mca_part_persist_start(size_t count, ompi_request_t** requests) { int err = OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1_iprobe.c b/ompi/mca/pml/ob1/pml_ob1_iprobe.c index 4d6a0eb8dfd..97744cce5dc 100644 --- a/ompi/mca/pml/ob1/pml_ob1_iprobe.c +++ b/ompi/mca/pml/ob1/pml_ob1_iprobe.c @@ -47,6 +47,11 @@ int mca_pml_ob1_iprobe(int src, *matched = 1; } else { *matched = 0; +#if OPAL_ENABLE_FT_MPI + if( ompi_request_is_failed((ompi_request_t*)&recvreq) ) { + rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR; + } +#endif opal_progress(); } MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); @@ -119,6 +124,11 @@ mca_pml_ob1_improbe(int src, (*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount; } else { *matched = 0; +#if OPAL_ENABLE_FT_MPI + if( ompi_request_is_failed((ompi_request_t*)recvreq) ) { + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + } +#endif /* we only free if we didn't match, because we're going to translate the request into a receive request later on if it diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 57aba677a8a..a6a2866f2a2 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -108,16 +108,19 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, } if( !request->req_match_received ) { /* the match has not been already done */ assert( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ); /* not matched isn't it */ + 
if(OPAL_LIKELY(request->req_recv.req_base.req_type != MCA_PML_REQUEST_IPROBE &&
+                   request->req_recv.req_base.req_type != MCA_PML_REQUEST_IMPROBE)) {
 #if MCA_PML_OB1_CUSTOM_MATCH
-        custom_match_prq_cancel(ob1_comm->prq, request);
+            custom_match_prq_cancel(ob1_comm->prq, request);
 #else
-        if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
-            opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request );
-        } else {
-            mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer);
-            opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
-        }
+            if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
+                opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request );
+            } else {
+                mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer);
+                opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
+            }
 #endif
+        }
         PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
                                  &(request->req_recv.req_base), PERUSE_RECV );
         OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock);
diff --git a/ompi/mpi/bindings/ompi_bindings/consts.py b/ompi/mpi/bindings/ompi_bindings/consts.py
index 43bca486b57..759b342f64a 100644
--- a/ompi/mpi/bindings/ompi_bindings/consts.py
+++ b/ompi/mpi/bindings/ompi_bindings/consts.py
@@ -23,6 +23,7 @@
     'MPI_SUCCESS',
     'MPI_ERR_BUFFER',
     'MPI_ERR_COUNT',
+    'MPI_ERR_NOTIFY_IDX',
     'MPI_ERR_TYPE',
     'MPI_ERR_TAG',
     'MPI_ERR_COMM',
diff --git a/ompi/mpi/c/Makefile.am b/ompi/mpi/c/Makefile.am
index 25b871fa7d4..49619694d0b 100644
--- a/ompi/mpi/c/Makefile.am
+++ b/ompi/mpi/c/Makefile.am
@@ -223,6 +223,7 @@ prototype_sources = \
         get_accumulate.c.in \
         get_address.c.in \
         get.c.in \
+        get_notify.c.in \
         get_count.c.in \
         get_elements.c.in \
         get_elements_x.c.in \
@@ -341,6 +342,7 @@ prototype_sources = \
         psend_init.c.in \
         publish_name.c.in \
         put.c.in \
+        put_notify.c.in \
         query_thread.c.in \
         raccumulate.c.in \
         recv.c.in 
\ @@ -484,6 +486,8 @@ prototype_sources = \ win_get_group.c.in \ win_get_info.c.in \ win_get_name.c.in \ + win_get_notify_value.c.in \ + win_reset_notify_value.c.in \ win_lock_all.c.in \ win_lock.c.in \ win_post.c.in \ @@ -954,6 +958,8 @@ interface_profile_sources = \ win_get_group_generated.c \ win_get_info_generated.c \ win_get_name_generated.c \ + win_get_notify_value_generated.c \ + win_reset_notify_value_generated.c \ win_lock_all_generated.c \ win_lock_generated.c \ win_post_generated.c \ diff --git a/ompi/mpi/c/get_notify.c.in b/ompi/mpi/c/get_notify.c.in new file mode 100644 index 00000000000..1bad16944ab --- /dev/null +++ b/ompi/mpi/c/get_notify.c.in @@ -0,0 +1,77 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 Triad National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include <stdio.h> + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/runtime/ompi_spc.h" + +PROTOTYPE ERROR_CLASS get_notify(BUFFER_OUT origin_addr, COUNT origin_count, + DATATYPE origin_datatype, INT target_rank, + AINT target_disp, COUNT target_count, + DATATYPE target_datatype, INT notification_idx, WIN win) +{ + int rc; + + SPC_RECORD(OMPI_SPC_GET_NOTIFY, 1); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS; + + rc = win->w_osc_module->osc_get_notify(origin_addr, origin_count, origin_datatype, + target_rank, target_disp, target_count, + target_datatype, notification_idx, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/put_notify.c.in b/ompi/mpi/c/put_notify.c.in new file mode 100644 index 00000000000..14ee5c7e365 --- /dev/null +++ b/ompi/mpi/c/put_notify.c.in @@ -0,0 +1,80 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* 
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 Triad National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include <stdio.h> + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/runtime/ompi_spc.h" + +PROTOTYPE ERROR_CLASS put_notify(BUFFER origin_addr, COUNT origin_count, DATATYPE origin_datatype, + INT target_rank, AINT target_disp, COUNT target_count, + DATATYPE target_datatype, INT notification_idx, WIN win) +{ + int rc; + + SPC_RECORD(OMPI_SPC_PUT_NOTIFY, 1); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if (NULL == target_datatype || + MPI_DATATYPE_NULL == target_datatype) { + rc = 
MPI_ERR_TYPE; + } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS; + + rc = win->w_osc_module->osc_put_notify(origin_addr, origin_count, origin_datatype, + target_rank, target_disp, target_count, + target_datatype, notification_idx, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_get_notify_value.c.in b/ompi/mpi/c/win_get_notify_value.c.in new file mode 100644 index 00000000000..228999c13ea --- /dev/null +++ b/ompi/mpi/c/win_get_notify_value.c.in @@ -0,0 +1,41 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2026 Triad National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +PROTOTYPE ERROR_CLASS win_get_notify_value(WIN win, INT notification_idx, ELEMENT_COUNT value) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else if (NULL == value) { + rc = MPI_ERR_ARG; + } + + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + rc = win->w_osc_module->osc_win_get_notify_value(win, notification_idx, value); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_reset_notify_value.c.in b/ompi/mpi/c/win_reset_notify_value.c.in new file mode 100644 index 00000000000..99aa1755a76 --- /dev/null +++ b/ompi/mpi/c/win_reset_notify_value.c.in @@ -0,0 +1,41 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2026 Triad National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +PROTOTYPE ERROR_CLASS win_reset_notify_value(WIN win, INT notification_idx, ELEMENT_COUNT value) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else if (NULL == value) { + rc = MPI_ERR_ARG; + } + + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + rc = win->w_osc_module->osc_win_reset_notify_value(win, notification_idx, value); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/fortran/mpif-h/request_get_status_f.c b/ompi/mpi/fortran/mpif-h/request_get_status_f.c index 7a5c9d57716..7fac2b2e051 100644 --- a/ompi/mpi/fortran/mpif-h/request_get_status_f.c +++ b/ompi/mpi/fortran/mpif-h/request_get_status_f.c @@ -12,6 +12,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -75,16 +76,11 @@ void ompi_request_get_status_f(MPI_Fint *request, ompi_fortran_logical_t *flag, MPI_Request c_req = PMPI_Request_f2c( *request ); OMPI_LOGICAL_NAME_DECL(flag); - /* This seems silly, but someone will do it */ - - if (OMPI_IS_FORTRAN_STATUS_IGNORE(status)) { - *flag = OMPI_INT_2_LOGICAL(0); - c_ierr = MPI_SUCCESS; - } else { - c_ierr = PMPI_Request_get_status(c_req, - OMPI_LOGICAL_SINGLE_NAME_CONVERT(flag), - &c_status); - OMPI_SINGLE_INT_2_LOGICAL(flag); + c_ierr = PMPI_Request_get_status(c_req, + OMPI_LOGICAL_SINGLE_NAME_CONVERT(flag), + &c_status); + OMPI_SINGLE_INT_2_LOGICAL(flag); + if (!OMPI_IS_FORTRAN_STATUS_IGNORE(status)) { PMPI_Status_c2f( &c_status, status ); } if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr); diff --git a/ompi/request/req_ft.c b/ompi/request/req_ft.c index 2c53ce076b0..e855afc59fd 100644 --- a/ompi/request/req_ft.c +++ b/ompi/request/req_ft.c @@ -128,7 +128,9 @@ bool ompi_request_is_failed_fn(ompi_request_t *req) req->req_status.MPI_ERROR = MPI_ERR_PROC_FAILED_PENDING; /* If it is a probe/mprobe, escalate the error */ if( (MCA_PML_REQUEST_MPROBE == pml_req->req_type) || - (MCA_PML_REQUEST_PROBE == pml_req->req_type) ) { + (MCA_PML_REQUEST_IMPROBE == pml_req->req_type) || + (MCA_PML_REQUEST_PROBE == pml_req->req_type) || + (MCA_PML_REQUEST_IPROBE == pml_req->req_type) ) { req->req_status.MPI_ERROR = MPI_ERR_PROC_FAILED; } opal_output_verbose(10, ompi_ftmpi_output_handle, diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index ad8a328dc55..08c6efaa616 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -24,6 +24,7 @@ * reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2026 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -281,14 +282,25 @@ int ompi_mpi_finalize(void) * communications/actions to complete. See * https://github.com/open-mpi/ompi/issues/1576 for the * original bug report. */ - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - OMPI_ERROR_LOG(ret); + rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active); + if (PMIX_SUCCESS != rc) { /* Reset the active flag to false, to avoid waiting for * completion when the fence was failed. */ active = false; + // can return operation_succeeded if atomically completed + if (PMIX_OPERATION_SUCCEEDED == rc) { + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + OMPI_ERROR_LOG(ret); + } + } else { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + /* NOTE: we lose the fence return status here. This can be + * a problem as the fence CAN fail. Might consider retrieving + * the returned status so you can respond if it doesn't + * successfully complete? */ } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } ompi_mpi_instance_finalize (&ompi_mpi_instance_default); diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index c7e61c5bf94..deea53cb02e 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -26,7 +26,7 @@ * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2026 Nanook Consulting All rights reserved. * Copyright (c) 2021-2022 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
@@ -464,12 +464,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; + rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + error = "PMIx_Fence_nb() failed"; + goto error; + } } } } else { @@ -482,12 +487,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence() failed"; - goto error; + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + error = "PMIx_Fence_nb() failed"; + goto error; + } + } else { + /* cannot just wait on thread as we need to call opal_progress */ + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } @@ -537,7 +549,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, * we have to wait here for it to complete. However, there * is no reason to do two barriers! 
*/ if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); + if (active) { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + } } else if (!ompi_async_mpi_init) { /* wait for everyone to reach this point - this is a hard * barrier requirement at this time, though we hope to relax @@ -546,13 +560,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; + rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + error = "PMIx_Fence_nb() failed"; + goto error; + } + } else { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c747d55ee7d..7b5d1f3c55e 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -104,11 +104,12 @@ bool ompi_ftmpi_enabled = false; #endif /* OPAL_ENABLE_FT_MPI */ static int ompi_stream_buffering_mode = -1; +static int ompi_mpi_ft_verbose = 0; int ompi_comm_verbose_level = 0; int ompi_mpi_register_params(void) { - int value; + int value = 0; #if OPAL_ENABLE_FT_MPI mca_base_var_scope_t ftscope = MCA_BASE_VAR_SCOPE_READONLY; @@ -121,15 +122,14 @@ int ompi_mpi_register_params(void) "Enable UFLM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_4, ftscope, &ompi_ftmpi_enabled); - value = 0; (void) mca_base_var_register ("ompi", "mpi", "ft", "verbose", "Verbosity level of the ULFM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_INT, NULL, 0, 
MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &value); + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_mpi_ft_verbose); #if OPAL_ENABLE_FT_MPI - if( 0 < value ) { + if( 0 < ompi_mpi_ft_verbose ) { ompi_ftmpi_output_handle = opal_output_open(NULL); - opal_output_set_verbosity(ompi_ftmpi_output_handle, value); + opal_output_set_verbosity(ompi_ftmpi_output_handle, ompi_mpi_ft_verbose); } (void) ompi_comm_rbcast_register_params(); diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index 651cf9d0b5a..f94df4fbd5d 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -85,56 +85,20 @@ static int _setup_proc_session_dir(char **sdir); #define OPAL_PRINT_NAME_ARGS_MAX_SIZE 50 #define OPAL_PRINT_NAME_ARG_NUM_BUFS 16 -static bool fns_init=false; -static opal_tsd_tracked_key_t print_args_tsd_key; static char* opal_print_args_null = "NULL"; typedef struct { - char *buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS]; + char buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS][OPAL_PRINT_NAME_ARGS_MAX_SIZE + 1]; int cntr; } opal_print_args_buffers_t; -static void -buffer_cleanup(void *value) -{ - int i; - opal_print_args_buffers_t *ptr; - - if (NULL != value) { - ptr = (opal_print_args_buffers_t*)value; - for (i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) { - free(ptr->buffers[i]); - } - free (ptr); - } - fns_init = false; -} - static opal_print_args_buffers_t* get_print_name_buffer(void) { - opal_print_args_buffers_t *ptr; - int ret, i; - - if (!fns_init) { - /* setup the print_args function */ - OBJ_CONSTRUCT(&print_args_tsd_key, opal_tsd_tracked_key_t); - opal_tsd_tracked_key_set_destructor(&print_args_tsd_key, buffer_cleanup); - fns_init = true; - } - - ret = opal_tsd_tracked_key_get(&print_args_tsd_key, (void**)&ptr); - if (OPAL_SUCCESS != ret) return NULL; + static opal_thread_local opal_print_args_buffers_t name_buffer = { + .cntr = 0 + }; - if (NULL == ptr) { - ptr = (opal_print_args_buffers_t*)malloc(sizeof(opal_print_args_buffers_t)); - for 
(i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) { - ptr->buffers[i] = (char *) malloc((OPAL_PRINT_NAME_ARGS_MAX_SIZE+1) * sizeof(char)); - } - ptr->cntr = 0; - ret = opal_tsd_tracked_key_set(&print_args_tsd_key, (void*)ptr); - } - - return (opal_print_args_buffers_t*) ptr; + return &name_buffer; } static char* ompi_pmix_print_jobids(const opal_jobid_t job) @@ -1043,10 +1007,6 @@ int ompi_rte_finalize(void) opal_process_info.initial_errhandler = NULL; } - if (fns_init) { - OBJ_DESTRUCT(&print_args_tsd_key); - } - /* cleanup our internal nspace hack */ opal_pmix_finalize_nspace_tracker(); diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 6f1d8aa7d6a..dcbbe04b256 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -71,8 +71,10 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV, "The number of times MPI_Sendrecv was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV_REPLACE, "The number of times MPI_Sendrecv_replace was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_PUT, "The number of times MPI_Put was called.", false, false), + SET_COUNTER_ARRAY(OMPI_SPC_PUT_NOTIFY, "The number of times MPI_Put_notify was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_RPUT, "The number of times MPI_Rput was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_GET, "The number of times MPI_Get was called.", false, false), + SET_COUNTER_ARRAY(OMPI_SPC_GET_NOTIFY, "The number of times MPI_Get_notify was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_RGET, "The number of times MPI_Rget was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_PROBE, "The number of times MPI_Probe was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_IPROBE, "The number of times MPI_Iprobe was called.", false, false), diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 76ec7f25f16..3d0efd257b3 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ 
-58,8 +58,10 @@ typedef enum ompi_spc_counters { OMPI_SPC_SENDRECV, OMPI_SPC_SENDRECV_REPLACE, OMPI_SPC_PUT, + OMPI_SPC_PUT_NOTIFY, OMPI_SPC_RPUT, OMPI_SPC_GET, + OMPI_SPC_GET_NOTIFY, OMPI_SPC_RGET, OMPI_SPC_PROBE, OMPI_SPC_IPROBE, diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am index c0cdf788e8d..9aed69bfb7f 100644 --- a/opal/mca/btl/smcuda/Makefile.am +++ b/opal/mca/btl/smcuda/Makefile.am @@ -46,15 +46,11 @@ component_noinst = libmca_btl_smcuda.la component_install = endif -# See opal/mca/common/cuda/Makefile.am for an explanation of -# libmca_common_sm.la. - mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources) mca_btl_smcuda_la_LDFLAGS = -module -avoid-version $(btl_smcuda_LDFLAGS) mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \ $(btl_smcuda_LIBS) mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 1ce2b966ece..e832c8ed81e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -235,7 +235,6 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s free(loc); } else { /* If we have hwloc support, then get accurate information */ - loc = NULL; if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) { rc = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); @@ -249,6 +248,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s mca_btl_smcuda_component.num_mem_nodes = rc; } } + loc = NULL; /* see if we were given our location */ OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &OPAL_PROC_MY_NAME, &loc, PMIX_STRING); if (OPAL_SUCCESS == rc) { @@ -267,6 +267,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, 
int32_t my_s free(mynuma); } free(loc); + loc = NULL; } } else { /* If we have hwloc support, then get accurate information */ diff --git a/opal/mca/btl/smcuda/configure.m4 b/opal/mca/btl/smcuda/configure.m4 new file mode 100644 index 00000000000..e9cb2df2996 --- /dev/null +++ b/opal/mca/btl/smcuda/configure.m4 @@ -0,0 +1,29 @@ +# Copyright (c) 2024 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator BTL. This assumes the discovery has already been done. +# +# Beware: unlike what the name seems to indicate, this BTL is generic and used by +# all accelerators. + +AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[ + AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. + AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/opal/mca/rcache/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4 new file mode 100644 index 00000000000..d721910500e --- /dev/null +++ b/opal/mca/rcache/gpusm/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator rcache component. +# +AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[ + + AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. 
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/opal/mca/rcache/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4 new file mode 100644 index 00000000000..f5e3eda0154 --- /dev/null +++ b/opal/mca/rcache/rgpusm/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator rcache component. +# +AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[ + + AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. + AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/opal/mca/smsc/accelerator/configure.m4 b/opal/mca/smsc/accelerator/configure.m4 new file mode 100644 index 00000000000..9fa993e9cf5 --- /dev/null +++ b/opal/mca/smsc/accelerator/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator SMSC component. +# +AC_DEFUN([MCA_opal_smsc_accelerator_CONFIG],[ + + AC_CONFIG_FILES([opal/mca/smsc/accelerator/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. 
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/oshmem/mca/memheap/base/memheap_base_frame.c b/oshmem/mca/memheap/base/memheap_base_frame.c index 53a71b27a9e..82658e09791 100644 --- a/oshmem/mca/memheap/base/memheap_base_frame.c +++ b/oshmem/mca/memheap/base/memheap_base_frame.c @@ -33,9 +33,9 @@ int mca_memheap_base_output = -1; int mca_memheap_base_key_exchange = 1; -opal_list_t mca_memheap_base_components_opened = {{0}}; +opal_list_t mca_memheap_base_components_opened = {}; int mca_memheap_base_already_opened = 0; -mca_memheap_map_t mca_memheap_base_map = {{{{0}}}}; +mca_memheap_map_t mca_memheap_base_map = {}; int mca_memheap_num_segments_warn = 32; static int mca_memheap_base_register(mca_base_register_flag_t flags) diff --git a/oshmem/shmem/c/shmem_put_nb.c b/oshmem/shmem/c/shmem_put_nb.c index 89e4bf18240..cef6abcc40b 100644 --- a/oshmem/shmem/c/shmem_put_nb.c +++ b/oshmem/shmem/c/shmem_put_nb.c @@ -11,6 +11,7 @@ #include "oshmem/constants.h" #include "oshmem/include/shmem.h" +#include "oshmem/include/shmemx.h" #include "oshmem/runtime/runtime.h"