diff --git a/.ci/community-jenkins/Jenkinsfile b/.ci/community-jenkins/Jenkinsfile
index f4b305f1d66..2c20d630ac1 100644
--- a/.ci/community-jenkins/Jenkinsfile
+++ b/.ci/community-jenkins/Jenkinsfile
@@ -14,7 +14,6 @@
//
//
// WORKSPACE Layout:
-// autotools-install/ Autotools install for the builder
// ompi/ Open MPI source tree
// We if we push changes to a PR, we don't need to keep old jobs running, so
@@ -56,9 +55,26 @@ println('Tests Completed')
// although currently we only support the one stage of "everything", where each
// build stage is a map of different configurations to test.
def prepare_check_stages() {
- def configure_options = ["--disable-dlopen", "--disable-oshmem", "--enable-builtin-atomic", "--enable-ipv6"]
- def compilers = ["clang10", "gcc7", "gcc8", "gcc9", "gcc10"]
- def platforms = ["amazon_linux_2", "amazon_linux_2-arm64", "rhel8"]
+ def configure_options = [
+ "--disable-dlopen",
+ "--disable-oshmem",
+ "--enable-builtin-atomic",
+ "--enable-ipv6"
+ ]
+ def compilers = [
+ "gcc14",
+ "clang18"
+ ]
+ def platforms = [
+ "amazon_linux_2",
+ "amazon_linux_2-arm64",
+ "rhel8",
+ "amazon_linux_2023-arm64",
+ "amazon_linux_2023-x86_64",
+ "ubuntu_20.04",
+ "ubuntu_24.04-arm64",
+ "ubuntu_24.04-x86_64"
+ ]
def check_stages_list = []
// Build everything stage
@@ -79,6 +95,7 @@ def prepare_check_stages() {
}
build_parallel_map.put("distcheck", prepare_build("distcheck", "tarball_build", "--distcheck"))
+ build_parallel_map.put("vpath", prepare_build("vpath", "", "--build-dir ompi-build"))
check_stages_list.add(build_parallel_map)
@@ -89,14 +106,20 @@ def prepare_build(build_name, label, build_arg) {
return {
stage("${build_name}") {
node(label) {
- checkout(changelog: false, poll: false, scm: scm)
+ // Checkout into ompi-source instead of the top of the
+ // workspace, so that we have room in the workspace to setup a
+ // vpath build.
+ dir ('ompi-source') {
+ checkout(changelog: false, poll: false, scm: scm)
+ }
+
// If pr-builder.sh fails, the sh step will throw an exception,
// which we catch so that the job doesn't abort and continues on
// to other steps - such as cleanup. Because we catch the
// exception, we need to tell Jenkins the overall job has
// failed.
try {
- sh "/bin/bash -x .ci/community-jenkins/pr-builder.sh ${build_arg} ompi"
+ sh "/bin/bash -x ompi-source/.ci/community-jenkins/pr-builder.sh ${build_arg} --source-dir ompi-source"
} catch (Exception e) {
currentBuild.result = "FAILURE"
}
diff --git a/.ci/community-jenkins/pr-builder.sh b/.ci/community-jenkins/pr-builder.sh
index eb88b4c1538..88426859bf0 100755
--- a/.ci/community-jenkins/pr-builder.sh
+++ b/.ci/community-jenkins/pr-builder.sh
@@ -21,6 +21,8 @@ MAKE_ARGS=
MAKE_J="-j 8"
PREFIX="${WORKSPACE}/install"
MPIRUN_MODE=${MPIRUN_MODE:-runall}
+SOURCE_DIR=
+BUILD_DIR=
#
# Options Parsing
@@ -77,6 +79,24 @@ while (( "$#" )); do
exit 1
fi
;;
+ --source-dir)
+ if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
+ SOURCE_DIR=$2
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
+ --build-dir)
+ if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
+ BUILD_DIR=$2
+ shift 2
+ else
+ echo "Error: Argument for $1 is missing" >&2
+ exit 1
+ fi
+ ;;
-*|--*=) # Unsupported flags
echo "Error: Unsupported flag $1" >&2
exit 1
@@ -105,93 +125,43 @@ fi
echo "--> platform: $PLATFORM_ID"
echo "--> version: $VERSION_ID"
+if test "${SOURCE_DIR}" = "" ; then
+    echo "SOURCE_DIR is unset. Cannot continue."
+ exit 1
+fi
+
+echo "--> Workspace: ${WORKSPACE}"
+echo "--> Source Dir: ${SOURCE_DIR}"
+echo "--> Build Dir: ${BUILD_DIR}"
+echo "--> Install Dir: ${PREFIX}"
+
#
# See if builder provided a compiler we should use, and translate it to
# CONFIGURE_ARGS.
#
-case ${PLATFORM_ID} in
- rhel)
- case "$COMPILER" in
- gcc48|"")
- echo "--> Using default compilers"
- ;;
- *)
- echo "Unsupported compiler ${COMPILER}. Aborting"
- exit 1
- ;;
- esac
- ;;
- amzn)
- case "$COMPILER" in
- "")
- echo "--> Using default compilers"
- ;;
- gcc44)
- CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc44 CXX=g++44 FC=gfortran44"
- ;;
- gcc48)
- CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc48 CXX=g++48 FC=gfortran48"
- ;;
- clang36)
- CONFIGURE_ARGS="$CONFIGURE_ARGS CC=clang CXX=clang++ --disable-mpi-fortran"
- ;;
- *)
- echo "Unsupported compiler ${COMPILER}. Aborting"
- exit 1
- ;;
- esac
- ;;
- ubuntu)
- case "$COMPILER" in
- "")
- echo "--> Using default compilers"
- ;;
- gcc4*)
- version=`echo "$COMPILER" | sed -e 's/gcc4\([0-9]*\)/4.\1/'`
- CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}"
- ;;
- gcc*)
- version=`echo "$COMPILER" | sed -e 's/gcc\([0-9]*\)/\1/'`
- CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}"
- ;;
- clang3*|clang4*|clang5*|clang6*)
- version=`echo "$COMPILER" | sed -e 's/clang\([0-9]\)\([0-9]*\)/\1.\2/'`
- CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran"
- ;;
+if test "${COMPILER}" != "" ; then
+ if test ! -r ${HOME}/ompi-compiler-setup.sh ; then
+ echo "Could not find compiler setup script ompi-compiler-setup.sh. Aborting."
+ exit 1
+ fi
+
+ . ${HOME}/ompi-compiler-setup.sh
+ activate_compiler ${COMPILER}
+
+ CONFIGURE_ARGS="${CONFIGURE_ARGS} CC=${CC} CPP=${CPP} CXX=${CXX} FC=${FC}"
+ if test "$FC" = "" ; then
+ CONFIGURE_ARGS="${CONFIGURE_ARGS} --disable-mpi-fortran"
+ else
+ # Flang doesn't seem good enough (yet) to compile our Fortran bindings,
+ # so skip for now.
+ case "${COMPILER}" in
clang*)
- version=`echo "$COMPILER" | sed -e 's/clang\([0-9]*\)/\1/'`
- CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran"
- ;;
- *)
- echo "Unsupported compiler ${COMPILER}. Aborting"
- exit 1
+ CONFIGURE_ARGS="${CONFIGURE_ARGS} --disable-mpi-fortran"
;;
esac
- ;;
- sles)
- case "$COMPILER" in
- "")
- echo "--> Using default compilers"
- ;;
- gcc48)
- CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-48 CXX=g++-48 FC=gfortran-48"
- ;;
- gcc5)
- CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-5 CXX=g++-5 FC=gfortran-5"
- ;;
- gcc6)
- CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-6 CXX=g++-6 FC=gfortran-6"
- ;;
- *)
- echo "Unsupported compiler ${COMPILER}. Aborting"
- exit 1
- ;;
- esac
- ;;
- FreeBSD)
- CONFIGURE_ARGS="$CONFIGURE_ARGS LDFLAGS=-Wl,-rpath,/usr/local/lib/gcc5 --with-wrapper-ldflags=-Wl,-rpath,/usr/local/lib/gcc5"
- ;;
-esac
+ fi
+fi
+
CONFIGURE_ARGS="$CONFIGURE_ARGS --disable-silent-rules"
echo "--> Compiler setup: $CONFIGURE_ARGS"
@@ -210,10 +180,20 @@ fi
echo "--> Autogen arguments: $AUTOGEN_ARGS"
echo "--> Configure arguments: $CONFIGURE_ARGS"
+cd "${WORKSPACE}/${SOURCE_DIR}"
+
# Build
sha1=`git rev-parse HEAD`
echo "--> Building commit ${sha1}"
+if test -r "${HOME}/ompi-setup-python.sh" ; then
+ echo "--> Initializing Python environment"
+ . ${HOME}/ompi-setup-python.sh
+ find . -name "requirements.txt" -exec ${PIP_CMD} install -r {} \;
+else
+ echo "--> No Python environment found, hoping for the best."
+fi
+
if test -f autogen.pl; then
echo "--> running ./autogen.pl ${AUTOGEN_ARGS}"
./autogen.pl ${AUTOGEN_ARGS}
@@ -227,9 +207,20 @@ else
fi
fi
-echo "--> running ./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}"
-if ! ./configure --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then
- echo "./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !"
+if test "${BUILD_DIR}" != "" ; then
+ cd "${WORKSPACE}"
+ rm -rf "${BUILD_DIR}"
+ mkdir "${BUILD_DIR}"
+ cd "${WORKSPACE}/${BUILD_DIR}"
+ CONFIGURE=../${SOURCE_DIR}/configure
+else
+ # already in ${WORKSPACE}/${SOURCE_DIR}
+ CONFIGURE=./configure
+fi
+
+echo "--> running ${CONFIGURE} --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}"
+if ! ${CONFIGURE} --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then
+ echo "${CONFIGURE} --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !"
if test -f config.log; then
echo "config.log content :"
cat config.log
@@ -268,7 +259,7 @@ echo "--> running ompi_info"
ompi_info
echo "--> running make all in examples"
-cd "examples"
+cd "${WORKSPACE}/${SOURCE_DIR}/examples"
make ${MAKE_ARGS} all
cd ..
diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml
new file mode 100644
index 00000000000..240e3d2f101
--- /dev/null
+++ b/.github/workflows/ompi_mpi4py_asan.yaml
@@ -0,0 +1,148 @@
+name: mpi4py (ASAN)
+
+on:
+ pull_request:
+ workflow_dispatch:
+ inputs:
+ repository:
+ description: 'mpi4py repository'
+ default: 'mpi4py/mpi4py'
+ required: false
+ type: string
+ ref:
+ description: 'mpi4py branch/tag/SHA'
+ default: 'master'
+ required: false
+ type: string
+
+permissions:
+ contents: read
+
+jobs:
+ test:
+ # We need Ubuntu 24.04 (over 22.04) due to a kernel bug,
+ # see https://github.com/google/sanitizers/issues/856.
+ runs-on: ubuntu-24.04
+ timeout-minutes: 30
+ env:
+ MPI4PY_TEST_SPAWN: true
+ # disable ASAN while building
+ ASAN_OPTIONS: verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0
+ # disable leak detection
+ LSAN_OPTIONS: detect_leaks=0,exitcode=0
+
+ steps:
+ - name: Configure hostname
+ run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null
+ if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }}
+
+ - name: Install dependencies
+ run: sudo apt-get install -y -q
+ libnuma-dev libasan8
+ if: ${{ runner.os == 'Linux' }}
+
+ - name: Checkout Open MPI
+ uses: actions/checkout@v4
+ with:
+ path: mpi-build
+ submodules: recursive
+
+ - name: Bootstrap Open MPI
+ run: ./autogen.pl
+ working-directory: mpi-build
+
+ # Install into a separate directory (/opt/openmpi) so that we can
+ # bundle up that tree into an artifact to share with other jobs in
+ # this github action. Specifically don't use /usr/local, because
+ # there's a bunch of other stuff already installed in /usr/local,
+ # and we don't need to include that in our artifact.
+ - name: Configure Open MPI
+ run: ./configure
+ --enable-debug
+ --disable-dependency-tracking
+ --disable-sphinx
+ --disable-mpi-fortran
+ --disable-oshmem
+ --disable-silent-rules
+ --prefix=/opt/openmpi
+ CFLAGS="-O1 -fno-omit-frame-pointer -g -fsanitize=address"
+ LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address"
+ working-directory: mpi-build
+
+ - name: Build MPI
+ run: make -j $(nproc)
+ working-directory: mpi-build
+
+ - name: Install MPI
+ run: sudo make install
+ working-directory: mpi-build
+
+ - name: Add Open MPI to PATH
+ run: echo /opt/openmpi/bin >> $GITHUB_PATH
+
+ - name: Tweak MPI
+ run: |
+ # Tweak MPI
+ mca_params="$HOME/.openmpi/mca-params.conf"
+ mkdir -p "$(dirname "$mca_params")"
+ echo mpi_param_check = true >> "$mca_params"
+ echo mpi_show_handle_leaks = true >> "$mca_params"
+ mca_params="$HOME/.prte/mca-params.conf"
+ mkdir -p "$(dirname "$mca_params")"
+ echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params"
+
+ - name: Use Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: 3
+ architecture: x64
+
+ - name: Install Python packages (build)
+ run: python -m pip install --upgrade
+ setuptools pip wheel
+
+ - name: Install Python packages (test)
+ run: python -m pip install --upgrade
+ numpy cffi pyyaml
+
+ - name: Checkout mpi4py
+ uses: actions/checkout@v4
+ with:
+ repository: ${{ inputs.repository || 'mpi4py/mpi4py' }}
+ ref: ${{ inputs.ref }}
+
+ - name: Setting up ASAN environment
+ # LD_PRELOAD is needed to make sure ASAN is the first thing loaded
+ # as it will otherwise complain.
+ # Leak detection is currently disabled because of the size of the report.
+ # The patcher is disabled because ASAN fails if code mmaps data at fixed
+ # memory addresses, see https://github.com/open-mpi/ompi/issues/12819.
+ # ODR violation detection is disabled until #13469 is fixed.
+ run: |
+ echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV
+ echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=1 >> $GITHUB_ENV
+ echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV
+ echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV
+
+ - name: Show MPI
+ run: ompi_info --all --all
+
+ - name: Install mpi4py
+ run: python -m pip install .
+ env:
+ CFLAGS: "-O0"
+
+ - name: Test mpi4py (singleton)
+ run: python test/main.py -v -x TestExcErrhandlerNull
+ if: ${{ true }}
+ timeout-minutes: 10
+
+ - name: Test mpi4py (np=1)
+ run: mpiexec -n 1 python test/main.py -v -x TestExcErrhandlerNull
+ if: ${{ true }}
+ timeout-minutes: 10
+
+ - name: Test mpi4py (np=4)
+ run: mpiexec -n 4 python test/main.py -v -f -x TestExcErrhandlerNull
+ if: ${{ true }}
+ timeout-minutes: 10
diff --git a/.gitignore b/.gitignore
index 7ab0b99af7d..b30321da7ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -542,3 +542,5 @@ ompi/mpi/fortran/use-mpi-f08/base/*_generated.c
ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces-generated.h
ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces-generated.h
ompi/mpi/fortran/use-mpi-ignore-tkr/*_generated.F90
+
+.vscode/
diff --git a/3rd-party/openpmix b/3rd-party/openpmix
index 7704efaf865..53fce423d5d 160000
--- a/3rd-party/openpmix
+++ b/3rd-party/openpmix
@@ -1 +1 @@
-Subproject commit 7704efaf865328234e3cb1f77ff393adc971c9fe
+Subproject commit 53fce423d5d6b25798ed1f32837671dc55d0230d
diff --git a/3rd-party/prrte b/3rd-party/prrte
index 91544b8d2c5..2d9b0aaaeea 160000
--- a/3rd-party/prrte
+++ b/3rd-party/prrte
@@ -1 +1 @@
-Subproject commit 91544b8d2c5ac84585022d0edad68e38f375a917
+Subproject commit 2d9b0aaaeea49a0e7850aed95e5ace9340c7d847
diff --git a/config/ompi_fortran_check.m4 b/config/ompi_fortran_check.m4
index e479a87ac64..7fd2a790353 100644
--- a/config/ompi_fortran_check.m4
+++ b/config/ompi_fortran_check.m4
@@ -137,8 +137,8 @@ AC_DEFUN([OMPI_FORTRAN_CHECK], [
long*double*_Complex) ofc_type_kind=C_LONG_DOUBLE_COMPLEX ;;
opal_short_float_t) ofc_type_kind=C_SHORT_FLOAT ;;
opal_short_float_complex_t) ofc_type_kind=C_SHORT_FLOAT_COMPLEX ;;
- _Float128) ofc_type_kind=C__FLOAT128 ;;
- __float128) ofc_type_kind=C___FLOAT128 ;;
+ _Float128) ofc_type_kind=C_FLOAT128 ;;
+ __float128) ofc_type_kind=C_FLOAT128 ;;
*)
# Skip types like "DOUBLE PRECISION"
;;
diff --git a/config/ompi_fortran_check_real16_c_equiv.m4 b/config/ompi_fortran_check_real16_c_equiv.m4
index 85141c798b6..b9e67d9606e 100644
--- a/config/ompi_fortran_check_real16_c_equiv.m4
+++ b/config/ompi_fortran_check_real16_c_equiv.m4
@@ -61,19 +61,26 @@ AC_DEFUN([OMPI_FORTRAN_CHECK_REAL16_C_EQUIV],[
AC_MSG_RESULT([works!])],
[AC_MSG_RESULT([does not work])])
])
- # As recent Intel compilers identify as GNU we will always test for Quad support if no other tests were succesfull
+ # As recent Intel compilers identify as GNU we will always test for Quad
+    # support if no other tests were successful
AS_IF([test "$fortran_real16_happy" = "no"],
- [AC_CHECK_TYPES(_Quad)
- AS_IF([test "$ac_cv_type__Quad" = "yes"],
- [AC_MSG_CHECKING([if the compiler _Quad == REAL*16])
- CFLAGS_save="$CFLAGS"
+ [AC_CHECK_TYPES([_Quad])
+ AS_IF([test "$ac_cv_type__Quad" != "yes"],
+ [CFLAGS_save="$CFLAGS"
OPAL_FLAGS_APPEND_UNIQ([CFLAGS], ["-Qoption,cpp,--extended_float_types"])
+ # force the check as we have updated CFLAGS
+ unset ac_cv_type__Quad
+ AC_CHECK_TYPES([_Quad])
+ AS_IF([test "$ac_cv_type__Quad" != "yes"],
+ [CFLAGS="$CFLAGS_save"])
+ ])
+ AS_IF([test "$ac_cv_type__Quad" != "yes"],
+ [AC_MSG_CHECKING([if the compiler _Quad == REAL*16])
OMPI_FORTRAN_CHECK_REAL16_EQUIV_TYPE([_Quad], [q])
AS_IF([test "$fortran_real16_happy" = "yes"],
[OMPI_FORTRAN_REAL16_C_TYPE="_Quad"
AC_MSG_RESULT([works!])],
- [CFLAGS="$CFLAGS_save"
- AC_MSG_RESULT([does not work])])
+ [AC_MSG_RESULT([does not work])])
])
])
# We have to [re-]print a new message here, because
diff --git a/config/ompi_setup_mpi_fortran.m4 b/config/ompi_setup_mpi_fortran.m4
index 3474276e661..c396a2efab6 100644
--- a/config/ompi_setup_mpi_fortran.m4
+++ b/config/ompi_setup_mpi_fortran.m4
@@ -226,7 +226,7 @@ AC_DEFUN([OMPI_SETUP_MPI_FORTRAN],[
[long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t],
[16], [no])
OMPI_FORTRAN_CHECK([COMPLEX*32], [no],
- [_Float128 _Complex, long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t],
+ [_Float128 _Complex, __float128 _Complex, long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t],
[32], [no])
# Double precision complex types are not standard, but many
# compilers support it. Code should be wrapped with #ifdef
diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4
index a6bf80a1b2a..ed3a51a26e8 100644
--- a/config/opal_check_cuda.m4
+++ b/config/opal_check_cuda.m4
@@ -154,6 +154,7 @@ AC_MSG_CHECKING([if have cuda support])
if test "$opal_check_cuda_happy" = "yes"; then
AC_MSG_RESULT([yes (-I$opal_cuda_incdir)])
CUDA_SUPPORT=1
+ OMPI_HAVE_ACCELERATOR_SUPPORT=1
common_cuda_CPPFLAGS="-I$opal_cuda_incdir"
AC_SUBST([common_cuda_CPPFLAGS])
else
diff --git a/config/opal_check_rocm.m4 b/config/opal_check_rocm.m4
index 25ac54e438e..0d1e6053469 100644
--- a/config/opal_check_rocm.m4
+++ b/config/opal_check_rocm.m4
@@ -57,7 +57,8 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
AS_IF([ test "$opal_check_rocm_happy" = "yes" ],
[ OPAL_APPEND([$1_CPPFLAGS], [$rocm_CPPFLAGS])
AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support])
- ROCM_SUPPORT=1 ],
+ ROCM_SUPPORT=1
+ OMPI_HAVE_ACCELERATOR_SUPPORT=1 ],
[ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [0], [Disable ROCm support])
ROCM_SUPPORT=0 ])
diff --git a/config/opal_check_ze.m4 b/config/opal_check_ze.m4
index d1d47bb67c1..84c8dacd2df 100644
--- a/config/opal_check_ze.m4
+++ b/config/opal_check_ze.m4
@@ -56,7 +56,8 @@ AC_DEFUN([OPAL_CHECK_ZE],[
AS_IF([ test "$opal_check_ze_happy" = "yes" ],
[ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [1], [Enable Intel ZE support])
- ZE_SUPPORT=1 ],
+ ZE_SUPPORT=1
+ OMPI_HAVE_ACCELERATOR_SUPPORT=1 ],
[ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [0], [Disable Intel ZE support])
ZE_SUPPORT=0 ])
diff --git a/config/opal_mca.m4 b/config/opal_mca.m4
index cdeb935a3a3..bb51d3bc5f1 100644
--- a/config/opal_mca.m4
+++ b/config/opal_mca.m4
@@ -186,7 +186,7 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1])
else
msg=
if test -z "$enable_mca_dso"; then
- enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,btl-smcuda,rcache-gpusm,rcache-rgpusm"
+ enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze"
msg="(default)"
fi
DSO_all=0
diff --git a/configure.ac b/configure.ac
index 928f41b0415..d4276b23284 100644
--- a/configure.ac
+++ b/configure.ac
@@ -276,6 +276,7 @@ m4_ifdef([project_oshmem],
############################################################################
# Configuration options
############################################################################
+OMPI_HAVE_ACCELERATOR_SUPPORT=0
OPAL_CONFIGURE_OPTIONS
diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf
index 6a7be025a66..b1316c4b67d 100644
--- a/contrib/platform/mellanox/optimized.conf
+++ b/contrib/platform/mellanox/optimized.conf
@@ -85,8 +85,6 @@ opal_warn_on_missing_libcuda = 0
bml_r2_show_unreach_errors = 0
# alltoall algorithm selection settings for tuned coll mca
-coll_tuned_alltoall_large_msg = 250000
-coll_tuned_alltoall_min_procs = 2048
coll_tuned_alltoall_algorithm_max_requests = 8
coll_tuned_scatter_intermediate_msg = 8192
coll_tuned_scatter_large_msg = 250000
diff --git a/docs/Makefile.am b/docs/Makefile.am
index 871184eb01d..a6edc6ae045 100644
--- a/docs/Makefile.am
+++ b/docs/Makefile.am
@@ -38,7 +38,8 @@ TEXT_SOURCE_FILES = \
$(srcdir)/license/*.txt
IMAGE_SOURCE_FILES = \
$(srcdir)/openmpi_logo.png \
- $(srcdir)/installing-open-mpi/required-support-libraries-dependency-graph.png
+ $(srcdir)/installing-open-mpi/required-support-libraries-dependency-graph.png \
+ $(srcdir)/tuning-apps/collectives/images/xhc-hierarchy.svg
RST_SOURCE_FILES = \
$(srcdir)/*.rst \
$(srcdir)/release-notes/*.rst \
diff --git a/docs/tuning-apps/collectives/components.rst b/docs/tuning-apps/collectives/components.rst
index f29c202e358..921f7e12036 100644
--- a/docs/tuning-apps/collectives/components.rst
+++ b/docs/tuning-apps/collectives/components.rst
@@ -28,7 +28,9 @@ The following provides a list of components and their primary target scenario:
more details.
- ``ucc``: component using the `UCC library `_
for collective operations.
- - ``xhc``: shared memory collective component using XPMEM for data transfers.
+ - ``xhc``: shared memory collective component, employing hierarchical &
+ topology-aware algorithms, with XPMEM for data transfers. See :doc:`xhc` for
+ more details.
- ``acoll``: collective component tuned for AMD Zen architectures. See :doc:`acoll` for
more details.
- ``accelerator``: component providing host-proxy algorithms for some
diff --git a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg b/docs/tuning-apps/collectives/images/xhc-hierarchy.svg
similarity index 86%
rename from ompi/mca/coll/xhc/resources/xhc-hierarchy.svg
rename to docs/tuning-apps/collectives/images/xhc-hierarchy.svg
index c8f6d8a2da3..b4ae62a6c4f 100644
--- a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg
+++ b/docs/tuning-apps/collectives/images/xhc-hierarchy.svg
@@ -7,7 +7,7 @@
viewBox="0 0 169.571 119.89402"
version="1.1"
id="svg5"
- inkscape:version="1.2.1 (9c6d41e410, 2022-07-14, custom)"
+ inkscape:version="1.4.3 (0d15f75042, 2025-12-25)"
sodipodi:docname="xhc-hierarchy.svg"
inkscape:export-filename="../xhc-hierarchy.png"
inkscape:export-xdpi="300"
@@ -26,11 +26,11 @@
inkscape:pagecheckerboard="0"
inkscape:document-units="mm"
showgrid="false"
- inkscape:zoom="0.75290071"
- inkscape:cx="286.22632"
- inkscape:cy="274.93665"
+ inkscape:zoom="1.4452058"
+ inkscape:cx="278.16108"
+ inkscape:cy="266.39805"
inkscape:window-width="1920"
- inkscape:window-height="1018"
+ inkscape:window-height="1136"
inkscape:window-x="1920"
inkscape:window-y="0"
inkscape:window-maximized="1"
@@ -78,25 +78,6 @@
id="path-effect556"
is_visible="true"
lpeversion="1" />
-
-
-
+ transform="translate(-430.99854,-193.98109)">
+ y="193.98109" />
NUMA Level
+ y="296.00598">NUMA Level
Socket Level
+ y="259.80359">Socket Level
+ transform="translate(28.708569,27.920669)">
System Level
+ style="font-size:5.64444px;text-align:center;text-anchor:middle;stroke-width:0.264583"
+ x="524.14557"
+ y="204.60033">Node Level
+ inkscape:original-d="m 561.29231,236.42783 c -10.38789,-6.52565 -20.67275,-12.94489 -31.00982,-19.41762"
+ transform="translate(0,-1.0583333)" />
+ y="241.07695" />
+ y="241.07695" />
+ transform="translate(76.684113,23.158255)">
+ y="240.44742" />
+ y="240.44742" />
+ transform="translate(118.68254,23.158255)">
+ transform="translate(150.43255,23.158255)">
+ inkscape:original-d="m 487.56018,236.95873 c 10.17386,-6.63057 20.2468,-13.15301 30.3709,-19.72977"
+ transform="translate(0,-1.0583333)" />
Cores
+ style="font-size:4.93889px;stroke-width:0.264583"
+ x="-163.80605"
+ y="497.17615">Cores
@@ -768,23 +751,23 @@
+ transform="matrix(-1,0,0,1,924.11737,0.52916667)">
NUMA 0Leader
+ transform="translate(44.916471,23.158255)">
P0
+ y="279.12918">P0
P1
+ y="279.12741">P1
P2
+ y="279.12741">P2
P3
+ y="279.12741">P3
P4
+ y="279.12918">P4
P5
+ y="279.12741">P5
P6
+ y="279.12741">P6
P7
+ y="279.12741">P7
P8
+ y="279.12921">P8
P9
+ y="279.12744">P9
10
+ y="279.12744">10
11
+ y="279.12744">11
12
+ y="279.12921">12
13
+ y="279.12744">13
14
+ y="279.12744">14
15
+ y="279.12744">15
+ transform="matrix(-1,0,0,1,944.89717,-1.0583333)">
+ transform="translate(84.666671)">
+ id="g8800">
+ transform="translate(-83.60834,-0.52916667)">
+ transform="rotate(180,501.4769,222.70799)">
+ transform="matrix(1,0,0,-1,45.394312,445.41596)">
+ transform="matrix(-1,0,0,1,955.86739,0.52916667)">
NUMA 1Leader
+ transform="matrix(-1,0,0,1,1029.9509,0.52916667)">
NUMA 3Leader
[...]
+
+Main Features
+-------------
+
+Hierarchy
+~~~~~~~~~
+
+XHC constructs an *n*-level hierarchy (i.e. no limitation on number of levels),
+based on intra-node topological features. Rank/process locality information
+originates from Hwloc, and is obtained through Open MPI's internal structures.
+
+The following topological features can currently be defined:
+
+ * NUMA node
+ * CPU Socket
+ * L1/L2/L3 cache
+ * Hwthread/core
+ * Node (all ranks *are* in same node -> flat hierarchy)
+
+An example of a 3-level XHC hierarchy (``numa,socket`` configuration):
+
+.. image:: images/xhc-hierarchy.svg
+ :width: 450px
+
+Furthermore, support for virtual/user-defined hierarchies is available, to
+allow for even finer control and custom experiments.
+
+**Pipelining** is seamlessly applied across all levels of the hierarchy, to
+minimize hierarchy-induced overheads, and to allow for interleaving of
+operations in certain collectives (e.g. reduce+bcast in allreduce).
+
+Single-copy data transfers
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+XHC supports data transfers between MPI ranks using a single copy, through Open
+MPI's ``opal/smsc`` (shared-memory-single-copy) framework. Despite the
+component's name, XHC actually also supports additional single-copy mechanisms
+in some collectives, though XPMEM is highly recommended.
+
+ * Bcast: XPMEM, CMA, KNEM
+ * Allreduce/Reduce: XPMEM
+ * Barrier: *(irrelevant)*
+
+In XPMEM mode, application buffers are attached on the fly the first time they
+appear, and are saved in ``smsc/xpmem``'s internal registration cache for
+future uses.
+
+Shared-memory data transfers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+XHC also supports data transfers using copy-in-copy-out (CICO) over shared
+memory. Copy-in-copy-out is always used for small messages, with automatic
+switching to single-copy for large ones. All primitives support this mode,
+regardless of XPMEM or SMSC presence, as long as the size of the message is
+below the threshold.
+
+Inline data transfers
+~~~~~~~~~~~~~~~~~~~~~
+
+For especially small messages, the payload data is inlined in the same cache
+line as the control data. This achieves exceptionally low latency in such
+messages. Supported in all primitives, regardless of XPMEM or SMSC presence.
+
+Synchronization
+~~~~~~~~~~~~~~~
+
+XHC uses **lock-free** synchronization, using the single-writer paradigm and lightweight *read* or *write* memory barriers wherever appropriate.
+
+Multi-node with HAN
+-------------------
+
+Even though ``xhc`` only works over shared memory, it may also be utilized in
+multi-node environments, through ``coll/han``. HAN is already the default
+component in multi-node runs, so all that's needed is to define ``xhc`` as the
+component to be used for the intra-node phase:
+
+.. code-block:: sh
+
+ $ mpirun --mca coll_han_bcast_low_module 2 --mca coll_han_reduce_low_module 2 \
+ --mca coll_han_allreduce_low_module 2
+
+.. _mca-params:
+
+MCA Parameters
+--------------
+
+Basic
+~~~~~
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 10 70
+
+ * - Parameter
+ - Default
+ - Description
+
+ * - coll_xhc_priority
+ - 0
+ - The priority of the component. Set it to a value higher than other
+ components to enable xhc.
+
+Main
+~~~~
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 20 60
+
+ * - Parameter
+ - Default
+ - Description
+
+ * - coll_xhc_hierarchy
+ - *unset*
+ - A comma separated list of topological features to which XHC's hierarchy
+ should be sensitive. This is a hint -- xhc will automatically: disregard
+ features that don't exist in the system, or that don't further segment
+ the ranks (e.g. ``numa`` was specified, but all ranks are in the same
+ NUMA node); re-order the list to match the system's hierarchy; add an
+ extra top level that's common to all ranks. This parameter applies to
+ all primitives, and is mutually exclusive with the primitive-specific
+ ones below.
+
+ This parameter also supports the use of special modifiers for *virtual
+ hierarchies*. Check ``xhc_component_parse_hierarchy()`` for further
+ explanation and syntax.
+
+ * - coll_xhc_chunk_size
+ - *unset*
+ - The chunk size for the pipelining. Data is processed in this-much sized
+ pieces at once. Applies to all primitives -- mutually exclusive with
+ primitive-specific parameters.
+
+ * - coll_xhc_cico_max
+ - *unset*
+ - The max size up to which to use copy-in-copy-out. Single copy will be
+ used for messages above this size. Applies to all primitives -- mutually
+ exclusive with primitive-specific parameters.
+
+ * - coll_xhc__hierarchy
+ - bcast/barrier: ``numa,socket``
+ (all)reduce: ``l3,numa,socket``
+ - Topological features to consider for XHC's hierarchy, specifially for
+ this primitive. Mutually exclusive with the respective non-specific
+ parameter.
+
+ * - coll_xhc__chunk_size
+ - 16K
+ - Pipeline chunk size, specifically for this primitive. Mutually exclusive
+ with the non-specific parameter.
+
+ * - coll_xhc__cico_max
+ - bcast: ``256``
+ (all)reduce: ``4K``
+ - Max size for copy-in-copy-out transfers, specifically for this
+ primitive. Mutually exclusive with the non-specific parameter.
+
+Advanced
+~~~~~~~~
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 20 60
+
+ * - Parameter
+ - Default
+ - Description
+
+ * - coll_xhc__root
+ - 0
+ - Internal root rank, for either of these operations.
+
+ * - coll_xhc_uniforms_chunks
+ - true
+ - Whether to dynamically adjust (decrease) the chunk size in reduction
+ primitives, so that all ranks will perform equal work, depending on
+ the message size.
+
+ * - coll_xhc_uniforms_chunks_min
+ - 4K
+ - Minimum allowed value for the automatically decreased chunk size in
+ reduction primitives.
+
+ * - coll_xhc_reduce_load_balance
+ - top,first
+ - Controls load balancing features in reduction primitives. With no such
+ features enabled, leader ranks don't perform any reduction work, on the
+ levels on which they are leaders. Add ``top`` to have the root perform
+ reductions on the top-most level of the hierarchy, as if a common rank.
+ Add ``first``, to have all leaders reduce a single chunk, at the
+ beginning of the operation as if they weren't leaders. Add ``all`` to
+ have leaders always perform reductions, even on the levels on which they
+ are leaders (not recommended).
+
+ * - coll_xhc_dynamic_reduce
+ - non-float
+ - Controls support for out-of-order reduction (rank wise), which allows
+ temporarily skipping a peer that's not yet ready. The default value only
+ enables the feature for non-float types, to avoid reproducibility issues
+ with floats. Set to ``disabled`` or ``all`` to turn off or on,
+ respectively, for all types.
+
+ * - coll_xhc_dynamic_leader
+ - false
+ - Dynamically elect the first rank from each hierarchy group to join the
+ collective as its leader, in broadcast. Introduces an atomic
+ compare-exchange per each call, when enabled.
+
+Other
+~~~~~
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 20 60
+
+ * - Parameter
+ - Default
+ - Description
+
+ * - coll_xhc_shmem_backing
+ - /dev/shm
+ - Backing directory for shmem files.
+
+ * - coll_xhc_memcpy_chunk_size
+ - 256K
+ - Break up large memcpy calls to smaller ones, using this chunk size.
+ Will actually attempt to mirror the value of ``smsc/xpmem``'s respective
+ parameter at run-time.
+
+Debug
+~~~~~
+
+.. list-table::
+ :header-rows: 1
+ :widths: 25 15 60
+
+ * - Parameter
+ - Default
+ - Description
+
+ * - coll_xhc_print_info
+ - *none*
+ - Print information about the component's configuration, and its
+ constructed hierarchies. Takes a comma delimited list of: the name of
+ the collective primitive about which to print information; ``config``
+ to print the configuration; ``all`` to print everything; ``dot`` along
+ with the name of a collective primitive to print its hierarchy in DOT
+ format.
+
+Limitations
+-----------
+
+* **Heterogeneity**: XHC does not support nodes with non-uniform
+ datatype representations across ranks (Open MPI's ``proc_arch``).
+
+* **Non-commutative** operators are not currently supported in
+ reduction collectives.
+
+* **Derived datatypes** are not yet supported.
+
+* The Reduce implementation only supports rank 0 as the root, and will
+ automatically fall back to another component in other scenarios. Work in
+ progress.
+
+Other resources
+---------------
+
+All things XHC landing page: https://github.com/CARV-ICS-FORTH/XHC-OpenMPI
+
+Publications
+~~~~~~~~~~~~
+
+.. **Publications**
+
+| **A framework for hierarchical single-copy MPI collectives on multicore nodes**
+| *George Katevenis, Manolis Ploumidis, and Manolis Marazakis*
+| Cluster 2022, Heidelberg, Germany
+| https://ieeexplore.ieee.org/document/9912729
+
+| **Impact of Cache Coherence on the Performance of Shared-Memory based MPI Primitives: A Case Study for Broadcast on Intel Xeon Scalable Processors**
+| *George Katevenis, Manolis Ploumidis, and Manolis Marazakis*
+| ICPP 2023, Salt Lake City, Utah, USA
+| https://dl.acm.org/doi/10.1145/3605573.3605616
diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c
index be99de913ab..ddf1657b9ab 100644
--- a/ompi/communicator/comm_cid.c
+++ b/ompi/communicator/comm_cid.c
@@ -24,7 +24,7 @@
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
- * Copyright (c) 2020-2025 Triad National Security, LLC. All rights
+ * Copyright (c) 2020-2026 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@@ -1094,7 +1094,7 @@ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uin
}
if (val->type != PMIX_SIZE) {
- OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch"));
+ OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch - %s", PMIx_Value_string(val)));
rc = OMPI_ERR_TYPE_MISMATCH;
goto done;
}
diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in
index e06865b182f..1422695ea37 100644
--- a/ompi/include/mpi.h.in
+++ b/ompi/include/mpi.h.in
@@ -764,6 +764,7 @@ enum {
#define MPI_ERR_SESSION 78
#define MPI_ERR_VALUE_TOO_LARGE 79
#define MPI_ERR_ERRHANDLER 80
+#define MPI_ERR_NOTIFY_IDX 81
/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined
MPI_ERR_ code. Set the last code to allow some room for adding
@@ -1917,6 +1918,14 @@ OMPI_DECLSPEC int MPI_Get_c(void *origin_addr, MPI_Count origin_count,
MPI_Datatype origin_datatype, int target_rank,
MPI_Aint target_disp, MPI_Count target_count,
MPI_Datatype target_datatype, MPI_Win win);
+OMPI_DECLSPEC int MPI_Get_notify(void *origin_addr, int origin_count,
+ MPI_Datatype origin_datatype, int target_rank,
+ MPI_Aint target_disp, int target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
+OMPI_DECLSPEC int MPI_Get_notify_c(void *origin_addr, MPI_Count origin_count,
+ MPI_Datatype origin_datatype, int target_rank,
+ MPI_Aint target_disp, MPI_Count target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
OMPI_DECLSPEC int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
void *result_addr, int result_count, MPI_Datatype result_datatype,
int target_rank, MPI_Aint target_disp, int target_count,
@@ -2180,6 +2189,12 @@ OMPI_DECLSPEC int MPI_Put(const void *origin_addr, int origin_count, MPI_Dataty
OMPI_DECLSPEC int MPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
int target_rank, MPI_Aint target_disp, MPI_Count target_count,
MPI_Datatype target_datatype, MPI_Win win);
+OMPI_DECLSPEC int MPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+ int target_rank, MPI_Aint target_disp, int target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
+OMPI_DECLSPEC int MPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+ int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
OMPI_DECLSPEC int MPI_Query_thread(int *provided);
OMPI_DECLSPEC int MPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
int target_rank, MPI_Aint target_disp, int target_count,
@@ -2604,6 +2619,8 @@ OMPI_DECLSPEC int MPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandle
OMPI_DECLSPEC int MPI_Win_get_group(MPI_Win win, MPI_Group *group);
OMPI_DECLSPEC int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used);
OMPI_DECLSPEC int MPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen);
+OMPI_DECLSPEC int MPI_Win_get_notify_value(MPI_Win win, int notification_idx, MPI_Count *value);
+OMPI_DECLSPEC int MPI_Win_reset_notify_value(MPI_Win win, int notification_idx, MPI_Count *value);
OMPI_DECLSPEC int MPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win);
OMPI_DECLSPEC int MPI_Win_lock_all(int mpi_assert, MPI_Win win);
OMPI_DECLSPEC int MPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win);
@@ -3091,6 +3108,14 @@ OMPI_DECLSPEC int PMPI_Get_c(void *origin_addr, MPI_Count origin_count,
MPI_Datatype origin_datatype, int target_rank,
MPI_Aint target_disp, MPI_Count target_count,
MPI_Datatype target_datatype, MPI_Win win);
+OMPI_DECLSPEC int PMPI_Get_notify(void *origin_addr, int origin_count,
+ MPI_Datatype origin_datatype, int target_rank,
+ MPI_Aint target_disp, int target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
+OMPI_DECLSPEC int PMPI_Get_notify_c(void *origin_addr, MPI_Count origin_count,
+ MPI_Datatype origin_datatype, int target_rank,
+ MPI_Aint target_disp, MPI_Count target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
OMPI_DECLSPEC int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
void *result_addr, int result_count, MPI_Datatype result_datatype,
int target_rank, MPI_Aint target_disp, int target_count,
@@ -3354,6 +3379,12 @@ OMPI_DECLSPEC int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datat
OMPI_DECLSPEC int PMPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
int target_rank, MPI_Aint target_disp, MPI_Count target_count,
MPI_Datatype target_datatype, MPI_Win win);
+OMPI_DECLSPEC int PMPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+ int target_rank, MPI_Aint target_disp, int target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
+OMPI_DECLSPEC int PMPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+ int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+ MPI_Datatype target_datatype, int notification_idx, MPI_Win win);
OMPI_DECLSPEC int PMPI_Query_thread(int *provided);
OMPI_DECLSPEC int PMPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
int target_rank, MPI_Aint target_disp, int target_count,
@@ -3778,6 +3809,8 @@ OMPI_DECLSPEC int PMPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandl
OMPI_DECLSPEC int PMPI_Win_get_group(MPI_Win win, MPI_Group *group);
OMPI_DECLSPEC int PMPI_Win_get_info(MPI_Win win, MPI_Info *info_used);
OMPI_DECLSPEC int PMPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen);
+OMPI_DECLSPEC int PMPI_Win_get_notify_value(MPI_Win win, int notification_idx, MPI_Count *value);
+OMPI_DECLSPEC int PMPI_Win_reset_notify_value(MPI_Win win, int notification_idx, MPI_Count *value);
OMPI_DECLSPEC int PMPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win);
OMPI_DECLSPEC int PMPI_Win_lock_all(int mpi_assert, MPI_Win win);
OMPI_DECLSPEC int PMPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win);
diff --git a/ompi/include/mpif-values.py b/ompi/include/mpif-values.py
index 53159d5d8dd..b74fbcbaf1f 100755
--- a/ompi/include/mpif-values.py
+++ b/ompi/include/mpif-values.py
@@ -301,6 +301,7 @@
'MPI_ERR_SESSION': 78,
'MPI_ERR_VALUE_TOO_LARGE': 79,
'MPI_ERR_ERRHANDLER': 80,
+ 'MPI_ERR_NOTIFY_IDX': 81,
'MPI_ERR_LASTCODE': 92,
'MPI_IDENT': 0,
'MPI_CONGRUENT': 1,
diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c
index bd686d2bab2..6d50d32ffb2 100644
--- a/ompi/instance/instance.c
+++ b/ompi/instance/instance.c
@@ -8,6 +8,7 @@
* reserved.
* Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2026      Nanook Consulting.  All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -586,11 +587,16 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
- if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
- fence_release,
- (void*)&active))) {
- ret = opal_pmix_convert_status(rc);
- return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
+ rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active);
+ if (PMIX_SUCCESS != rc) {
+ active = false;
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ // can return operation_succeeded if atomically completed
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+ return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
+ }
}
}
} else {
@@ -602,12 +608,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
- if( PMIX_SUCCESS != rc) {
- ret = opal_pmix_convert_status(rc);
- return ompi_instance_print_error ("PMIx_Fence() failed", ret);
+ if (PMIX_SUCCESS != rc) {
+ active = false;
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ // can return operation_succeeded if atomically completed
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+                return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
+ }
+ } else {
+ /* cannot just wait on thread as we need to call opal_progress */
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
- /* cannot just wait on thread as we need to call opal_progress */
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}
@@ -748,7 +761,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
* we have to wait here for it to complete. However, there
* is no reason to do two barriers! */
if (background_fence) {
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+ if (active) {
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+ }
} else if (!ompi_async_mpi_init) {
/* wait for everyone to reach this point - this is a hard
* barrier requirement at this time, though we hope to relax
@@ -757,12 +772,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
- if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
- fence_release, (void*)&active))) {
- ret = opal_pmix_convert_status(rc);
- return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
+ rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
+ if (PMIX_SUCCESS != rc) {
+ active = false;
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ // can return operation_succeeded if atomically completed
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+                    return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
+ }
+ } else {
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}
diff --git a/ompi/mca/coll/accelerator/configure.m4 b/ompi/mca/coll/accelerator/configure.m4
new file mode 100644
index 00000000000..057db874435
--- /dev/null
+++ b/ompi/mca/coll/accelerator/configure.m4
@@ -0,0 +1,27 @@
+# Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# If any accelerators have been discovered, then build support for the
+# accelerator collective component.
+#
+AC_DEFUN([MCA_ompi_coll_accelerator_CONFIG],[
+
+ AC_CONFIG_FILES([ompi/mca/coll/accelerator/Makefile])
+
+ # This component shall be configured only after the accelerator discovery
+ # has been completed. This discovery is part of the OPAL accelerator framework.
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)])
+ AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+
+])dnl
diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c
index 69da3cb49cf..28fc3c62c6a 100644
--- a/ompi/mca/coll/acoll/coll_acoll_reduce.c
+++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c
@@ -66,7 +66,7 @@ static inline int coll_acoll_reduce_topo(const void *sbuf, void *rbuf, size_t co
int use_socket = (0 == acoll_module->use_socket) ? 1 : acoll_module->use_socket;
tmp_sbuf = (char *) sbuf;
- if ((MPI_IN_PLACE == sbuf) && (rank == root)) {
+ if (MPI_IN_PLACE == sbuf) {
tmp_sbuf = (char *) rbuf;
}
diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c
index 15bd586901a..07616285616 100644
--- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c
+++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c
@@ -48,7 +48,7 @@ int ompi_coll_adapt_ireduce_register(void)
mca_coll_adapt_component.adapt_ireduce_algorithm = 1;
}
- mca_coll_adapt_component.adapt_ireduce_segment_size = 163740;
+ mca_coll_adapt_component.adapt_ireduce_segment_size = 524288;
mca_base_component_var_register(c, "reduce_segment_size",
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
diff --git a/ompi/mca/coll/ftagree/coll_ftagree_component.c b/ompi/mca/coll/ftagree/coll_ftagree_component.c
index 97e9ca4cee7..8a733ad3357 100644
--- a/ompi/mca/coll/ftagree/coll_ftagree_component.c
+++ b/ompi/mca/coll/ftagree/coll_ftagree_component.c
@@ -38,6 +38,8 @@ int mca_coll_ftagree_era_rebuild = 0;
double mca_coll_ftagree_debug_inject_proba = 0.0;
#endif
+static int mca_coll_ft_agreement;
+
/*
* Local function
*/
@@ -92,8 +94,6 @@ ftagree_close(void)
static int
ftagree_register(void)
{
- int value;
-
/* Use a low priority, but allow other components to be lower */
mca_coll_ftagree_priority = 30;
(void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version,
@@ -103,15 +103,15 @@ ftagree_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_coll_ftagree_priority);
- if( ompi_ftmpi_enabled ) value = 1;
- else value = 0; /* NOFT: do not initialize ERA */
+ if( ompi_ftmpi_enabled ) mca_coll_ft_agreement = 1;
+ else mca_coll_ft_agreement = 0; /* NOFT: do not initialize ERA */
(void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version,
"agreement", "Agreement algorithm 0: Allreduce (NOT FAULT TOLERANT); 1: Early Returning Consensus (era); 2: Early Terminating Consensus (eta)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
- &value);
- switch(value) {
+ &mca_coll_ft_agreement);
+ switch(mca_coll_ft_agreement) {
case 0:
mca_coll_ftagree_algorithm = COLL_FTAGREE_NOFT;
opal_output_verbose(6, ompi_ftmpi_output_handle,
diff --git a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c
index 9450c443349..f28c36a3d16 100644
--- a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c
+++ b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c
@@ -2956,6 +2956,15 @@ int mca_coll_ftagree_era_finalize(void)
"%s ftagree:agreement (ERA) GC: %lu passed agreements remain in the passed agreements hash table\n",
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
opal_hash_table_get_size(&era_passed_agreements)));
+ /* Some agreements can remain in the era_passed_agreements table until
+ * finalize; notably, the last agreement in a communicator that has been
+ * freed.
+ *
+ * The commit that added this comment also removed the (unused) function
+ * mca_coll_ftagree_era_free_comm that could enforce purging that table
+ * during comm_free, at the cost of making comm_free hard synchronizing;
+ * this was deemed too disruptive for the small memory usage gain.
+ */
for( rc = opal_hash_table_get_first_key_uint64(&era_passed_agreements, &key64, &value, &node);
OPAL_SUCCESS == rc;
rc = opal_hash_table_get_next_key_uint64(&era_passed_agreements, &key64, &value, node, &node) ) {
@@ -3368,46 +3377,3 @@ int mca_coll_ftagree_iera_intra(void *contrib,
return OMPI_SUCCESS;
}
-#if 0
-// Per @bosilca and @jsquyres discussion 29 Apr 2021: there is
-// probably a memory leak in MPI_FINALIZE right now, because this
-// function does not appear to be being called from anywhere.
-// @bosilca's team is looking into it.
-int mca_coll_ftagree_era_free_comm(ompi_communicator_t* comm,
- mca_coll_base_module_t *module)
-{
- ompi_group_t* acked;
- era_identifier_t aid;
- int rc;
-
- OPAL_OUTPUT_VERBOSE((4, ompi_ftmpi_output_handle,
- "%s ftagree:agreement (ERA) Freeing Communicator (%d.%d).\n",
- OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
- comm->c_contextid,
- comm->c_epoch));
-
- opal_mutex_lock(&ompi_group_afp_mutex);
- ompi_group_intersection(comm->c_remote_group, ompi_group_all_failed_procs, &acked);
- opal_mutex_unlock(&ompi_group_afp_mutex);
- do {
- rc = mca_coll_ftagree_era_intra(NULL,
- 0,
- &ompi_mpi_int.dt,
- &ompi_mpi_op_band.op,
- &acked, true,
- comm,
- comm->c_coll->coll_agree_module);
- } while(rc != MPI_SUCCESS);
- OBJ_RELEASE(acked);
-
- aid.ERAID_FIELDS.contextid = comm->c_contextid.cid_sub.u64;
- aid.ERAID_FIELDS.epoch = comm->c_epoch;
-
- opal_mutex_lock(&era_mutex);
- /** We don't need to set aid.ERAID_FIELDS.agreementid to collect all of them */
- era_collect_passed_agreements(aid, 0, (uint16_t)-1);
- opal_mutex_unlock(&era_mutex);
-
- return OMPI_SUCCESS;
-}
-#endif
diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c
index 1d78bf87158..7ae17b9e4f8 100644
--- a/ompi/mca/coll/han/coll_han_component.c
+++ b/ompi/mca/coll/han/coll_han_component.c
@@ -301,7 +301,7 @@ static int han_register(void)
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL, &cs->han_output_verbose);
- cs->han_bcast_segsize = 65536;
+ cs->han_bcast_segsize = 524288;
(void) mca_base_component_var_register(c, "bcast_segsize",
"segment size for bcast",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
@@ -321,7 +321,7 @@ static int han_register(void)
&cs->han_bcast_low_module,
&cs->han_op_module_name.bcast.han_op_low_module_name);
- cs->han_reduce_segsize = 65536;
+ cs->han_reduce_segsize = 524288;
(void) mca_base_component_var_register(c, "reduce_segsize",
"segment size for reduce",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
@@ -340,7 +340,7 @@ static int han_register(void)
OPAL_INFO_LVL_9, &cs->han_reduce_low_module,
&cs->han_op_module_name.reduce.han_op_low_module_name);
- cs->han_allreduce_segsize = 65536;
+ cs->han_allreduce_segsize = 524288;
(void) mca_base_component_var_register(c, "allreduce_segsize",
"segment size for allreduce",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c
index e3482116c84..9dca14bcc55 100644
--- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c
@@ -34,6 +34,8 @@ static int coll_tuned_alltoall_segment_size = 0;
static int coll_tuned_alltoall_tree_fanout;
static int coll_tuned_alltoall_chain_fanout;
+static int deprecated_mca_params = -1;
+
/* valid values for coll_tuned_alltoall_forced_algorithm */
static const mca_base_var_enum_value_t alltoall_algorithms[] = {
{0, "ignore"},
@@ -119,7 +121,6 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
MCA_BASE_VAR_SCOPE_ALL,
&coll_tuned_alltoall_chain_fanout);
- int deprecated_mca_params = -1;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_large_msg",
"use pairwise exchange algorithm for messages larger than this value",
diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c
index d8dbb7959e4..6f5a8c57987 100644
--- a/ompi/mca/coll/tuned/coll_tuned_component.c
+++ b/ompi/mca/coll/tuned/coll_tuned_component.c
@@ -71,6 +71,8 @@ int ompi_coll_tuned_scatter_large_msg = 0;
int ompi_coll_tuned_scatter_min_procs = 0;
int ompi_coll_tuned_scatter_blocking_send_ratio = 0;
+static int deprecated_mca_params = -1;
+
/* forced algorithm variables */
/* indices for the MCA parameters */
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};
@@ -161,7 +163,6 @@ static int tuned_register(void)
MCA_BASE_VAR_SCOPE_ALL,
&ompi_coll_tuned_init_chain_fanout);
- int deprecated_mca_params = -1;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_small_msg",
"threshold (if supported) to decide if small MSGs alltoall algorithm will be used",
diff --git a/ompi/mca/coll/xhc/README.md b/ompi/mca/coll/xhc/README.md
index 213062a5edc..438fd712507 100644
--- a/ompi/mca/coll/xhc/README.md
+++ b/ompi/mca/coll/xhc/README.md
@@ -1,371 +1,8 @@
-# XHC: XPMEM-based Hierarchical Collectives
+# XPMEM Hierarchical Collectives (XHC)
XHC implements hierarchical & topology-aware intra-node MPI collectives,
-utilizing XPMEM for efficient shared address space memory access between
-processes.
+(mainly) utilizing XPMEM for efficient shared address space data transfers
+between MPI ranks.
-## Main features
-
-* XHC constructs an **n-level hierarchy** (i.e. no algorithmic limitation on
-level count), based on intra-node topological features. Rank/process locality
-information is known thanks to Hwloc, and is obtained from Open MPI's
-integrated book-keeping.
-
- Topological features that can currently be defined:
-
- - NUMA node
- - CPU Socket
- - L1/L2/L3 cache
- - Hwthread/core
- - Node (all ranks *are* in same node --> flat, no hierarchy at all)
-
- Example of a 3-level XHC hierarchy (numa+socket+node configuration):
-
- 
-
- Furthermore, support for custom virtual user-defined hierarchies is
- available, to allow fine-grained control over the communication pattern.
-
-* **Single-copy** transportation
-
- - Supported through integration with Open MPI's `opal/smsc`
- (shared-memory-single-copy) framework. Selecting `smsc/xpmem` is highly
- recommended.
-
- - Bcast support: XPMEM, CMA, KNEM
- - Allreduce/Reduce support: XPMEM
- - Barrier support: *(irrelevant)*
-
- - Application buffers are attached on the fly the first time they appear,
- saved on and recovered from the registration cache in subsequent
- appearances. (assuming smsc/xpmem)
-
-* **Copy-in-copy-out (CICO)** transportation
-
- - Through shared memory buffers that remain active throughout the
- component's lifetime.
-
- - Switchover with single-copy at configurable message size.
-
- - Supported in all ops, regardless of smsc support or XPMEM presence (up to
- maximum allowed message size).
-
-* **Inline** transportation
-
- - For especially small messages, payload data is inlined in the same cache
- line as the control data.
-
- - Supported in all ops, regardless of smsc support or XPMEM presence (up to
- maximum allowed message size).
-
-* Data-wise **pipelining** across all levels of the hierarchy. Allows for
-lowering hierarchy-induced start-up overheads, and interleaving of operations
-in applicable operations (e.g. reduce+bcast in allreduce).
-
-* **Lock-free** single-writer synchronization, with appropriate cache-line
-separation where necessary. Consistency ensured via lightweight *read* or
-*write* memory barriers.
-
-## Configuration options -- MCA params
-
-XHC can be customized via a number of standard Open MPI MCA parameters, though
-defaults that should satisfy a wide number of systems are in place.
-
-The available parameters (also found in `coll_xhc_component.c`):
-
-#### *(prepend with "coll_xhc_")*
-
-* **priority** (default `0`): The priority of the coll/xhc component, used
-during the component selection process.
-
-* **print_info** (default `false`): Print information about XHC's generated
-hierarchy and its configuration.
-
-* **shmem_backing** (default `/dev/shm`): Backing directory for shmem files
-used for XHC's synchronization fields and CICO buffers.
-
-* **dynamic_leader** (default `false`): Enables the feature that dynamically
-elects an XHC-communicator leader at each collective (currently only applicable
-for bcast).
-
-* **dynamic_reduce** (default `1`=`non-float`): Enables support for
-out-of-order reduction. Ranks fetch data to reduce from multiple peers;
-out-of-order reduction allows them to temporarily skip a peer when the expected
-data is not yet prepared, instead of stalling. The default value auto-enables
-it when the data is of non-float type; setting to `2`=`enabled for all types`,
-might/will harm reproducibility of reductions with float types.
-
-* **reduce_load_balance** (default `0`=`non-leader`): Controls the
-leader-to-member load balancing mode in reductions. Under `non-leader`, the
-members, and not the leaders, perform reductions. With `top-level`, all members
-as well as the leader of the top-most level perform reductions. With
-`first-chunk`, leaders perform a single reduction on each level for a single
-chunk at the beginning of the operation. `top+first` combines `top-level` and
-`first-chunk`. Finally, with `all`, all ranks perform reductions equally.
-
-* **hierarchy** (default `"numa,socket"`): A comma separated list of
-topological feature to which XHC's hierarchy-building algorithm should be
-sensitive. `ompi_info` reports the possible values for the parameter.
-
- - In some ways, this is "just" a suggestion. The resulting hierarchy may
- not exactly match the requested one. Reasons that this will occur:
-
- - A requested topological feature does not effectively segment the set
- of ranks. (eg. `numa` was specified, but all ranks reside in the same
- NUMA node)
-
- - No feature that all ranks have in common was provided. This a more
- intrinsic detail, that you probably don't need to be aware of, but you
- might come across if eg. you investigate the output of `print_info`. An
- additional level will automatically be added in this case, no need to
- worry about it.
-
- For all intents and purposes, a hierarchy of `numa,socket` is
- interpreted as "segment the ranks according to NUMA node locality,
- and then further segment them according to CPU socket locality".
-
- - The provided features will automatically be re-ordered when their
- order does not match their order in the physical system. (unless a
- virtual hierarchy is present in the list)
-
- - *Virtual Hierarchies*: The string may alternatively also contain "rank
- lists" which specify exactly which ranks to group together, as well as some
- other special modifiers. See
- `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further
- explanation as well as syntax information.
-
-* **chunk_size** (default `16K`): The chunk size for the pipelining process.
-Data is processed (eg broadcast, reduced) in this-much sized pieces at once.
-
- - It's possible to have a different chunk size for each level of the
- hierarchy, achieved via providing a comma-separated list of sizes (eg.
- `"16K,16K,128K"`) instead of single one. The sizes in this list's *DO NOT*
- correspond to the items on hierarchy list; the hierarchy keys might be
- re-ordered or reduced to match the system, but the chunk sizes will be
- consumed in the order they are given, left-to-right -> bottom-to-top.
-
-* **uniform_chunks** (default `true`): Automatically optimize the chunk size
-in reduction collectives, according to the message size, so that all members
-will perform equal work.
-
-* **uniform_chunks_min** (default `1K`): The lowest allowed value for the chunk
-size when uniform chunks are enabled.
-
-* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will
-be used for messages of *cico_max* or less bytes.
-
-*(Removed Parameters)*
-
-* **rcache_max**, **rcache_max_global** *(REMOVED with shift to opal/smsc)*:
-Limit to number of attachments that the registration cache should hold.
-
- - A case can be made about their usefulness. If desired, shall be
- re-implemented at smsc-level.
-
-## Limitations
-
-- *Intra-node support only*
- - Define XHC as `coll/HAN`'s intra-node component to reap its benefits in
- multi-node runs.
-
-- **Heterogeneity**: XHC does not support nodes with non-uniform (rank-wise)
-datatype representations. (determined according to Open MPI's `proc_arch`)
-
-- **Non-commutative** operators are not supported by XHC's reduction
-collectives. In past versions, they were, but only with a flat hierarchy; this
-could make a return at some point.
-
-- **Derived Datatypes** are currently not supported.
-
-- XHC's Reduce currently only supports rank 0 as the root, and will
-automatically fall back to another component for other cases.
-
-## Building
-
-This section describes how to compile the XHC component.
-
-XPMEM support in Open MPI is required to reap the full benefits of XHC.
-
-- The XHC component will build and work without XPMEM support, but for large
-messages (i.e. ones above the CICO threshold) Allreduce/Reduce will be
-disabled, and Broadcast will fall-back to less efficient mechanisms.
-
-- XPMEM can be obtained from , and then
-compiled like a common kernel module. You might need to manually point Open
-MPI's configure script to XPMEM's installation location, via the
-`--with-xpmem=` parameter.
-
-- At run-time, you will need to insert the kernel module and obtain proper
-access rights to `/dev/xpmem`.
-
-Apart from instructing Open MPI to include XPMEM support, the rest of the build
-process is standard. General information on building Open MPI can be found in
-its documentation.
-
-
-
-
-
-## Running
-
-General information on running Open MPI jobs can be found here:
-
-
-
-`mpirun`'s man page will also be useful:
-
-
-In order for the XHC component to be chosen, its priority must be manually set
-higher than other collectives components that implement the same primitives,
-via the `coll_xhc_priority` MCA param.
-
- - Example: `--mca coll_xhc_priority 100`
-
-* Most likely, you will also want the `--bind-to core` param. Otherwise, the
-reported process localities might be too general, preventing XHC from correctly
-segmenting the system. (MCA `coll_xhc_print_info` will report the generated
-hierarchy if you wish to experiment)
-
-### Tuning
-
-* Optional: You might wish to manually specify the topological features that
-XHC's hierarchy should conform to. The default is `numa,socket`, which will
-group the processes according to NUMA locality and then further group them
-according to socket locality. See the `coll_xhc_hierarchy` param.
-
- - Example: `--mca coll_xhc_hierarchy numa,socket`
- - Example: `--mca coll_xhc_hierarchy numa`
- - Example: `--mca coll_xhc_hierarchy flat`
-
- In some systems, small-message Broadcast or the Barrier operation might
- perform better with a flat tree instead of a hierarchical one. Currently,
- manual benchmarking is required to accurately determine this.
-
-* Optional: You might wish to tune XHC's chunk size (default `16K`). Use the
-`coll_xhc_chunk_size` param, and try values close to the default and see if
-improvements are observed.
-
- - Example: `--mca coll_xhc_chunk_size 16K`
-
-* Optional: If you wish to focus on latencies of small/medium size messages,
-you can try altering the cico-to-zcopy switchover point (MCA
-`coll_xhc_cico_max`, default `1K`).
-
- - Example: `--mca coll_xhc_cico_max 1K`
-
-* Optional: If your application is heavy in Broadcast calls and you suspect
-that specific ranks might be joining the collective with delay and causing
-others to stall waiting for them, try enabling dynamic leadership (MCA
-`coll_xhc_dynamic_leader`), and seeing if it makes an improvement. Please let
-us know if it does :-).
-
- - Example: `--mca coll_xhc_dynamic_leader 1`
-
-### Example command lines
-
-*Assuming `PATH` and `LD_LIBRARY_PATH` have been set appropriately.*
-
-Default XHC configuration:
-`$ mpirun --mca coll_xhc_priority 100 --bind-to core `
-
-XHC w/ numa-sensitive hierarchy, chunk size @ 16K:
-`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core `
-
-XHC with flat hierarchy (ie. none at all):
-`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] `
-
-## Benchmarking
-
-This section outlines some tips for benchmarking XHC and intra-node MPI
-collectives in general.
-
-### Micro-Benchmarks
-
-For our micro-benchmarking purposes, we have been using [OSU's microbenchmark
-suite](https://mvapich.cse.ohio-state.edu/benchmarks/). However, when
-micro-benchmarking intra-node collectives, there are some important details
-that one needs to look out for.
-
-**CPU Cache** An issue with the OSU micro-benchmarks is that they use the same
-buffer for each iteration without altering it. Since modern processors
-implicitly cache data, this can lead to false/unrealistic/unrepresentative
-results, given that actual real-world applications do not (usually/optimally!)
-perform duplicate operations.
-
-Availability of collective operation source data on a processor's local cache
-hierarchy will cause certain phenomenons (e.g. slow path memory transactions)
-and their effects to remain hidden and undetected in the micro-benchmarking
-process, even though they *will* negatively impact performance in actual
-applications,
-
-We have created "data-varying" (`_dv` suffix) benchmarks to counter this
-problem, which will alter the data before each iteration.
-
-**Microbenchmark's pre-op Barrier** One also needs to be aware how the barrier
-that appears before each iteration in the OSU micro-benchmarks affects the
-result, especially so when latencies of small messages are concerned. The
-underlying implementation of this barrier and the speed/efficiency of its
-"release stage" will affect how fast and how synchronized ranks will exit the
-barrier, and therefore how fast/synchronized they will enter the benchmarked
-collective operation.
-
-For as accurate/clean performance reporting as possible, use a barrier
-implementation that has as low a latency as possible. Furthermore, ideally,
-all ranks should exit the barrier at the exact same time -- this is more
-complex to measure, but can make a difference. In order to have a common
-baseline when benchmarking and comparing multiple collectives implementation,
-use this same barrier implementation for all benchmark scenarios.
-
-In the environments we tested, XHC's barrier was the best performing one. To
-make using this barrier easier, we have put together a small new collective
-component, `XB` (= xhc barrier).
-
-XB creates a new nested (duplicate) communicator with a hint to prioritize XHC,
-and delegates barrier operations to it. A slightly inconvenient side-effect is
-that XHC needs to be on the coll list (MCA `--mca coll`); it doesn't need to
-have a high priority, though it can't be less than 0.
-
-* To benchmark Open MPI's `coll/tuned` with XB: `--mca coll basic,libnbc,tuned,xb,xhc --mca coll_xhc_priority 0 --mca coll_xb_priority 95 --mca coll_tuned_priority 90`
-
-* Or XHC itself, with XB: `--mca coll basic,libnbc,xb,xhc --mca coll_xhc_priority 90 --mca coll_xb_priority 95`
-
-It is also possible to specify the hierarchy to be used for XB's barrier (the
-request will be passed in string form to XHC, only for the nested communicator)
-via the `coll_xb_hierarchy` MCA parameter.
-
-In our fork of the OSU micro-benchmarks, you will also find
-"integrity-checking" variants (`_integrity` suffix). These can help verify that
-collective operations complete successfully without data corruption.
-
-Our OSU micro-benchmarks fork:
-
-
-The XB component:
-
-
-### Applications
-
-We expect to see any meaningful performance improvement with XHC in actual
-applications, only if they spend a non-insignificant percentage of their
-runtime in the collective operations that XHC implements: Broadcast, Barrier,
-Allreduce, Reduce.
-
-One known such application is [miniAMR](https://github.com/Mantevo/miniAMR).
-The application parameters (e.g. the refine count and frequency) will affect
-the amount of time spent in the Allreduce primitive.
-
-Another one is Microsoft's [CNTK](https://github.com/microsoft/CNTK), also
-heavy in Allreduce, though it actually makes use of the non-blocking
-`Iallreduce` variant. However, it can easily be converted to use the blocking
-variant instead (contact for patch). Comparing the performance of the
-unmodified CNTK with OpenMPI's `coll/libnbc`, versus that of the patched CNTK
-with XHC reveals that this modification is sensible and beneficial.
-
-Finally, while we have not yet rigorously evaluated it,
-[PiSvM](http://pisvm.sourceforge.net/) is another candidate, with intense use
-of MPI Broadcast.
-
----
-
-Contact: George Katevenis (gkatev@ics.forth.gr), Manolis Ploumidis (ploumid@ics.forth.gr)
-Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth
+For additional info and resources about XHC, check the Open MPI docs:
+https://docs.open-mpi.org/
diff --git a/ompi/mca/osc/osc.h b/ompi/mca/osc/osc.h
index c8f77404c1c..b43f34ac3c5 100644
--- a/ompi/mca/osc/osc.h
+++ b/ompi/mca/osc/osc.h
@@ -216,6 +216,15 @@ typedef int (*ompi_osc_base_module_put_fn_t)(const void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
+typedef int (*ompi_osc_base_module_put_notify_fn_t)(const void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win);
typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr,
size_t origin_count,
@@ -226,6 +235,23 @@ typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
+typedef int (*ompi_osc_base_module_get_notify_fn_t)(void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win);
+
+typedef int (*ompi_osc_base_module_win_get_notify_value_fn_t)(struct ompi_win_t *win,
+ int notify,
+ OMPI_MPI_COUNT_TYPE *value);
+
+typedef int (*ompi_osc_base_module_win_reset_notify_value_fn_t)(struct ompi_win_t *win,
+ int notify,
+ OMPI_MPI_COUNT_TYPE *value);
typedef int (*ompi_osc_base_module_accumulate_fn_t)(const void *origin_addr,
size_t origin_count,
@@ -276,6 +302,17 @@ typedef int (*ompi_osc_base_module_rput_fn_t)(const void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
+typedef int (*ompi_osc_base_module_rput_notify_fn_t)(const void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win,
+ struct ompi_request_t **request);
+
typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr,
size_t origin_count,
struct ompi_datatype_t *origin_dt,
@@ -286,6 +323,16 @@ typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
+typedef int (*ompi_osc_base_module_rget_notify_fn_t)(void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win,
+ struct ompi_request_t **request);
typedef int (*ompi_osc_base_module_raccumulate_fn_t)(const void *origin_addr,
size_t origin_count,
@@ -371,7 +418,6 @@ typedef int (*ompi_osc_base_module_flush_local_all_fn_t)(struct ompi_win_t *win)
* module structure.
*/
- // TODO: extend the struct and add pointers to put/get_with_notify functions
struct ompi_osc_base_module_4_0_0_t {
ompi_osc_base_module_win_shared_query_fn_t osc_win_shared_query;
@@ -409,6 +455,12 @@ struct ompi_osc_base_module_4_0_0_t {
ompi_osc_base_module_flush_all_fn_t osc_flush_all;
ompi_osc_base_module_flush_local_fn_t osc_flush_local;
ompi_osc_base_module_flush_local_all_fn_t osc_flush_local_all;
+ ompi_osc_base_module_put_notify_fn_t osc_put_notify;
+ ompi_osc_base_module_get_notify_fn_t osc_get_notify;
+ ompi_osc_base_module_win_get_notify_value_fn_t osc_win_get_notify_value;
+ ompi_osc_base_module_win_reset_notify_value_fn_t osc_win_reset_notify_value;
+ ompi_osc_base_module_rput_notify_fn_t osc_rput_notify;
+ ompi_osc_base_module_rget_notify_fn_t osc_rget_notify;
};
typedef struct ompi_osc_base_module_4_0_0_t ompi_osc_base_module_4_0_0_t;
typedef ompi_osc_base_module_4_0_0_t ompi_osc_base_module_t;
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index cc34c109683..14eeb928e40 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -1649,37 +1649,39 @@ int ompi_osc_rdma_shared_query(
ptrdiff_t *disp_unit, void *baseptr)
{
int rc = OMPI_ERR_NOT_SUPPORTED;
- ompi_osc_rdma_peer_t *peer;
- int actual_rank = rank;
+ ompi_osc_rdma_peer_t *peer = NULL;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
- peer = ompi_osc_module_get_peer (module, actual_rank);
- if (NULL == peer) {
- return OMPI_ERR_NOT_SUPPORTED;
- }
-
/* currently only supported for allocated windows */
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) {
return OMPI_ERR_NOT_SUPPORTED;
}
- if (!ompi_osc_rdma_peer_local_base(peer)) {
- return OMPI_ERR_NOT_SUPPORTED;
- }
-
if (MPI_PROC_NULL == rank) {
/* iterate until we find a rank that has a non-zero size */
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
peer = ompi_osc_module_get_peer (module, i);
- ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
- if (!ompi_osc_rdma_peer_local_base(peer)) {
+ if (NULL == peer) {
+ /* peer object not cached yet (typically non-local here since local peers are added eagerly) */
continue;
- } else if (module->same_size && ex_peer->super.base) {
- break;
- } else if (ex_peer->size > 0) {
- break;
}
+ ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
+ if (ompi_osc_rdma_peer_local_base(peer)) {
+ if (module->same_size && ex_peer->super.base) {
+ break;
+ } else if (ex_peer->size > 0) {
+ break;
+ }
+ }
+ // reset so we don't mistakenly use a peer without memory
+ peer = NULL;
}
+ } else {
+ peer = ompi_osc_module_get_peer (module, rank);
+ }
+
+ if (NULL == peer || !ompi_osc_rdma_peer_local_base(peer)) {
+ return OMPI_ERR_NOT_SUPPORTED;
}
if (module->same_size && module->same_disp_unit) {
diff --git a/ompi/mca/osc/sm/osc_sm.h b/ompi/mca/osc/sm/osc_sm.h
index 23afacd7d49..85d250bfa18 100644
--- a/ompi/mca/osc/sm/osc_sm.h
+++ b/ompi/mca/osc/sm/osc_sm.h
@@ -22,6 +22,7 @@ typedef uint64_t osc_sm_post_type_t;
typedef opal_atomic_uint64_t osc_sm_post_atomic_type_t;
#define OSC_SM_POST_BITS 6
#define OSC_SM_POST_MASK 0x3f
+#define OSC_SM_MAX_NOTIFY_COUNTERS 16
/* data shared across all peers */
struct ompi_osc_sm_global_state_t {
@@ -47,6 +48,9 @@ struct ompi_osc_sm_node_state_t {
opal_atomic_int32_t complete_count;
ompi_osc_sm_lock_t lock;
opal_atomic_lock_t accumulate_lock;
+ uint32_t notify_counter_count;
+ uint64_t notify_counter_offset; /* offset from segment_base, not raw pointer */
+
};
typedef struct ompi_osc_sm_node_state_t ompi_osc_sm_node_state_t;
@@ -79,7 +83,7 @@ struct ompi_osc_sm_module_t {
size_t *sizes;
void **bases;
ptrdiff_t *disp_units;
- uint64_t **notify_counters;
+ uint64_t *notify_counters;
ompi_group_t *start_group;
@@ -107,7 +111,6 @@ int ompi_osc_sm_detach(struct ompi_win_t *win, const void *base);
int ompi_osc_sm_free(struct ompi_win_t *win);
-// TODO: add put/get_with_notify prototypes
int ompi_osc_sm_put(const void *origin_addr,
size_t origin_count,
@@ -118,6 +121,16 @@ int ompi_osc_sm_put(const void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
+ int ompi_osc_sm_put_notify(const void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win);
+
int ompi_osc_sm_get(void *origin_addr,
size_t origin_count,
struct ompi_datatype_t *origin_dt,
@@ -127,6 +140,24 @@ int ompi_osc_sm_get(void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
+int ompi_osc_sm_get_notify(void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win);
+
+int ompi_osc_sm_win_get_notify_value(struct ompi_win_t *win,
+ int notify,
+ OMPI_MPI_COUNT_TYPE *value);
+
+int ompi_osc_sm_win_reset_notify_value(struct ompi_win_t *win,
+ int notify,
+ OMPI_MPI_COUNT_TYPE *value);
+
int ompi_osc_sm_accumulate(const void *origin_addr,
size_t origin_count,
struct ompi_datatype_t *origin_dt,
@@ -176,6 +207,17 @@ int ompi_osc_sm_rput(const void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
+int ompi_osc_sm_rput_notify(const void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win,
+ struct ompi_request_t **request);
+
int ompi_osc_sm_rget(void *origin_addr,
size_t origin_count,
struct ompi_datatype_t *origin_dt,
@@ -186,6 +228,17 @@ int ompi_osc_sm_rget(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
+int ompi_osc_sm_rget_notify(void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win,
+ struct ompi_request_t **request);
+
int ompi_osc_sm_raccumulate(const void *origin_addr,
size_t origin_count,
struct ompi_datatype_t *origin_dt,
diff --git a/ompi/mca/osc/sm/osc_sm_comm.c b/ompi/mca/osc/sm/osc_sm_comm.c
index f9bae370870..fbd4f17856c 100644
--- a/ompi/mca/osc/sm/osc_sm_comm.c
+++ b/ompi/mca/osc/sm/osc_sm_comm.c
@@ -17,9 +17,58 @@
#include "ompi/mca/osc/osc.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
+#include "ompi/communicator/communicator.h"
#include "osc_sm.h"
+static inline uint64_t *osc_sm_target_notify_base(ompi_osc_sm_module_t *module, int target)
+{
+ if (NULL == module->segment_base) {
+ /* single-rank path: notify_counters is a regular local allocation */
+ return module->notify_counters;
+ }
+
+ return (uint64_t *) ((char *) module->segment_base +
+ module->node_states[target].notify_counter_offset);
+}
+
+int
+ompi_osc_sm_win_get_notify_value(struct ompi_win_t *win,
+ int notify,
+ OMPI_MPI_COUNT_TYPE *value)
+{
+ ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t *) win->w_osc_module;
+ int rank = ompi_comm_rank(module->comm);
+
+ if (notify < 0 || (uint32_t) notify >= module->node_states[rank].notify_counter_count) {
+ return MPI_ERR_NOTIFY_IDX;
+ }
+
+ *value = (OMPI_MPI_COUNT_TYPE) osc_sm_target_notify_base(module, rank)[notify];
+ opal_atomic_rmb();
+
+ return OMPI_SUCCESS;
+}
+
+int
+ompi_osc_sm_win_reset_notify_value(struct ompi_win_t *win,
+ int notify,
+ OMPI_MPI_COUNT_TYPE *value)
+{
+ ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t *) win->w_osc_module;
+ int rank = ompi_comm_rank(module->comm);
+
+ if (notify < 0 || (uint32_t) notify >= module->node_states[rank].notify_counter_count) {
+ return MPI_ERR_NOTIFY_IDX;
+ }
+
+ /* Atomically swap the counter to 0 and return the previous value */
+ *value = (OMPI_MPI_COUNT_TYPE) opal_atomic_swap_64(
+ &osc_sm_target_notify_base(module, rank)[notify], 0);
+
+ return OMPI_SUCCESS;
+}
+
int
ompi_osc_sm_rput(const void *origin_addr,
size_t origin_count,
@@ -59,6 +108,53 @@ ompi_osc_sm_rput(const void *origin_addr,
return OMPI_SUCCESS;
}
+int
+ompi_osc_sm_rput_notify(const void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win,
+ struct ompi_request_t **ompi_req)
+{
+ int ret;
+ ompi_osc_sm_module_t *module =
+ (ompi_osc_sm_module_t*) win->w_osc_module;
+ void *remote_address;
+
+ OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+ "rput_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx",
+ (unsigned long) origin_addr, origin_count,
+ origin_dt->name, target, (int) target_disp,
+ target_count, target_dt->name,
+ notify,
+ (unsigned long) win));
+
+ remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp;
+
+ ret = ompi_datatype_sndrcv((void *)origin_addr, origin_count, origin_dt,
+ remote_address, target_count, target_dt);
+ if (OMPI_SUCCESS != ret) {
+ return ret;
+ }
+
+ /* the only valid field of RMA request status is the MPI_ERROR field.
+ * ompi_request_empty has status MPI_SUCCESS and indicates the request is
+ * complete. */
+ *ompi_req = &ompi_request_empty;
+
+ if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) {
+ return MPI_ERR_NOTIFY_IDX;
+ }
+
+ opal_atomic_wmb();
+ opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1);
+
+ return OMPI_SUCCESS;
+}
int
ompi_osc_sm_rget(void *origin_addr,
@@ -99,6 +195,53 @@ ompi_osc_sm_rget(void *origin_addr,
return OMPI_SUCCESS;
}
+int
+ompi_osc_sm_rget_notify(void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win,
+ struct ompi_request_t **ompi_req)
+{
+ int ret;
+ ompi_osc_sm_module_t *module =
+ (ompi_osc_sm_module_t*) win->w_osc_module;
+ void *remote_address;
+
+ OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+ "rget_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx",
+ (unsigned long) origin_addr, origin_count,
+ origin_dt->name, target, (int) target_disp,
+ target_count, target_dt->name,
+ notify,
+ (unsigned long) win));
+
+ remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp;
+
+ ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt,
+ origin_addr, origin_count, origin_dt);
+ if (OMPI_SUCCESS != ret) {
+ return ret;
+ }
+
+ /* the only valid field of RMA request status is the MPI_ERROR field.
+ * ompi_request_empty has status MPI_SUCCESS and indicates the request is
+ * complete. */
+ *ompi_req = &ompi_request_empty;
+
+ if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) {
+ return MPI_ERR_NOTIFY_IDX;
+ }
+
+ opal_atomic_rmb();
+ opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1);
+
+ return OMPI_SUCCESS;
+}
int
ompi_osc_sm_raccumulate(const void *origin_addr,
@@ -236,6 +379,48 @@ ompi_osc_sm_put(const void *origin_addr,
}
+int
+ompi_osc_sm_put_notify(const void *origin_addr,
+ size_t origin_count,
+ struct ompi_datatype_t *origin_dt,
+ int target,
+ ptrdiff_t target_disp,
+ size_t target_count,
+ struct ompi_datatype_t *target_dt,
+ int notify,
+ struct ompi_win_t *win)
+{
+ int ret;
+ ompi_osc_sm_module_t *module =
+ (ompi_osc_sm_module_t*) win->w_osc_module;
+ void *remote_address;
+
+ OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+ "put_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx",
+ (unsigned long) origin_addr, origin_count,
+ origin_dt->name, target, (int) target_disp,
+ target_count, target_dt->name,
+ notify,
+ (unsigned long) win));
+
+ remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp;
+
+ ret = ompi_datatype_sndrcv((void *)origin_addr, origin_count, origin_dt,
+ remote_address, target_count, target_dt);
+ if (OMPI_SUCCESS != ret) {
+ return ret;
+ }
+
+ if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) {
+ return MPI_ERR_NOTIFY_IDX;
+ }
+
+ opal_atomic_wmb();
+ opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1);
+
+ return ret;
+}
+
int
ompi_osc_sm_get(void *origin_addr,
size_t origin_count,
@@ -268,7 +453,7 @@ ompi_osc_sm_get(void *origin_addr,
int
-ompi_osc_sm_get_with_notify(void *origin_addr,
+ompi_osc_sm_get_notify(void *origin_addr,
size_t origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@@ -294,9 +479,15 @@ ompi_osc_sm_get_with_notify(void *origin_addr,
ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt,
origin_addr, origin_count, origin_dt);
- // TODO: do the same for put_with_notify
+ if (OMPI_SUCCESS != ret) {
+ return ret;
+ }
+ if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) {
+ return MPI_ERR_NOTIFY_IDX;
+ }
+
opal_atomic_rmb();
- opal_atomic_add(&module->notify_counters[target][notify], 1);
+ opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1);
return ret;
}
@@ -472,5 +663,5 @@ ompi_osc_sm_fetch_and_op(const void *origin_addr,
done:
opal_atomic_unlock(&module->node_states[target].accumulate_lock);
- return OMPI_SUCCESS;;
+ return OMPI_SUCCESS;
}
diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c
index 1ad9a48cfd2..259c0826017 100644
--- a/ompi/mca/osc/sm/osc_sm_component.c
+++ b/ompi/mca/osc/sm/osc_sm_component.c
@@ -70,8 +70,6 @@ ompi_osc_sm_component_t mca_osc_sm_component = {
MCA_BASE_COMPONENT_INIT(ompi, osc, sm)
-// TODO: extend the struct and add pointers to put/get_with_notify functions
-// TODO: extend it to rput/rget_with_notify as well
ompi_osc_sm_module_t ompi_osc_sm_module_template = {
{
.osc_win_shared_query = ompi_osc_sm_shared_query,
@@ -81,14 +79,20 @@ ompi_osc_sm_module_t ompi_osc_sm_module_template = {
.osc_free = ompi_osc_sm_free,
.osc_put = ompi_osc_sm_put,
+ .osc_put_notify = ompi_osc_sm_put_notify,
.osc_get = ompi_osc_sm_get,
+ .osc_get_notify = ompi_osc_sm_get_notify,
+ .osc_win_get_notify_value = ompi_osc_sm_win_get_notify_value,
+ .osc_win_reset_notify_value = ompi_osc_sm_win_reset_notify_value,
.osc_accumulate = ompi_osc_sm_accumulate,
.osc_compare_and_swap = ompi_osc_sm_compare_and_swap,
.osc_fetch_and_op = ompi_osc_sm_fetch_and_op,
.osc_get_accumulate = ompi_osc_sm_get_accumulate,
.osc_rput = ompi_osc_sm_rput,
+ .osc_rput_notify = ompi_osc_sm_rput_notify,
.osc_rget = ompi_osc_sm_rget,
+ .osc_rget_notify = ompi_osc_sm_rget_notify,
.osc_raccumulate = ompi_osc_sm_raccumulate,
.osc_rget_accumulate = ompi_osc_sm_rget_accumulate,
@@ -253,12 +257,19 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
module->posts = calloc (1, sizeof(module->posts[0]) + sizeof (module->posts[0][0]));
if (NULL == module->posts) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
module->posts[0] = (osc_sm_post_atomic_type_t *) (module->posts + 1);
+
+ /* allocate notify counters for single process case */
+ module->notify_counters = calloc(OSC_SM_MAX_NOTIFY_COUNTERS, sizeof(uint64_t));
+ if (NULL == module->notify_counters) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
+ module->node_states[0].notify_counter_count = OSC_SM_MAX_NOTIFY_COUNTERS;
+ module->node_states[0].notify_counter_offset = 0;
} else {
- unsigned long total, *rbuf;
+ unsigned long total, total_counters, gather_values[2], *rbuf;
int i, flag;
size_t pagesize;
size_t state_size;
size_t posts_size, post_size = (comm_size + OSC_SM_POST_MASK) / (OSC_SM_POST_MASK + 1);
+ size_t notify_counters_size;
size_t data_base_size;
opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output,
@@ -267,7 +278,7 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
/* get the pagesize */
pagesize = opal_getpagesize();
- rbuf = malloc(sizeof(unsigned long) * comm_size);
+ rbuf = malloc(sizeof(unsigned long) * comm_size * 2 );
if (NULL == rbuf) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
/* Note that the alloc_shared_noncontig info key only has
@@ -291,9 +302,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
"allocating window using contiguous strategy");
}
- total = size;
- ret = module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG,
- rbuf, 1, MPI_UNSIGNED_LONG,
+ gather_values[0] = size;
+ gather_values[1] = OSC_SM_MAX_NOTIFY_COUNTERS;
+ ret = module->comm->c_coll->coll_allgather(gather_values, 2, MPI_UNSIGNED_LONG,
+ rbuf, 2, MPI_UNSIGNED_LONG,
module->comm,
module->comm->c_coll->coll_allgather_module);
if (OMPI_SUCCESS != ret) {
@@ -302,8 +314,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
}
total = 0;
+ total_counters = 0;
for (i = 0 ; i < comm_size ; ++i) {
- total += rbuf[i];
+ total += rbuf[2 * i];
+ total_counters += rbuf[2 * i + 1];
if (module->noncontig) {
total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize);
}
@@ -314,7 +328,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
state_size += OPAL_ALIGN_PAD_AMOUNT(state_size, 64);
posts_size = comm_size * post_size * sizeof (module->posts[0][0]);
posts_size += OPAL_ALIGN_PAD_AMOUNT(posts_size, 64);
- data_base_size = state_size + posts_size;
+ notify_counters_size = total_counters * sizeof(uint64_t);
+ notify_counters_size += OPAL_ALIGN_PAD_AMOUNT(notify_counters_size, 64);
+ data_base_size = state_size + posts_size + notify_counters_size;
data_base_size += OPAL_ALIGN_PAD_AMOUNT(data_base_size, pagesize);
if (0 == ompi_comm_rank (module->comm)) {
char *data_file;
@@ -375,15 +391,27 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
module->global_state = (ompi_osc_sm_global_state_t *) (module->posts[0] + comm_size * post_size);
module->node_states = (ompi_osc_sm_node_state_t *) (module->global_state + 1);
- for (i = 0, total = data_base_size ; i < comm_size ; ++i) {
+ /* set up notify counters in shared memory after node_states */
+ module->notify_counters = (uint64_t *) ((char *)(module->node_states + comm_size) +
+ OPAL_ALIGN_PAD_AMOUNT((uintptr_t)(module->node_states + comm_size), 64));
+ /* zero out notify counters */
+ memset(module->notify_counters, 0, total_counters * sizeof(uint64_t));
+
+ for (i = 0, total = data_base_size, total_counters = 0 ; i < comm_size ; ++i) {
if (i > 0) {
module->posts[i] = module->posts[i - 1] + post_size;
}
- module->sizes[i] = rbuf[i];
+ module->node_states[i].notify_counter_count = (uint32_t) rbuf[2 * i + 1];
+ module->node_states[i].notify_counter_offset =
+ (uint64_t) ((char *) (module->notify_counters + total_counters) -
+ (char *) module->segment_base);
+ total_counters += rbuf[2 * i + 1];
+
+ module->sizes[i] = rbuf[2 * i];
if (module->sizes[i] || !module->noncontig) {
module->bases[i] = ((char *) module->segment_base) + total;
- total += rbuf[i];
+ total += rbuf[2 * i];
if (module->noncontig) {
total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize);
}
@@ -397,7 +425,8 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
/* initialize my state shared */
module->my_node_state = &module->node_states[ompi_comm_rank(module->comm)];
- memset (module->my_node_state, 0, sizeof(*module->my_node_state));
+ module->my_node_state->complete_count = 0;
+ memset (&module->my_node_state->lock, 0, sizeof(module->my_node_state->lock));
*base = module->bases[ompi_comm_rank(module->comm)];
@@ -553,6 +582,7 @@ ompi_osc_sm_free(struct ompi_win_t *win)
module->comm->c_coll->coll_barrier_module);
opal_shmem_segment_detach (&module->seg_ds);
+ /* notify_counters points into shared memory segment, no separate free needed */
} else {
free(module->node_states);
free(module->global_state);
@@ -560,6 +590,8 @@ ompi_osc_sm_free(struct ompi_win_t *win)
mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module,
module->bases[0]);
}
+ /* free notify_counters for single process case */
+ free(module->notify_counters);
}
free(module->disp_units);
free(module->outstanding_locks);
diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c
index ab122e67263..0354edb71c0 100644
--- a/ompi/mca/osc/ucx/osc_ucx_comm.c
+++ b/ompi/mca/osc/ucx/osc_ucx_comm.c
@@ -944,7 +944,7 @@ static inline int ompi_osc_ucx_check_ops_and_flush (ompi_osc_ucx_module_t *modul
uint64_t base_tmp, tail_tmp;
int ret = OMPI_SUCCESS;
- if (module->ctx->num_incomplete_req_ops > ompi_osc_ucx_outstanding_ops_flush_threshold) {
+ if ((size_t)module->ctx->num_incomplete_req_ops > ompi_osc_ucx_outstanding_ops_flush_threshold) {
ret = opal_common_ucx_ctx_flush(module->ctx, OPAL_COMMON_UCX_SCOPE_WORKER, 0);
if (ret != OPAL_SUCCESS) {
ret = OMPI_ERROR;
diff --git a/ompi/mca/part/persist/part_persist.h b/ompi/mca/part/persist/part_persist.h
index ccc8f8f1971..86fb9bac42d 100644
--- a/ompi/mca/part/persist/part_persist.h
+++ b/ompi/mca/part/persist/part_persist.h
@@ -490,7 +490,7 @@ mca_part_persist_psend_init(const void* buf,
return err;
}
-__opal_attribute_always_inline__ static inline int
+static inline int
mca_part_persist_start(size_t count, ompi_request_t** requests)
{
int err = OMPI_SUCCESS;
diff --git a/ompi/mca/pml/ob1/pml_ob1_iprobe.c b/ompi/mca/pml/ob1/pml_ob1_iprobe.c
index 4d6a0eb8dfd..97744cce5dc 100644
--- a/ompi/mca/pml/ob1/pml_ob1_iprobe.c
+++ b/ompi/mca/pml/ob1/pml_ob1_iprobe.c
@@ -47,6 +47,11 @@ int mca_pml_ob1_iprobe(int src,
*matched = 1;
} else {
*matched = 0;
+#if OPAL_ENABLE_FT_MPI
+ if( ompi_request_is_failed((ompi_request_t*)&recvreq) ) {
+ rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
+ }
+#endif
opal_progress();
}
MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv );
@@ -119,6 +124,11 @@ mca_pml_ob1_improbe(int src,
(*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount;
} else {
*matched = 0;
+#if OPAL_ENABLE_FT_MPI
+ if( ompi_request_is_failed((ompi_request_t*)recvreq) ) {
+ rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
+ }
+#endif
/* we only free if we didn't match, because we're going to
translate the request into a receive request later on if it
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 57aba677a8a..a6a2866f2a2 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -108,16 +108,19 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request,
}
if( !request->req_match_received ) { /* the match has not been already done */
assert( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ); /* not matched isn't it */
+ if(OPAL_LIKELY(request->req_recv.req_base.req_type != MCA_PML_REQUEST_IPROBE &&
+ request->req_recv.req_base.req_type != MCA_PML_REQUEST_IMPROBE)) {
#if MCA_PML_OB1_CUSTOM_MATCH
- custom_match_prq_cancel(ob1_comm->prq, request);
+ custom_match_prq_cancel(ob1_comm->prq, request);
#else
- if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
- opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request );
- } else {
- mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer);
- opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
- }
+ if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
+ opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request );
+ } else {
+ mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer);
+ opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
+ }
#endif
+ }
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
&(request->req_recv.req_base), PERUSE_RECV );
OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock);
diff --git a/ompi/mpi/bindings/ompi_bindings/consts.py b/ompi/mpi/bindings/ompi_bindings/consts.py
index 43bca486b57..759b342f64a 100644
--- a/ompi/mpi/bindings/ompi_bindings/consts.py
+++ b/ompi/mpi/bindings/ompi_bindings/consts.py
@@ -23,6 +23,7 @@
'MPI_SUCCESS',
'MPI_ERR_BUFFER',
'MPI_ERR_COUNT',
+ 'MPI_ERR_NOTIFY_IDX',
'MPI_ERR_TYPE',
'MPI_ERR_TAG',
'MPI_ERR_COMM',
diff --git a/ompi/mpi/c/Makefile.am b/ompi/mpi/c/Makefile.am
index 25b871fa7d4..49619694d0b 100644
--- a/ompi/mpi/c/Makefile.am
+++ b/ompi/mpi/c/Makefile.am
@@ -223,6 +223,7 @@ prototype_sources = \
get_accumulate.c.in \
get_address.c.in \
get.c.in \
+ get_notify.c.in \
get_count.c.in \
get_elements.c.in \
get_elements_x.c.in \
@@ -341,6 +342,7 @@ prototype_sources = \
psend_init.c.in \
publish_name.c.in \
put.c.in \
+ put_notify.c.in \
query_thread.c.in \
raccumulate.c.in \
recv.c.in \
@@ -484,6 +486,8 @@ prototype_sources = \
win_get_group.c.in \
win_get_info.c.in \
win_get_name.c.in \
+ win_get_notify_value.c.in \
+ win_reset_notify_value.c.in \
win_lock_all.c.in \
win_lock.c.in \
win_post.c.in \
@@ -954,6 +958,8 @@ interface_profile_sources = \
win_get_group_generated.c \
win_get_info_generated.c \
win_get_name_generated.c \
+ win_get_notify_value_generated.c \
+ win_reset_notify_value_generated.c \
win_lock_all_generated.c \
win_lock_generated.c \
win_post_generated.c \
diff --git a/ompi/mpi/c/get_notify.c.in b/ompi/mpi/c/get_notify.c.in
new file mode 100644
index 00000000000..1bad16944ab
--- /dev/null
+++ b/ompi/mpi/c/get_notify.c.in
@@ -0,0 +1,77 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ * University Research and Technology
+ * Corporation. All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2015 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
+ * Copyright (c) 2024 Triad National Security, LLC. All rights
+ * reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include <stdio.h>
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/runtime/params.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/errhandler/errhandler.h"
+#include "ompi/win/win.h"
+#include "ompi/mca/osc/osc.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/runtime/ompi_spc.h"
+
+PROTOTYPE ERROR_CLASS get_notify(BUFFER_OUT origin_addr, COUNT origin_count,
+ DATATYPE origin_datatype, INT target_rank,
+ AINT target_disp, COUNT target_count,
+ DATATYPE target_datatype, INT notification_idx, WIN win)
+{
+ int rc;
+
+ SPC_RECORD(OMPI_SPC_GET_NOTIFY, 1);
+
+ if (MPI_PARAM_CHECK) {
+ rc = OMPI_SUCCESS;
+
+ OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+ if (ompi_win_invalid(win)) {
+ return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME);
+ } else if (origin_count < 0 || target_count < 0) {
+ rc = MPI_ERR_COUNT;
+ } else if (ompi_win_peer_invalid(win, target_rank) &&
+ (MPI_PROC_NULL != target_rank)) {
+ rc = MPI_ERR_RANK;
+ } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) {
+ rc = MPI_ERR_DISP;
+ } else if (notification_idx < 0) {
+ rc = MPI_ERR_NOTIFY_IDX;
+ } else {
+ OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count);
+ if (OMPI_SUCCESS == rc) {
+ OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count);
+ }
+ }
+ OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME);
+ }
+
+ if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS;
+
+ rc = win->w_osc_module->osc_get_notify(origin_addr, origin_count, origin_datatype,
+ target_rank, target_disp, target_count,
+ target_datatype, notification_idx, win);
+ OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME);
+}
diff --git a/ompi/mpi/c/put_notify.c.in b/ompi/mpi/c/put_notify.c.in
new file mode 100644
index 00000000000..14ee5c7e365
--- /dev/null
+++ b/ompi/mpi/c/put_notify.c.in
@@ -0,0 +1,80 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ * University Research and Technology
+ * Corporation. All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2015 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
+ * Copyright (c) 2024 Triad National Security, LLC. All rights
+ * reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include <stdio.h>
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/runtime/params.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/errhandler/errhandler.h"
+#include "ompi/win/win.h"
+#include "ompi/mca/osc/osc.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/runtime/ompi_spc.h"
+
+PROTOTYPE ERROR_CLASS put_notify(BUFFER origin_addr, COUNT origin_count, DATATYPE origin_datatype,
+ INT target_rank, AINT target_disp, COUNT target_count,
+ DATATYPE target_datatype, INT notification_idx, WIN win)
+{
+ int rc;
+
+ SPC_RECORD(OMPI_SPC_PUT_NOTIFY, 1);
+
+ if (MPI_PARAM_CHECK) {
+ rc = OMPI_SUCCESS;
+
+ OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+ if (ompi_win_invalid(win)) {
+ return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME);
+ } else if (origin_count < 0 || target_count < 0) {
+ rc = MPI_ERR_COUNT;
+ } else if (ompi_win_peer_invalid(win, target_rank) &&
+ (MPI_PROC_NULL != target_rank)) {
+ rc = MPI_ERR_RANK;
+ } else if (NULL == target_datatype ||
+ MPI_DATATYPE_NULL == target_datatype) {
+ rc = MPI_ERR_TYPE;
+ } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) {
+ rc = MPI_ERR_DISP;
+ } else if (notification_idx < 0) {
+ rc = MPI_ERR_NOTIFY_IDX;
+ } else {
+ OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count);
+ if (OMPI_SUCCESS == rc) {
+ OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count);
+ }
+ }
+ OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME);
+ }
+
+ if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS;
+
+ rc = win->w_osc_module->osc_put_notify(origin_addr, origin_count, origin_datatype,
+ target_rank, target_disp, target_count,
+ target_datatype, notification_idx, win);
+ OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME);
+}
diff --git a/ompi/mpi/c/win_get_notify_value.c.in b/ompi/mpi/c/win_get_notify_value.c.in
new file mode 100644
index 00000000000..228999c13ea
--- /dev/null
+++ b/ompi/mpi/c/win_get_notify_value.c.in
@@ -0,0 +1,41 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2026 Triad National Security, LLC. All rights
+ * reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/runtime/params.h"
+#include "ompi/errhandler/errhandler.h"
+#include "ompi/win/win.h"
+#include "ompi/mca/osc/osc.h"
+
+PROTOTYPE ERROR_CLASS win_get_notify_value(WIN win, INT notification_idx, ELEMENT_COUNT value)
+{
+ int rc;
+
+ if (MPI_PARAM_CHECK) {
+ rc = OMPI_SUCCESS;
+
+ OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+ if (ompi_win_invalid(win)) {
+ return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME);
+ } else if (notification_idx < 0) {
+ rc = MPI_ERR_NOTIFY_IDX;
+ } else if (NULL == value) {
+ rc = MPI_ERR_ARG;
+ }
+
+ OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME);
+ }
+
+ rc = win->w_osc_module->osc_win_get_notify_value(win, notification_idx, value);
+ OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME);
+}
diff --git a/ompi/mpi/c/win_reset_notify_value.c.in b/ompi/mpi/c/win_reset_notify_value.c.in
new file mode 100644
index 00000000000..99aa1755a76
--- /dev/null
+++ b/ompi/mpi/c/win_reset_notify_value.c.in
@@ -0,0 +1,41 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2026 Triad National Security, LLC. All rights
+ * reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+
+#include "ompi/mpi/c/bindings.h"
+#include "ompi/runtime/params.h"
+#include "ompi/errhandler/errhandler.h"
+#include "ompi/win/win.h"
+#include "ompi/mca/osc/osc.h"
+
+PROTOTYPE ERROR_CLASS win_reset_notify_value(WIN win, INT notification_idx, ELEMENT_COUNT value)
+{
+ int rc;
+
+ if (MPI_PARAM_CHECK) {
+ rc = OMPI_SUCCESS;
+
+ OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+ if (ompi_win_invalid(win)) {
+ return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME);
+ } else if (notification_idx < 0) {
+ rc = MPI_ERR_NOTIFY_IDX;
+ } else if (NULL == value) {
+ rc = MPI_ERR_ARG;
+ }
+
+ OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME);
+ }
+
+ rc = win->w_osc_module->osc_win_reset_notify_value(win, notification_idx, value);
+ OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME);
+}
diff --git a/ompi/mpi/fortran/mpif-h/request_get_status_f.c b/ompi/mpi/fortran/mpif-h/request_get_status_f.c
index 7a5c9d57716..7fac2b2e051 100644
--- a/ompi/mpi/fortran/mpif-h/request_get_status_f.c
+++ b/ompi/mpi/fortran/mpif-h/request_get_status_f.c
@@ -12,6 +12,7 @@
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
+ * Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -75,16 +76,11 @@ void ompi_request_get_status_f(MPI_Fint *request, ompi_fortran_logical_t *flag,
MPI_Request c_req = PMPI_Request_f2c( *request );
OMPI_LOGICAL_NAME_DECL(flag);
- /* This seems silly, but someone will do it */
-
- if (OMPI_IS_FORTRAN_STATUS_IGNORE(status)) {
- *flag = OMPI_INT_2_LOGICAL(0);
- c_ierr = MPI_SUCCESS;
- } else {
- c_ierr = PMPI_Request_get_status(c_req,
- OMPI_LOGICAL_SINGLE_NAME_CONVERT(flag),
- &c_status);
- OMPI_SINGLE_INT_2_LOGICAL(flag);
+ c_ierr = PMPI_Request_get_status(c_req,
+ OMPI_LOGICAL_SINGLE_NAME_CONVERT(flag),
+ &c_status);
+ OMPI_SINGLE_INT_2_LOGICAL(flag);
+ if (!OMPI_IS_FORTRAN_STATUS_IGNORE(status)) {
PMPI_Status_c2f( &c_status, status );
}
if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr);
diff --git a/ompi/request/req_ft.c b/ompi/request/req_ft.c
index 2c53ce076b0..e855afc59fd 100644
--- a/ompi/request/req_ft.c
+++ b/ompi/request/req_ft.c
@@ -128,7 +128,9 @@ bool ompi_request_is_failed_fn(ompi_request_t *req)
req->req_status.MPI_ERROR = MPI_ERR_PROC_FAILED_PENDING;
/* If it is a probe/mprobe, escalate the error */
if( (MCA_PML_REQUEST_MPROBE == pml_req->req_type) ||
- (MCA_PML_REQUEST_PROBE == pml_req->req_type) ) {
+ (MCA_PML_REQUEST_IMPROBE == pml_req->req_type) ||
+ (MCA_PML_REQUEST_PROBE == pml_req->req_type) ||
+ (MCA_PML_REQUEST_IPROBE == pml_req->req_type) ) {
req->req_status.MPI_ERROR = MPI_ERR_PROC_FAILED;
}
opal_output_verbose(10, ompi_ftmpi_output_handle,
diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c
index ad8a328dc55..08c6efaa616 100644
--- a/ompi/runtime/ompi_mpi_finalize.c
+++ b/ompi/runtime/ompi_mpi_finalize.c
@@ -24,6 +24,7 @@
* reserved.
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
+ * Copyright (c) 2026 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -281,14 +282,25 @@ int ompi_mpi_finalize(void)
* communications/actions to complete. See
* https://github.com/open-mpi/ompi/issues/1576 for the
* original bug report. */
- if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active))) {
- ret = opal_pmix_convert_status(rc);
- OMPI_ERROR_LOG(ret);
+ rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active);
+ if (PMIX_SUCCESS != rc) {
/* Reset the active flag to false, to avoid waiting for
* completion when the fence was failed. */
active = false;
+ // can return operation_succeeded if atomically completed
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+ OMPI_ERROR_LOG(ret);
+ }
+ } else {
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+ /* NOTE: we lose the fence return status here. This can be
+ * a problem as the fence CAN fail. Might consider retrieving
+ * the returned status so you can respond if it doesn't
+ * successfully complete? */
}
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
ompi_mpi_instance_finalize (&ompi_mpi_instance_default);
diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c
index c7e61c5bf94..deea53cb02e 100644
--- a/ompi/runtime/ompi_mpi_init.c
+++ b/ompi/runtime/ompi_mpi_init.c
@@ -26,7 +26,7 @@
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
- * Copyright (c) 2021 Nanook Consulting. All rights reserved.
+ * Copyright (c) 2021-2026 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2022 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -464,12 +464,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
- if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
- fence_release,
- (void*)&active))) {
- ret = opal_pmix_convert_status(rc);
- error = "PMIx_Fence_nb() failed";
- goto error;
+ rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active);
+ if (PMIX_SUCCESS != rc) {
+ active = false;
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ // can return operation_succeeded if atomically completed
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+ error = "PMIx_Fence_nb() failed";
+ goto error;
+ }
}
}
} else {
@@ -482,12 +487,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
if( PMIX_SUCCESS != rc) {
- ret = opal_pmix_convert_status(rc);
- error = "PMIx_Fence() failed";
- goto error;
+ active = false;
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ // can return operation_succeeded if atomically completed
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+ error = "PMIx_Fence_nb() failed";
+ goto error;
+ }
+ } else {
+ /* cannot just wait on thread as we need to call opal_progress */
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
- /* cannot just wait on thread as we need to call opal_progress */
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}
@@ -537,7 +549,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
* we have to wait here for it to complete. However, there
* is no reason to do two barriers! */
if (background_fence) {
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+ if (active) {
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+ }
} else if (!ompi_async_mpi_init) {
/* wait for everyone to reach this point - this is a hard
* barrier requirement at this time, though we hope to relax
@@ -546,13 +560,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
- if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
- fence_release, (void*)&active))) {
- ret = opal_pmix_convert_status(rc);
- error = "PMIx_Fence_nb() failed";
- goto error;
+ rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
+ if (PMIX_SUCCESS != rc) {
+ active = false;
+ if (PMIX_OPERATION_SUCCEEDED == rc) {
+ // can return operation_succeeded if atomically completed
+ ret = MPI_SUCCESS;
+ } else {
+ ret = opal_pmix_convert_status(rc);
+ error = "PMIx_Fence_nb() failed";
+ goto error;
+ }
+ } else {
+ OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
- OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}
diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c
index c747d55ee7d..7b5d1f3c55e 100644
--- a/ompi/runtime/ompi_mpi_params.c
+++ b/ompi/runtime/ompi_mpi_params.c
@@ -104,11 +104,12 @@ bool ompi_ftmpi_enabled = false;
#endif /* OPAL_ENABLE_FT_MPI */
static int ompi_stream_buffering_mode = -1;
+static int ompi_mpi_ft_verbose = 0;
int ompi_comm_verbose_level = 0;
int ompi_mpi_register_params(void)
{
- int value;
+ int value = 0;
#if OPAL_ENABLE_FT_MPI
mca_base_var_scope_t ftscope = MCA_BASE_VAR_SCOPE_READONLY;
@@ -121,15 +122,14 @@ int ompi_mpi_register_params(void)
"Enable UFLM MPI Fault Tolerance framework",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_4, ftscope, &ompi_ftmpi_enabled);
- value = 0;
(void) mca_base_var_register ("ompi", "mpi", "ft", "verbose",
"Verbosity level of the ULFM MPI Fault Tolerance framework",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
- OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &value);
+ OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_mpi_ft_verbose);
#if OPAL_ENABLE_FT_MPI
- if( 0 < value ) {
+ if( 0 < ompi_mpi_ft_verbose ) {
ompi_ftmpi_output_handle = opal_output_open(NULL);
- opal_output_set_verbosity(ompi_ftmpi_output_handle, value);
+ opal_output_set_verbosity(ompi_ftmpi_output_handle, ompi_mpi_ft_verbose);
}
(void) ompi_comm_rbcast_register_params();
diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c
index 651cf9d0b5a..f94df4fbd5d 100644
--- a/ompi/runtime/ompi_rte.c
+++ b/ompi/runtime/ompi_rte.c
@@ -85,56 +85,20 @@ static int _setup_proc_session_dir(char **sdir);
#define OPAL_PRINT_NAME_ARGS_MAX_SIZE 50
#define OPAL_PRINT_NAME_ARG_NUM_BUFS 16
-static bool fns_init=false;
-static opal_tsd_tracked_key_t print_args_tsd_key;
static char* opal_print_args_null = "NULL";
typedef struct {
- char *buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS];
+ char buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS][OPAL_PRINT_NAME_ARGS_MAX_SIZE + 1];
int cntr;
} opal_print_args_buffers_t;
-static void
-buffer_cleanup(void *value)
-{
- int i;
- opal_print_args_buffers_t *ptr;
-
- if (NULL != value) {
- ptr = (opal_print_args_buffers_t*)value;
- for (i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) {
- free(ptr->buffers[i]);
- }
- free (ptr);
- }
- fns_init = false;
-}
-
static opal_print_args_buffers_t*
get_print_name_buffer(void)
{
- opal_print_args_buffers_t *ptr;
- int ret, i;
-
- if (!fns_init) {
- /* setup the print_args function */
- OBJ_CONSTRUCT(&print_args_tsd_key, opal_tsd_tracked_key_t);
- opal_tsd_tracked_key_set_destructor(&print_args_tsd_key, buffer_cleanup);
- fns_init = true;
- }
-
- ret = opal_tsd_tracked_key_get(&print_args_tsd_key, (void**)&ptr);
- if (OPAL_SUCCESS != ret) return NULL;
+ static opal_thread_local opal_print_args_buffers_t name_buffer = {
+ .cntr = 0
+ };
- if (NULL == ptr) {
- ptr = (opal_print_args_buffers_t*)malloc(sizeof(opal_print_args_buffers_t));
- for (i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) {
- ptr->buffers[i] = (char *) malloc((OPAL_PRINT_NAME_ARGS_MAX_SIZE+1) * sizeof(char));
- }
- ptr->cntr = 0;
- ret = opal_tsd_tracked_key_set(&print_args_tsd_key, (void*)ptr);
- }
-
- return (opal_print_args_buffers_t*) ptr;
+ return &name_buffer;
}
static char* ompi_pmix_print_jobids(const opal_jobid_t job)
@@ -1043,10 +1007,6 @@ int ompi_rte_finalize(void)
opal_process_info.initial_errhandler = NULL;
}
- if (fns_init) {
- OBJ_DESTRUCT(&print_args_tsd_key);
- }
-
/* cleanup our internal nspace hack */
opal_pmix_finalize_nspace_tracker();
diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c
index 6f1d8aa7d6a..dcbbe04b256 100644
--- a/ompi/runtime/ompi_spc.c
+++ b/ompi/runtime/ompi_spc.c
@@ -71,8 +71,10 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = {
SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV, "The number of times MPI_Sendrecv was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV_REPLACE, "The number of times MPI_Sendrecv_replace was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_PUT, "The number of times MPI_Put was called.", false, false),
+ SET_COUNTER_ARRAY(OMPI_SPC_PUT_NOTIFY, "The number of times MPI_Put_notify was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_RPUT, "The number of times MPI_Rput was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_GET, "The number of times MPI_Get was called.", false, false),
+ SET_COUNTER_ARRAY(OMPI_SPC_GET_NOTIFY, "The number of times MPI_Get_notify was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_RGET, "The number of times MPI_Rget was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_PROBE, "The number of times MPI_Probe was called.", false, false),
SET_COUNTER_ARRAY(OMPI_SPC_IPROBE, "The number of times MPI_Iprobe was called.", false, false),
diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h
index 76ec7f25f16..3d0efd257b3 100644
--- a/ompi/runtime/ompi_spc.h
+++ b/ompi/runtime/ompi_spc.h
@@ -58,8 +58,10 @@ typedef enum ompi_spc_counters {
OMPI_SPC_SENDRECV,
OMPI_SPC_SENDRECV_REPLACE,
OMPI_SPC_PUT,
+ OMPI_SPC_PUT_NOTIFY,
OMPI_SPC_RPUT,
OMPI_SPC_GET,
+ OMPI_SPC_GET_NOTIFY,
OMPI_SPC_RGET,
OMPI_SPC_PROBE,
OMPI_SPC_IPROBE,
diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am
index c0cdf788e8d..9aed69bfb7f 100644
--- a/opal/mca/btl/smcuda/Makefile.am
+++ b/opal/mca/btl/smcuda/Makefile.am
@@ -46,15 +46,11 @@ component_noinst = libmca_btl_smcuda.la
component_install =
endif
-# See opal/mca/common/cuda/Makefile.am for an explanation of
-# libmca_common_sm.la.
-
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
mca_btl_smcuda_la_LDFLAGS = -module -avoid-version $(btl_smcuda_LDFLAGS)
mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
- $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \
$(btl_smcuda_LIBS)
mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c
index 1ce2b966ece..e832c8ed81e 100644
--- a/opal/mca/btl/smcuda/btl_smcuda.c
+++ b/opal/mca/btl/smcuda/btl_smcuda.c
@@ -235,7 +235,6 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
free(loc);
} else {
/* If we have hwloc support, then get accurate information */
- loc = NULL;
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
rc = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_AVAILABLE);
@@ -249,6 +248,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
mca_btl_smcuda_component.num_mem_nodes = rc;
}
}
+ loc = NULL;
/* see if we were given our location */
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &OPAL_PROC_MY_NAME, &loc, PMIX_STRING);
if (OPAL_SUCCESS == rc) {
@@ -267,6 +267,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
free(mynuma);
}
free(loc);
+ loc = NULL;
}
} else {
/* If we have hwloc support, then get accurate information */
diff --git a/opal/mca/btl/smcuda/configure.m4 b/opal/mca/btl/smcuda/configure.m4
new file mode 100644
index 00000000000..e9cb2df2996
--- /dev/null
+++ b/opal/mca/btl/smcuda/configure.m4
@@ -0,0 +1,29 @@
+# Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# If any accelerators have been discovered, then build support for the
+# accelerator BTL. This assumes the discovery has already been done.
+#
+# Beware: unlike what the name seems to indicate, this BTL is generic and used by
+# all accelerators.
+
+AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[
+ AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile])
+
+ # This component shall be configured only after the accelerator discovery
+ # has been completed. This discovery is part of the OPAL accelerator framework.
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)])
+ AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+
+])dnl
diff --git a/opal/mca/rcache/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4
new file mode 100644
index 00000000000..d721910500e
--- /dev/null
+++ b/opal/mca/rcache/gpusm/configure.m4
@@ -0,0 +1,27 @@
+# Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# If any accelerators have been discovered, then build support for the
+# accelerator rcache component.
+#
+AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[
+
+ AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile])
+
+ # This component shall be configured only after the accelerator discovery
+ # has been completed. This discovery is part of the OPAL accelerator framework.
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)])
+ AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+
+])dnl
diff --git a/opal/mca/rcache/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4
new file mode 100644
index 00000000000..f5e3eda0154
--- /dev/null
+++ b/opal/mca/rcache/rgpusm/configure.m4
@@ -0,0 +1,27 @@
+# Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# If any accelerators have been discovered, then build support for the
+# accelerator rcache component.
+#
+AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[
+
+ AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile])
+
+ # This component shall be configured only after the accelerator discovery
+ # has been completed. This discovery is part of the OPAL accelerator framework.
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)])
+ AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+
+])dnl
diff --git a/opal/mca/smsc/accelerator/configure.m4 b/opal/mca/smsc/accelerator/configure.m4
new file mode 100644
index 00000000000..9fa993e9cf5
--- /dev/null
+++ b/opal/mca/smsc/accelerator/configure.m4
@@ -0,0 +1,27 @@
+# Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# If any accelerators have been discovered, then build support for the
+# accelerator SMSC component.
+#
+AC_DEFUN([MCA_opal_smsc_accelerator_CONFIG],[
+
+ AC_CONFIG_FILES([opal/mca/smsc/accelerator/Makefile])
+
+ # This component shall be configured only after the accelerator discovery
+ # has been completed. This discovery is part of the OPAL accelerator framework.
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)])
+ AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"],
+ [AC_MSG_RESULT([yes])
+ $1],
+ [AC_MSG_RESULT([no])
+ $2])
+
+])dnl
diff --git a/oshmem/mca/memheap/base/memheap_base_frame.c b/oshmem/mca/memheap/base/memheap_base_frame.c
index 53a71b27a9e..82658e09791 100644
--- a/oshmem/mca/memheap/base/memheap_base_frame.c
+++ b/oshmem/mca/memheap/base/memheap_base_frame.c
@@ -33,9 +33,9 @@
int mca_memheap_base_output = -1;
int mca_memheap_base_key_exchange = 1;
-opal_list_t mca_memheap_base_components_opened = {{0}};
+opal_list_t mca_memheap_base_components_opened = {};
int mca_memheap_base_already_opened = 0;
-mca_memheap_map_t mca_memheap_base_map = {{{{0}}}};
+mca_memheap_map_t mca_memheap_base_map = {};
int mca_memheap_num_segments_warn = 32;
static int mca_memheap_base_register(mca_base_register_flag_t flags)
diff --git a/oshmem/shmem/c/shmem_put_nb.c b/oshmem/shmem/c/shmem_put_nb.c
index 89e4bf18240..cef6abcc40b 100644
--- a/oshmem/shmem/c/shmem_put_nb.c
+++ b/oshmem/shmem/c/shmem_put_nb.c
@@ -11,6 +11,7 @@
#include "oshmem/constants.h"
#include "oshmem/include/shmem.h"
+#include "oshmem/include/shmemx.h"
#include "oshmem/runtime/runtime.h"