diff --git a/.ci/community-jenkins/Jenkinsfile b/.ci/community-jenkins/Jenkinsfile index f4b305f1d66..2c20d630ac1 100644 --- a/.ci/community-jenkins/Jenkinsfile +++ b/.ci/community-jenkins/Jenkinsfile @@ -14,7 +14,6 @@ // // // WORKSPACE Layout: -// autotools-install/ Autotools install for the builder // ompi/ Open MPI source tree // We if we push changes to a PR, we don't need to keep old jobs running, so @@ -56,9 +55,26 @@ println('Tests Completed') // although currently we only support the one stage of "everything", where each // build stage is a map of different configurations to test. def prepare_check_stages() { - def configure_options = ["--disable-dlopen", "--disable-oshmem", "--enable-builtin-atomic", "--enable-ipv6"] - def compilers = ["clang10", "gcc7", "gcc8", "gcc9", "gcc10"] - def platforms = ["amazon_linux_2", "amazon_linux_2-arm64", "rhel8"] + def configure_options = [ + "--disable-dlopen", + "--disable-oshmem", + "--enable-builtin-atomic", + "--enable-ipv6" + ] + def compilers = [ + "gcc14", + "clang18" + ] + def platforms = [ + "amazon_linux_2", + "amazon_linux_2-arm64", + "rhel8", + "amazon_linux_2023-arm64", + "amazon_linux_2023-x86_64", + "ubuntu_20.04", + "ubuntu_24.04-arm64", + "ubuntu_24.04-x86_64" + ] def check_stages_list = [] // Build everything stage @@ -79,6 +95,7 @@ def prepare_check_stages() { } build_parallel_map.put("distcheck", prepare_build("distcheck", "tarball_build", "--distcheck")) + build_parallel_map.put("vpath", prepare_build("vpath", "", "--build-dir ompi-build")) check_stages_list.add(build_parallel_map) @@ -89,14 +106,20 @@ def prepare_build(build_name, label, build_arg) { return { stage("${build_name}") { node(label) { - checkout(changelog: false, poll: false, scm: scm) + // Checkout into ompi-source instead of the top of the + // workspace, so that we have room in the workspace to setup a + // vpath build. 
+ dir ('ompi-source') { + checkout(changelog: false, poll: false, scm: scm) + } + // If pr-builder.sh fails, the sh step will throw an exception, // which we catch so that the job doesn't abort and continues on // to other steps - such as cleanup. Because we catch the // exception, we need to tell Jenkins the overall job has // failed. try { - sh "/bin/bash -x .ci/community-jenkins/pr-builder.sh ${build_arg} ompi" + sh "/bin/bash -x ompi-source/.ci/community-jenkins/pr-builder.sh ${build_arg} --source-dir ompi-source" } catch (Exception e) { currentBuild.result = "FAILURE" } diff --git a/.ci/community-jenkins/pr-builder.sh b/.ci/community-jenkins/pr-builder.sh index eb88b4c1538..88426859bf0 100755 --- a/.ci/community-jenkins/pr-builder.sh +++ b/.ci/community-jenkins/pr-builder.sh @@ -21,6 +21,8 @@ MAKE_ARGS= MAKE_J="-j 8" PREFIX="${WORKSPACE}/install" MPIRUN_MODE=${MPIRUN_MODE:-runall} +SOURCE_DIR= +BUILD_DIR= # # Options Parsing @@ -77,6 +79,24 @@ while (( "$#" )); do exit 1 fi ;; + --source-dir) + if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then + SOURCE_DIR=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; + --build-dir) + if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then + BUILD_DIR=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; -*|--*=) # Unsupported flags echo "Error: Unsupported flag $1" >&2 exit 1 @@ -105,93 +125,43 @@ fi echo "--> platform: $PLATFORM_ID" echo "--> version: $VERSION_ID" +if test "${SOURCE_DIR}" = "" ; then + echo "SOURCE_DIR is unset. Cannot continue." + exit 1 +fi + +echo "--> Workspace: ${WORKSPACE}" +echo "--> Source Dir: ${SOURCE_DIR}" +echo "--> Build Dir: ${BUILD_DIR}" +echo "--> Install Dir: ${PREFIX}" + # # See if builder provided a compiler we should use, and translate it to # CONFIGURE_ARGS. # -case ${PLATFORM_ID} in - rhel) - case "$COMPILER" in - gcc48|"") - echo "--> Using default compilers" - ;; - *) - echo "Unsupported compiler ${COMPILER}. 
Aborting" - exit 1 - ;; - esac - ;; - amzn) - case "$COMPILER" in - "") - echo "--> Using default compilers" - ;; - gcc44) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc44 CXX=g++44 FC=gfortran44" - ;; - gcc48) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc48 CXX=g++48 FC=gfortran48" - ;; - clang36) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=clang CXX=clang++ --disable-mpi-fortran" - ;; - *) - echo "Unsupported compiler ${COMPILER}. Aborting" - exit 1 - ;; - esac - ;; - ubuntu) - case "$COMPILER" in - "") - echo "--> Using default compilers" - ;; - gcc4*) - version=`echo "$COMPILER" | sed -e 's/gcc4\([0-9]*\)/4.\1/'` - CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}" - ;; - gcc*) - version=`echo "$COMPILER" | sed -e 's/gcc\([0-9]*\)/\1/'` - CONFIGURE_ARGS="CC=gcc-${version} CXX=g++-${version} FC=gfortran-${version}" - ;; - clang3*|clang4*|clang5*|clang6*) - version=`echo "$COMPILER" | sed -e 's/clang\([0-9]\)\([0-9]*\)/\1.\2/'` - CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran" - ;; +if test "${COMPILER}" != "" ; then + if test ! -r ${HOME}/ompi-compiler-setup.sh ; then + echo "Could not find compiler setup script ompi-compiler-setup.sh. Aborting." + exit 1 + fi + + . ${HOME}/ompi-compiler-setup.sh + activate_compiler ${COMPILER} + + CONFIGURE_ARGS="${CONFIGURE_ARGS} CC=${CC} CPP=${CPP} CXX=${CXX} FC=${FC}" + if test "$FC" = "" ; then + CONFIGURE_ARGS="${CONFIGURE_ARGS} --disable-mpi-fortran" + else + # Flang doesn't seem good enough (yet) to compile our Fortran bindings, + # so skip for now. + case "${COMPILER}" in clang*) - version=`echo "$COMPILER" | sed -e 's/clang\([0-9]*\)/\1/'` - CONFIGURE_ARGS="CC=clang-${version} CXX=clang++-${version} --disable-mpi-fortran" - ;; - *) - echo "Unsupported compiler ${COMPILER}. 
Aborting" - exit 1 + CONFIGURE_ARGS="${CONFIGURE_ARGS} --disable-mpi-fortran" ;; esac - ;; - sles) - case "$COMPILER" in - "") - echo "--> Using default compilers" - ;; - gcc48) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-48 CXX=g++-48 FC=gfortran-48" - ;; - gcc5) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-5 CXX=g++-5 FC=gfortran-5" - ;; - gcc6) - CONFIGURE_ARGS="$CONFIGURE_ARGS CC=gcc-6 CXX=g++-6 FC=gfortran-6" - ;; - *) - echo "Unsupported compiler ${COMPILER}. Aborting" - exit 1 - ;; - esac - ;; - FreeBSD) - CONFIGURE_ARGS="$CONFIGURE_ARGS LDFLAGS=-Wl,-rpath,/usr/local/lib/gcc5 --with-wrapper-ldflags=-Wl,-rpath,/usr/local/lib/gcc5" - ;; -esac + fi +fi + CONFIGURE_ARGS="$CONFIGURE_ARGS --disable-silent-rules" echo "--> Compiler setup: $CONFIGURE_ARGS" @@ -210,10 +180,20 @@ fi echo "--> Autogen arguments: $AUTOGEN_ARGS" echo "--> Configure arguments: $CONFIGURE_ARGS" +cd "${WORKSPACE}/${SOURCE_DIR}" + # Build sha1=`git rev-parse HEAD` echo "--> Building commit ${sha1}" +if test -f "${HOME}/ompi-setup-python.sh" ; then + echo "--> Initializing Python environment" + . ${HOME}/ompi-setup-python.sh + find . -name "requirements.txt" -exec ${PIP_CMD} install -r {} \; +else + echo "--> No Python environment found, hoping for the best." +fi + if test -f autogen.pl; then echo "--> running ./autogen.pl ${AUTOGEN_ARGS}" ./autogen.pl ${AUTOGEN_ARGS} @@ -227,9 +207,20 @@ else fi fi -echo "--> running ./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}" -if ! ./configure --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then - echo "./configure --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !" +if test "${BUILD_DIR}" != "" ; then + cd "${WORKSPACE}" + rm -rf "${BUILD_DIR}" + mkdir "${BUILD_DIR}" + cd "${WORKSPACE}/${BUILD_DIR}" + CONFIGURE=../${SOURCE_DIR}/configure +else + # already in ${WORKSPACE}/${SOURCE_DIR} + CONFIGURE=./configure +fi + +echo "--> running ${CONFIGURE} --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS}" +if ! 
${CONFIGURE} --prefix="${PREFIX}" ${CONFIGURE_ARGS}; then + echo "${CONFIGURE} --prefix=\"${PREFIX}\" ${CONFIGURE_ARGS} failed, ABORTING !" if test -f config.log; then echo "config.log content :" cat config.log @@ -268,7 +259,7 @@ echo "--> running ompi_info" ompi_info echo "--> running make all in examples" -cd "examples" +cd "${WORKSPACE}/${SOURCE_DIR}/examples" make ${MAKE_ARGS} all cd .. diff --git a/.github/workflows/ompi_mpi4py_asan.yaml b/.github/workflows/ompi_mpi4py_asan.yaml new file mode 100644 index 00000000000..240e3d2f101 --- /dev/null +++ b/.github/workflows/ompi_mpi4py_asan.yaml @@ -0,0 +1,148 @@ +name: mpi4py (ASAN) + +on: + pull_request: + workflow_dispatch: + inputs: + repository: + description: 'mpi4py repository' + default: 'mpi4py/mpi4py' + required: false + type: string + ref: + description: 'mpi4py branch/tag/SHA' + default: 'master' + required: false + type: string + +permissions: + contents: read + +jobs: + test: + # We need Ubuntu 24.04 (over 22.04) due to a kernel bug, + # see https://github.com/google/sanitizers/issues/856. + runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + MPI4PY_TEST_SPAWN: true + # disable ASAN while building + ASAN_OPTIONS: verify_asan_link_order=0,detect_odr_violation=0,abort_on_error=0 + # disable leak detection + LSAN_OPTIONS: detect_leaks=0,exitcode=0 + + steps: + - name: Configure hostname + run: echo 127.0.0.1 `hostname` | sudo tee -a /etc/hosts > /dev/null + if: ${{ runner.os == 'Linux' || runner.os == 'macOS' }} + + - name: Install dependencies + run: sudo apt-get install -y -q + libnuma-dev libasan8 + if: ${{ runner.os == 'Linux' }} + + - name: Checkout Open MPI + uses: actions/checkout@v4 + with: + path: mpi-build + submodules: recursive + + - name: Bootstrap Open MPI + run: ./autogen.pl + working-directory: mpi-build + + # Install into a separate directory (/opt/openmpi) so that we can + # bundle up that tree into an artifact to share with other jobs in + # this github action. 
Specifically don't use /usr/local, because + # there's a bunch of other stuff already installed in /usr/local, + # and we don't need to include that in our artifact. + - name: Configure Open MPI + run: ./configure + --enable-debug + --disable-dependency-tracking + --disable-sphinx + --disable-mpi-fortran + --disable-oshmem + --disable-silent-rules + --prefix=/opt/openmpi + CFLAGS="-O1 -fno-omit-frame-pointer -g -fsanitize=address" + LDFLAGS="-Wl,-rpath,/opt/openmpi/lib -fsanitize=address" + working-directory: mpi-build + + - name: Build MPI + run: make -j $(nproc) + working-directory: mpi-build + + - name: Install MPI + run: sudo make install + working-directory: mpi-build + + - name: Add Open MPI to PATH + run: echo /opt/openmpi/bin >> $GITHUB_PATH + + - name: Tweak MPI + run: | + # Tweak MPI + mca_params="$HOME/.openmpi/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo mpi_param_check = true >> "$mca_params" + echo mpi_show_handle_leaks = true >> "$mca_params" + mca_params="$HOME/.prte/mca-params.conf" + mkdir -p "$(dirname "$mca_params")" + echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params" + + - name: Use Python + uses: actions/setup-python@v5 + with: + python-version: 3 + architecture: x64 + + - name: Install Python packages (build) + run: python -m pip install --upgrade + setuptools pip wheel + + - name: Install Python packages (test) + run: python -m pip install --upgrade + numpy cffi pyyaml + + - name: Checkout mpi4py + uses: actions/checkout@v4 + with: + repository: ${{ inputs.repository || 'mpi4py/mpi4py' }} + ref: ${{ inputs.ref }} + + - name: Setting up ASAN environment + # LD_PRELOAD is needed to make sure ASAN is the first thing loaded + # as it will otherwise complain. + # Leak detection is currently disabled because of the size of the report. + # The patcher is disabled because ASAN fails if code mmaps data at fixed + # memory addresses, see https://github.com/open-mpi/ompi/issues/12819. 
+ # ODR violation detection is disabled until #13469 is fixed. + run: | + echo LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.8 >> $GITHUB_ENV + echo ASAN_OPTIONS=detect_odr_violation=0,abort_on_error=1,detect_stack_use_after_return=1 >> $GITHUB_ENV + echo LSAN_OPTIONS=detect_leaks=0,exitcode=0 >> $GITHUB_ENV + echo OMPI_MCA_memory=^patcher >> $GITHUB_ENV + + - name: Show MPI + run: ompi_info --all --all + + - name: Install mpi4py + run: python -m pip install . + env: + CFLAGS: "-O0" + + - name: Test mpi4py (singleton) + run: python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=1) + run: mpiexec -n 1 python test/main.py -v -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 + + - name: Test mpi4py (np=4) + run: mpiexec -n 4 python test/main.py -v -f -x TestExcErrhandlerNull + if: ${{ true }} + timeout-minutes: 10 diff --git a/.gitignore b/.gitignore index 7ab0b99af7d..b30321da7ca 100644 --- a/.gitignore +++ b/.gitignore @@ -542,3 +542,5 @@ ompi/mpi/fortran/use-mpi-f08/base/*_generated.c ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces-generated.h ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces-generated.h ompi/mpi/fortran/use-mpi-ignore-tkr/*_generated.F90 + +.vscode/ diff --git a/3rd-party/openpmix b/3rd-party/openpmix index 7704efaf865..53fce423d5d 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit 7704efaf865328234e3cb1f77ff393adc971c9fe +Subproject commit 53fce423d5d6b25798ed1f32837671dc55d0230d diff --git a/3rd-party/prrte b/3rd-party/prrte index 91544b8d2c5..2d9b0aaaeea 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 91544b8d2c5ac84585022d0edad68e38f375a917 +Subproject commit 2d9b0aaaeea49a0e7850aed95e5ace9340c7d847 diff --git a/config/ompi_fortran_check.m4 b/config/ompi_fortran_check.m4 index e479a87ac64..7fd2a790353 100644 --- a/config/ompi_fortran_check.m4 +++ b/config/ompi_fortran_check.m4 @@ 
-137,8 +137,8 @@ AC_DEFUN([OMPI_FORTRAN_CHECK], [ long*double*_Complex) ofc_type_kind=C_LONG_DOUBLE_COMPLEX ;; opal_short_float_t) ofc_type_kind=C_SHORT_FLOAT ;; opal_short_float_complex_t) ofc_type_kind=C_SHORT_FLOAT_COMPLEX ;; - _Float128) ofc_type_kind=C__FLOAT128 ;; - __float128) ofc_type_kind=C___FLOAT128 ;; + _Float128) ofc_type_kind=C_FLOAT128 ;; + __float128) ofc_type_kind=C_FLOAT128 ;; *) # Skip types like "DOUBLE PRECISION" ;; diff --git a/config/ompi_fortran_check_real16_c_equiv.m4 b/config/ompi_fortran_check_real16_c_equiv.m4 index 85141c798b6..b9e67d9606e 100644 --- a/config/ompi_fortran_check_real16_c_equiv.m4 +++ b/config/ompi_fortran_check_real16_c_equiv.m4 @@ -61,19 +61,26 @@ AC_DEFUN([OMPI_FORTRAN_CHECK_REAL16_C_EQUIV],[ AC_MSG_RESULT([works!])], [AC_MSG_RESULT([does not work])]) ]) - # As recent Intel compilers identify as GNU we will always test for Quad support if no other tests were succesfull + # As recent Intel compilers identify as GNU we will always test for Quad + # support if no other tests were successful AS_IF([test "$fortran_real16_happy" = "no"], - [AC_CHECK_TYPES(_Quad) - AS_IF([test "$ac_cv_type__Quad" = "yes"], - [AC_MSG_CHECKING([if the compiler _Quad == REAL*16]) - CFLAGS_save="$CFLAGS" + [AC_CHECK_TYPES([_Quad]) + AS_IF([test "$ac_cv_type__Quad" != "yes"], + [CFLAGS_save="$CFLAGS" OPAL_FLAGS_APPEND_UNIQ([CFLAGS], ["-Qoption,cpp,--extended_float_types"]) + # force the check as we have updated CFLAGS + unset ac_cv_type__Quad + AC_CHECK_TYPES([_Quad]) + AS_IF([test "$ac_cv_type__Quad" != "yes"], + [CFLAGS="$CFLAGS_save"]) + ]) + AS_IF([test "$ac_cv_type__Quad" = "yes"], + [AC_MSG_CHECKING([if the compiler _Quad == REAL*16]) OMPI_FORTRAN_CHECK_REAL16_EQUIV_TYPE([_Quad], [q]) AS_IF([test "$fortran_real16_happy" = "yes"], [OMPI_FORTRAN_REAL16_C_TYPE="_Quad" AC_MSG_RESULT([works!])], - [CFLAGS="$CFLAGS_save" - AC_MSG_RESULT([does not work])]) + [AC_MSG_RESULT([does not work])]) ]) ]) # We have to [re-]print a new message here, 
because diff --git a/config/ompi_setup_mpi_fortran.m4 b/config/ompi_setup_mpi_fortran.m4 index 3474276e661..c396a2efab6 100644 --- a/config/ompi_setup_mpi_fortran.m4 +++ b/config/ompi_setup_mpi_fortran.m4 @@ -226,7 +226,7 @@ AC_DEFUN([OMPI_SETUP_MPI_FORTRAN],[ [long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t], [16], [no]) OMPI_FORTRAN_CHECK([COMPLEX*32], [no], - [_Float128 _Complex, long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t], + [_Float128 _Complex, __float128 _Complex, long double _Complex, double _Complex, float _Complex, short float _Complex, opal_short_float_complex_t], [32], [no]) # Double precision complex types are not standard, but many # compilers support it. Code should be wrapped with #ifdef diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index a6bf80a1b2a..ed3a51a26e8 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -154,6 +154,7 @@ AC_MSG_CHECKING([if have cuda support]) if test "$opal_check_cuda_happy" = "yes"; then AC_MSG_RESULT([yes (-I$opal_cuda_incdir)]) CUDA_SUPPORT=1 + OMPI_HAVE_ACCELERATOR_SUPPORT=1 common_cuda_CPPFLAGS="-I$opal_cuda_incdir" AC_SUBST([common_cuda_CPPFLAGS]) else diff --git a/config/opal_check_rocm.m4 b/config/opal_check_rocm.m4 index 25ac54e438e..0d1e6053469 100644 --- a/config/opal_check_rocm.m4 +++ b/config/opal_check_rocm.m4 @@ -57,7 +57,8 @@ AC_DEFUN([OPAL_CHECK_ROCM],[ AS_IF([ test "$opal_check_rocm_happy" = "yes" ], [ OPAL_APPEND([$1_CPPFLAGS], [$rocm_CPPFLAGS]) AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support]) - ROCM_SUPPORT=1 ], + ROCM_SUPPORT=1 + OMPI_HAVE_ACCELERATOR_SUPPORT=1 ], [ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [0], [Disable ROCm support]) ROCM_SUPPORT=0 ]) diff --git a/config/opal_check_ze.m4 b/config/opal_check_ze.m4 index d1d47bb67c1..84c8dacd2df 100644 --- a/config/opal_check_ze.m4 +++ b/config/opal_check_ze.m4 @@ -56,7 +56,8 @@ 
AC_DEFUN([OPAL_CHECK_ZE],[ AS_IF([ test "$opal_check_ze_happy" = "yes" ], [ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [1], [Enable Intel ZE support]) - ZE_SUPPORT=1 ], + ZE_SUPPORT=1 + OMPI_HAVE_ACCELERATOR_SUPPORT=1 ], [ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [0], [Disable Intel ZE support]) ZE_SUPPORT=0 ]) diff --git a/config/opal_mca.m4 b/config/opal_mca.m4 index cdeb935a3a3..bb51d3bc5f1 100644 --- a/config/opal_mca.m4 +++ b/config/opal_mca.m4 @@ -186,7 +186,7 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1]) else msg= if test -z "$enable_mca_dso"; then - enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,btl-smcuda,rcache-gpusm,rcache-rgpusm" + enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze" msg="(default)" fi DSO_all=0 diff --git a/configure.ac b/configure.ac index 928f41b0415..d4276b23284 100644 --- a/configure.ac +++ b/configure.ac @@ -276,6 +276,7 @@ m4_ifdef([project_oshmem], ############################################################################ # Configuration options ############################################################################ +OMPI_HAVE_ACCELERATOR_SUPPORT=0 OPAL_CONFIGURE_OPTIONS diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf index 6a7be025a66..b1316c4b67d 100644 --- a/contrib/platform/mellanox/optimized.conf +++ b/contrib/platform/mellanox/optimized.conf @@ -85,8 +85,6 @@ opal_warn_on_missing_libcuda = 0 bml_r2_show_unreach_errors = 0 # alltoall algorithm selection settings for tuned coll mca -coll_tuned_alltoall_large_msg = 250000 -coll_tuned_alltoall_min_procs = 2048 coll_tuned_alltoall_algorithm_max_requests = 8 coll_tuned_scatter_intermediate_msg = 8192 coll_tuned_scatter_large_msg = 250000 diff --git a/docs/Makefile.am b/docs/Makefile.am index 871184eb01d..a6edc6ae045 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -38,7 +38,8 @@ TEXT_SOURCE_FILES = \ $(srcdir)/license/*.txt IMAGE_SOURCE_FILES = \ 
$(srcdir)/openmpi_logo.png \ - $(srcdir)/installing-open-mpi/required-support-libraries-dependency-graph.png + $(srcdir)/installing-open-mpi/required-support-libraries-dependency-graph.png \ + $(srcdir)/tuning-apps/collectives/images/xhc-hierarchy.svg RST_SOURCE_FILES = \ $(srcdir)/*.rst \ $(srcdir)/release-notes/*.rst \ diff --git a/docs/tuning-apps/collectives/components.rst b/docs/tuning-apps/collectives/components.rst index f29c202e358..921f7e12036 100644 --- a/docs/tuning-apps/collectives/components.rst +++ b/docs/tuning-apps/collectives/components.rst @@ -28,7 +28,9 @@ The following provides a list of components and their primary target scenario: more details. - ``ucc``: component using the `UCC library `_ for collective operations. - - ``xhc``: shared memory collective component using XPMEM for data transfers. + - ``xhc``: shared memory collective component, employing hierarchical & + topology-aware algorithms, with XPMEM for data transfers. See :doc:`xhc` for + more details. - ``acoll``: collective component tuned for AMD Zen architectures. See :doc:`acoll` for more details. 
- ``accelerator``: component providing host-proxy algorithms for some diff --git a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg b/docs/tuning-apps/collectives/images/xhc-hierarchy.svg similarity index 86% rename from ompi/mca/coll/xhc/resources/xhc-hierarchy.svg rename to docs/tuning-apps/collectives/images/xhc-hierarchy.svg index c8f6d8a2da3..b4ae62a6c4f 100644 --- a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg +++ b/docs/tuning-apps/collectives/images/xhc-hierarchy.svg @@ -7,7 +7,7 @@ viewBox="0 0 169.571 119.89402" version="1.1" id="svg5" - inkscape:version="1.2.1 (9c6d41e410, 2022-07-14, custom)" + inkscape:version="1.4.3 (0d15f75042, 2025-12-25)" sodipodi:docname="xhc-hierarchy.svg" inkscape:export-filename="../xhc-hierarchy.png" inkscape:export-xdpi="300" @@ -26,11 +26,11 @@ inkscape:pagecheckerboard="0" inkscape:document-units="mm" showgrid="false" - inkscape:zoom="0.75290071" - inkscape:cx="286.22632" - inkscape:cy="274.93665" + inkscape:zoom="1.4452058" + inkscape:cx="278.16108" + inkscape:cy="266.39805" inkscape:window-width="1920" - inkscape:window-height="1018" + inkscape:window-height="1136" inkscape:window-x="1920" inkscape:window-y="0" inkscape:window-maximized="1" @@ -78,25 +78,6 @@ id="path-effect556" is_visible="true" lpeversion="1" /> - - - + transform="translate(-430.99854,-193.98109)"> + y="193.98109" /> NUMA Level + y="296.00598">NUMA Level Socket Level + y="259.80359">Socket Level + transform="translate(28.708569,27.920669)"> System Level + style="font-size:5.64444px;text-align:center;text-anchor:middle;stroke-width:0.264583" + x="524.14557" + y="204.60033">Node Level + inkscape:original-d="m 561.29231,236.42783 c -10.38789,-6.52565 -20.67275,-12.94489 -31.00982,-19.41762" + transform="translate(0,-1.0583333)" /> + y="241.07695" /> + y="241.07695" /> + transform="translate(76.684113,23.158255)"> + y="240.44742" /> + y="240.44742" /> + transform="translate(118.68254,23.158255)"> + transform="translate(150.43255,23.158255)"> + 
inkscape:original-d="m 487.56018,236.95873 c 10.17386,-6.63057 20.2468,-13.15301 30.3709,-19.72977" + transform="translate(0,-1.0583333)" /> Cores + style="font-size:4.93889px;stroke-width:0.264583" + x="-163.80605" + y="497.17615">Cores @@ -768,23 +751,23 @@ + transform="matrix(-1,0,0,1,924.11737,0.52916667)"> NUMA 0Leader + transform="translate(44.916471,23.158255)"> P0 + y="279.12918">P0 P1 + y="279.12741">P1 P2 + y="279.12741">P2 P3 + y="279.12741">P3 P4 + y="279.12918">P4 P5 + y="279.12741">P5 P6 + y="279.12741">P6 P7 + y="279.12741">P7 P8 + y="279.12921">P8 P9 + y="279.12744">P9 10 + y="279.12744">10 11 + y="279.12744">11 12 + y="279.12921">12 13 + y="279.12744">13 14 + y="279.12744">14 15 + y="279.12744">15 + transform="matrix(-1,0,0,1,944.89717,-1.0583333)"> + transform="translate(84.666671)"> + id="g8800"> + transform="translate(-83.60834,-0.52916667)"> + transform="rotate(180,501.4769,222.70799)"> + transform="matrix(1,0,0,-1,45.394312,445.41596)"> + transform="matrix(-1,0,0,1,955.86739,0.52916667)"> NUMA 1Leader + transform="matrix(-1,0,0,1,1029.9509,0.52916667)"> NUMA 3Leader [...] + +Main Features +------------- + +Hierarchy +~~~~~~~~~ + +XHC constructs an *n*-level hierarchy (i.e. no limitation on number of levels), +based on intra-node topological features. Rank/process locality information +originates from Hwloc, and is obtained through Open MPI's internal structures. + +The following topological features can currently be defined: + + * NUMA node + * CPU Socket + * L1/L2/L3 cache + * Hwthread/core + * Node (all ranks *are* in same node -> flat hierarchy) + +An example of a 3-level XHC hierarchy (``numa,socket`` configuration): + +.. image:: images/xhc-hierarchy.svg + :width: 450px + +Furthermore, support for virtual/user-defined hierarchies is available, to +allow for even finer control and custom experiments. 
+ +**Pipelining** is seamlessly applied across all levels of the hierarchy, to +minimize hierarchy-induced overheads, and to allow for interleaving of +operations in certain collectives (e.g. reduce+bcast in allreduce). + +Single-copy data transfers +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +XHC supports data transfers between MPI ranks using a single copy, through Open +MPI's ``opal/smsc`` (shared-memory-single-copy) framework. Despite the +component's name, XHC actually also supports additional single-copy mechanisms +in some collectives, though XPMEM is highly recommended. + + * Bcast: XPMEM, CMA, KNEM + * Allreduce/Reduce: XPMEM + * Barrier: *(irrelevant)* + +In XPMEM mode, application buffers are attached on the fly the first time they +appear, and are saved in ``smsc/xpmem``'s internal registration cache for +future uses. + +Shared-memory data transfers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +XHC also supports data transfers using copy-in-copy-out (CICO) over shared +memory. Copy-in-copy-out is always used for small messages, with automatic +switching to single-copy for large ones. All primitives support this mode, +regardless of XPMEM or SMSC presence, as long as the size of the message is +below the threshold. + +Inline data transfers +~~~~~~~~~~~~~~~~~~~~~ + +For especially small messages, the payload data is inlined in the same cache +line as the control data. This achieves exceptionally low latency in such +messages. Supported in all primitives, regardless of XPMEM or SMSC presence. + +Synchronization +~~~~~~~~~~~~~~~ + +XHC uses **lock-free** synchronization, using the single-writer paradigm and lightweight *read* or *write* memory barriers wherever appropriate. + +Multi-node with HAN +------------------- + +Even though ``xhc`` only works over shared memory, it may also be utilized in +multi-node environments, through ``coll/han``. 
HAN is already the default +component in multi-node runs, so all that's needed is to define ``xhc`` as the +component to be used for the intra-node phase: + +.. code-block:: sh + + $ mpirun --mca coll_han_bcast_low_module 2 --mca coll_han_reduce_low_module 2 \ + --mca coll_han_allreduce_low_module 2 + +.. _mca-params: + +MCA Parameters +-------------- + +Basic +~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 10 70 + + * - Parameter + - Default + - Description + + * - coll_xhc_priority + - 0 + - The priority of the component. Set it to a value higher than other + components to enable xhc. + +Main +~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Default + - Description + + * - coll_xhc_hierarchy + - *unset* + - A comma separated list of topological features to which XHC's hierarchy + should be sensitive. This is a hint -- xhc will automatically: disregard + features that don't exist in the system, or that don't further segment + the ranks (e.g. ``numa`` was specified, but all ranks are in the same + NUMA node); re-order the list to match the system's hierarchy; add an + extra top level that's common to all ranks. This parameter applies to + all primitives, and is mutually exclusive with the primitive-specific + ones below. + + This parameter also supports the use of special modifiers for *virtual + hierarchies*. Check ``xhc_component_parse_hierarchy()`` for further + explanation and syntax. + + * - coll_xhc_chunk_size + - *unset* + - The chunk size for the pipelining. Data is processed in this-much sized + pieces at once. Applies to all primitives -- mutually exclusive with + primitive-specific parameters. + + * - coll_xhc_cico_max + - *unset* + - The max size up to which to use copy-in-copy-out. Single copy will be + used for messages above this size. Applies to all primitives -- mutually + exclusive with primitive-specific parameters. 
+ + * - coll_xhc__hierarchy + - bcast/barrier: ``numa,socket`` + (all)reduce: ``l3,numa,socket`` + - Topological features to consider for XHC's hierarchy, specifically for + this primitive. Mutually exclusive with the respective non-specific + parameter. + + * - coll_xhc__chunk_size + - 16K + - Pipeline chunk size, specifically for this primitive. Mutually exclusive + with the non-specific parameter. + + * - coll_xhc__cico_max + - bcast: ``256`` + (all)reduce: ``4K`` + - Max size for copy-in-copy-out transfers, specifically for this + primitive. Mutually exclusive with the non-specific parameter. + +Advanced +~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Default + - Description + + * - coll_xhc__root + - 0 + - Internal root rank, for either of these operations. + + * - coll_xhc_uniform_chunks + - true + - Whether to dynamically adjust (decrease) the chunk size in reduction + primitives, so that all ranks will perform equal work, depending on + the message size. + + * - coll_xhc_uniform_chunks_min + - 4K + - Minimum allowed value for the automatically decreased chunk size in + reduction primitives. + + * - coll_xhc_reduce_load_balance + - top,first + - Controls load balancing features in reduction primitives. With no such + features enabled, leader ranks don't perform any reduction work, on the + levels on which they are leaders. Add ``top`` to have the root perform + reductions on the top-most level of the hierarchy, as if a common rank. + Add ``first``, to have all leaders reduce a single chunk, at the + beginning of the operation as if they weren't leaders. Add ``all`` to + have leaders always perform reductions, even on the levels on which they + are leaders (not recommended). + + * - coll_xhc_dynamic_reduce + - non-float + - Controls support for out-of-order reduction (rank wise), which allows + temporarily skipping a peer that's not yet ready. 
The default value only + enables the feature for non-float types, to avoid reproducibility issues + with floats. Set to ``disabled`` or ``all`` to turn off or on, + respectively, for all types. + + * - coll_xhc_dynamic_leader + - false + - Dynamically elect the first rank from each hierarchy group to join the + collective as its leader, in broadcast. Introduces an atomic + compare-exchange per each call, when enabled. + +Other +~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Default + - Description + + * - coll_xhc_shmem_backing + - /dev/shm + - Backing directory for shmem files. + + * - coll_xhc_memcpy_chunk_size + - 256K + - Break up large memcpy calls to smaller ones, using this chunk size. + Will actually attempt to mirror the value of ``smsc/xpmem``'s respective + parameter at run-time. + +Debug +~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 25 15 60 + + * - Parameter + - Default + - Description + + * - coll_xhc_print_info + - *none* + - Print information about the component's configuration, and its + constructed hierarchies. Takes a comma delimited list of: the name of + the collective primitive about which to print information; ``config`` + to print the configuration; ``all`` to print everything; ``dot`` along + with the name of a collective primitive to print its hierarchy in DOT + format. + +Limitations +----------- + +* **Heterogeneity**: XHC does not support nodes with non-uniform + datatype representations across ranks (Open MPI's ``proc_arch``). + +* **Non-commutative** operators are not currently supported in + reduction collectives. + +* **Derived datatypes** are not yet supported. + +* The Reduce implementation only supports rank 0 as the root, and will + automatically fall back to another component in other scenarios. Work in + progress. + +Other resources +--------------- + +All things XHC landing page: https://github.com/CARV-ICS-FORTH/XHC-OpenMPI + +Publications +~~~~~~~~~~~~ + +.. 
**Publications** + +| **A framework for hierarchical single-copy MPI collectives on multicore nodes** +| *George Katevenis, Manolis Ploumidis, and Manolis Marazakis* +| Cluster 2022, Heidelberg, Germany +| https://ieeexplore.ieee.org/document/9912729 + +| **Impact of Cache Coherence on the Performance of Shared-Memory based MPI Primitives: A Case Study for Broadcast on Intel Xeon Scalable Processors** +| *George Katevenis, Manolis Ploumidis, and Manolis Marazakis* +| ICPP 2023, Salt Lake City, Utah, USA +| https://dl.acm.org/doi/10.1145/3605573.3605616 diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index be99de913ab..ddf1657b9ab 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -24,7 +24,7 @@ * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * Copyright (c) 2020-2025 Triad National Security, LLC. All rights + * Copyright (c) 2020-2026 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -1094,7 +1094,7 @@ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uin } if (val->type != PMIX_SIZE) { - OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch")); + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch - %s", PMIx_Value_string(val))); rc = OMPI_ERR_TYPE_MISMATCH; goto done; } diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index e06865b182f..1422695ea37 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -764,6 +764,7 @@ enum { #define MPI_ERR_SESSION 78 #define MPI_ERR_VALUE_TOO_LARGE 79 #define MPI_ERR_ERRHANDLER 80 +#define MPI_ERR_NOTIFY_IDX 81 /* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined MPI_ERR_ code. 
Set the last code to allow some room for adding @@ -1917,6 +1918,14 @@ OMPI_DECLSPEC int MPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_notify(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_notify_c(void *origin_addr, MPI_Count origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -2180,6 +2189,12 @@ OMPI_DECLSPEC int MPI_Put(const void *origin_addr, int origin_count, MPI_Dataty OMPI_DECLSPEC int MPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int MPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int MPI_Query_thread(int *provided); OMPI_DECLSPEC int MPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int 
target_count, @@ -2604,6 +2619,8 @@ OMPI_DECLSPEC int MPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandle OMPI_DECLSPEC int MPI_Win_get_group(MPI_Win win, MPI_Group *group); OMPI_DECLSPEC int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used); OMPI_DECLSPEC int MPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); +OMPI_DECLSPEC int MPI_Win_get_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); +OMPI_DECLSPEC int MPI_Win_reset_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); OMPI_DECLSPEC int MPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win); OMPI_DECLSPEC int MPI_Win_lock_all(int mpi_assert, MPI_Win win); OMPI_DECLSPEC int MPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win); @@ -3091,6 +3108,14 @@ OMPI_DECLSPEC int PMPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_notify(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_notify_c(void *origin_addr, MPI_Count origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -3354,6 +3379,12 @@ OMPI_DECLSPEC int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datat OMPI_DECLSPEC int PMPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype 
target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Put_notify(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); +OMPI_DECLSPEC int PMPI_Put_notify_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, MPI_Count target_count, + MPI_Datatype target_datatype, int notification_idx, MPI_Win win); OMPI_DECLSPEC int PMPI_Query_thread(int *provided); OMPI_DECLSPEC int PMPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, @@ -3778,6 +3809,8 @@ OMPI_DECLSPEC int PMPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandl OMPI_DECLSPEC int PMPI_Win_get_group(MPI_Win win, MPI_Group *group); OMPI_DECLSPEC int PMPI_Win_get_info(MPI_Win win, MPI_Info *info_used); OMPI_DECLSPEC int PMPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); +OMPI_DECLSPEC int PMPI_Win_get_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); +OMPI_DECLSPEC int PMPI_Win_reset_notify_value(MPI_Win win, int notification_idx, MPI_Count *value); OMPI_DECLSPEC int PMPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win); OMPI_DECLSPEC int PMPI_Win_lock_all(int mpi_assert, MPI_Win win); OMPI_DECLSPEC int PMPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win); diff --git a/ompi/include/mpif-values.py b/ompi/include/mpif-values.py index 53159d5d8dd..b74fbcbaf1f 100755 --- a/ompi/include/mpif-values.py +++ b/ompi/include/mpif-values.py @@ -301,6 +301,7 @@ 'MPI_ERR_SESSION': 78, 'MPI_ERR_VALUE_TOO_LARGE': 79, 'MPI_ERR_ERRHANDLER': 80, + 'MPI_ERR_NOTIFY_IDX': 81, 'MPI_ERR_LASTCODE': 92, 'MPI_IDENT': 0, 'MPI_CONGRUENT': 1, diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index bd686d2bab2..6d50d32ffb2 100644 --- a/ompi/instance/instance.c +++ 
b/ompi/instance/instance.c @@ -8,6 +8,7 @@ * reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2026 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -586,11 +587,16 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + } } } } else { @@ -602,12 +608,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); - if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence() failed", ret); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + return ompi_instance_print_error ("PMIx_Fence() failed", ret); + } + } else { + /* cannot just wait on thread as we need to call opal_progress */ + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } 
} @@ -748,7 +761,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) * we have to wait here for it to complete. However, there * is no reason to do two barriers! */ if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); + if (active) { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + } } else if (!ompi_async_mpi_init) { /* wait for everyone to reach this point - this is a hard * barrier requirement at this time, though we hope to relax @@ -757,12 +772,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret); + } + } else { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } diff --git a/ompi/mca/coll/accelerator/configure.m4 b/ompi/mca/coll/accelerator/configure.m4 new file mode 100644 index 00000000000..057db874435 --- /dev/null +++ b/ompi/mca/coll/accelerator/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator collective component. +# +AC_DEFUN([MCA_ompi_coll_accelerator_CONFIG],[ + + AC_CONFIG_FILES([ompi/mca/coll/accelerator/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed.
This discovery is part of the OPAL accelerator framework. + AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c index 69da3cb49cf..28fc3c62c6a 100644 --- a/ompi/mca/coll/acoll/coll_acoll_reduce.c +++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c @@ -66,7 +66,7 @@ static inline int coll_acoll_reduce_topo(const void *sbuf, void *rbuf, size_t co int use_socket = (0 == acoll_module->use_socket) ? 1 : acoll_module->use_socket; tmp_sbuf = (char *) sbuf; - if ((MPI_IN_PLACE == sbuf) && (rank == root)) { + if (MPI_IN_PLACE == sbuf) { tmp_sbuf = (char *) rbuf; } diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 15bd586901a..07616285616 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -48,7 +48,7 @@ int ompi_coll_adapt_ireduce_register(void) mca_coll_adapt_component.adapt_ireduce_algorithm = 1; } - mca_coll_adapt_component.adapt_ireduce_segment_size = 163740; + mca_coll_adapt_component.adapt_ireduce_segment_size = 524288; mca_base_component_var_register(c, "reduce_segment_size", "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 
0 bytes means no segmentation.", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, diff --git a/ompi/mca/coll/ftagree/coll_ftagree_component.c b/ompi/mca/coll/ftagree/coll_ftagree_component.c index 97e9ca4cee7..8a733ad3357 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_component.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_component.c @@ -38,6 +38,8 @@ int mca_coll_ftagree_era_rebuild = 0; double mca_coll_ftagree_debug_inject_proba = 0.0; #endif +static int mca_coll_ft_agreement; + /* * Local function */ @@ -92,8 +94,6 @@ ftagree_close(void) static int ftagree_register(void) { - int value; - /* Use a low priority, but allow other components to be lower */ mca_coll_ftagree_priority = 30; (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, @@ -103,15 +103,15 @@ ftagree_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_ftagree_priority); - if( ompi_ftmpi_enabled ) value = 1; - else value = 0; /* NOFT: do not initialize ERA */ + if( ompi_ftmpi_enabled ) mca_coll_ft_agreement = 1; + else mca_coll_ft_agreement = 0; /* NOFT: do not initialize ERA */ (void) mca_base_component_var_register(&mca_coll_ftagree_component.collm_version, "agreement", "Agreement algorithm 0: Allreduce (NOT FAULT TOLERANT); 1: Early Returning Consensus (era); 2: Early Terminating Consensus (eta)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, - &value); - switch(value) { + &mca_coll_ft_agreement); + switch(mca_coll_ft_agreement) { case 0: mca_coll_ftagree_algorithm = COLL_FTAGREE_NOFT; opal_output_verbose(6, ompi_ftmpi_output_handle, diff --git a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c index 9450c443349..f28c36a3d16 100644 --- a/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c +++ b/ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c @@ -2956,6 +2956,15 @@ int mca_coll_ftagree_era_finalize(void) "%s ftagree:agreement (ERA) GC: %lu passed agreements 
remain in the passed agreements hash table\n", OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), opal_hash_table_get_size(&era_passed_agreements))); + /* Some agreements can remain in the era_passed_agreements table until + * finalize; notably, the last agreement in a communicator that has been + * freed. + * + * The commit that added this comment also removed the (unused) function + * mca_coll_ftagree_era_free_comm that could enforce purging that table + * during comm_free, at the cost of making comm_free hard synchronizing; + * this was deemed too disruptive for the small memory usage gain. + */ for( rc = opal_hash_table_get_first_key_uint64(&era_passed_agreements, &key64, &value, &node); OPAL_SUCCESS == rc; rc = opal_hash_table_get_next_key_uint64(&era_passed_agreements, &key64, &value, node, &node) ) { @@ -3368,46 +3377,3 @@ int mca_coll_ftagree_iera_intra(void *contrib, return OMPI_SUCCESS; } -#if 0 -// Per @bosilca and @jsquyres discussion 29 Apr 2021: there is -// probably a memory leak in MPI_FINALIZE right now, because this -// function does not appear to be being called from anywhere. -// @bosilca's team is looking into it. 
-int mca_coll_ftagree_era_free_comm(ompi_communicator_t* comm, - mca_coll_base_module_t *module) -{ - ompi_group_t* acked; - era_identifier_t aid; - int rc; - - OPAL_OUTPUT_VERBOSE((4, ompi_ftmpi_output_handle, - "%s ftagree:agreement (ERA) Freeing Communicator (%d.%d).\n", - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), - comm->c_contextid, - comm->c_epoch)); - - opal_mutex_lock(&ompi_group_afp_mutex); - ompi_group_intersection(comm->c_remote_group, ompi_group_all_failed_procs, &acked); - opal_mutex_unlock(&ompi_group_afp_mutex); - do { - rc = mca_coll_ftagree_era_intra(NULL, - 0, - &ompi_mpi_int.dt, - &ompi_mpi_op_band.op, - &acked, true, - comm, - comm->c_coll->coll_agree_module); - } while(rc != MPI_SUCCESS); - OBJ_RELEASE(acked); - - aid.ERAID_FIELDS.contextid = comm->c_contextid.cid_sub.u64; - aid.ERAID_FIELDS.epoch = comm->c_epoch; - - opal_mutex_lock(&era_mutex); - /** We don't need to set aid.ERAID_FIELDS.agreementid to collect all of them */ - era_collect_passed_agreements(aid, 0, (uint16_t)-1); - opal_mutex_unlock(&era_mutex); - - return OMPI_SUCCESS; -} -#endif diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 1d78bf87158..7ae17b9e4f8 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -301,7 +301,7 @@ static int han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_output_verbose); - cs->han_bcast_segsize = 65536; + cs->han_bcast_segsize = 524288; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, @@ -321,7 +321,7 @@ static int han_register(void) &cs->han_bcast_low_module, &cs->han_op_module_name.bcast.han_op_low_module_name); - cs->han_reduce_segsize = 65536; + cs->han_reduce_segsize = 524288; (void) mca_base_component_var_register(c, "reduce_segsize", "segment size for reduce", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, @@ -340,7 +340,7 @@ 
static int han_register(void) OPAL_INFO_LVL_9, &cs->han_reduce_low_module, &cs->han_op_module_name.reduce.han_op_low_module_name); - cs->han_allreduce_segsize = 65536; + cs->han_allreduce_segsize = 524288; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index e3482116c84..9dca14bcc55 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -34,6 +34,8 @@ static int coll_tuned_alltoall_segment_size = 0; static int coll_tuned_alltoall_tree_fanout; static int coll_tuned_alltoall_chain_fanout; +static int deprecated_mca_params = -1; + /* valid values for coll_tuned_alltoall_forced_algorithm */ static const mca_base_var_enum_value_t alltoall_algorithms[] = { {0, "ignore"}, @@ -119,7 +121,6 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoall_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_large_msg", "use pairwise exchange algorithm for messages larger than this value", diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index d8dbb7959e4..6f5a8c57987 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -71,6 +71,8 @@ int ompi_coll_tuned_scatter_large_msg = 0; int ompi_coll_tuned_scatter_min_procs = 0; int ompi_coll_tuned_scatter_blocking_send_ratio = 0; +static int deprecated_mca_params = -1; + /* forced algorithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; @@ -161,7 +163,6 @@ static int tuned_register(void) 
MCA_BASE_VAR_SCOPE_ALL, &ompi_coll_tuned_init_chain_fanout); - int deprecated_mca_params = -1; (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_small_msg", "threshold (if supported) to decide if small MSGs alltoall algorithm will be used", diff --git a/ompi/mca/coll/xhc/README.md b/ompi/mca/coll/xhc/README.md index 213062a5edc..438fd712507 100644 --- a/ompi/mca/coll/xhc/README.md +++ b/ompi/mca/coll/xhc/README.md @@ -1,371 +1,8 @@ -# XHC: XPMEM-based Hierarchical Collectives +# XPMEM Hierarchical Collectives (XHC) XHC implements hierarchical & topology-aware intra-node MPI collectives, -utilizing XPMEM for efficient shared address space memory access between -processes. +(mainly) utilizing XPMEM for efficient shared address space data transfers +between MPI ranks. -## Main features - -* XHC constructs an **n-level hierarchy** (i.e. no algorithmic limitation on -level count), based on intra-node topological features. Rank/process locality -information is known thanks to Hwloc, and is obtained from Open MPI's -integrated book-keeping. - - Topological features that can currently be defined: - - - NUMA node - - CPU Socket - - L1/L2/L3 cache - - Hwthread/core - - Node (all ranks *are* in same node --> flat, no hierarchy at all) - - Example of a 3-level XHC hierarchy (numa+socket+node configuration): - - ![Example of 3-level XHC hierarchy](resources/xhc-hierarchy.svg) - - Furthermore, support for custom virtual user-defined hierarchies is - available, to allow fine-grained control over the communication pattern. - -* **Single-copy** transportation - - - Supported through integration with Open MPI's `opal/smsc` - (shared-memory-single-copy) framework. Selecting `smsc/xpmem` is highly - recommended. 
- - - Bcast support: XPMEM, CMA, KNEM - - Allreduce/Reduce support: XPMEM - - Barrier support: *(irrelevant)* - - - Application buffers are attached on the fly the first time they appear, - saved on and recovered from the registration cache in subsequent - appearances. (assuming smsc/xpmem) - -* **Copy-in-copy-out (CICO)** transportation - - - Through shared memory buffers that remain active throughout the - component's lifetime. - - - Switchover with single-copy at configurable message size. - - - Supported in all ops, regardless of smsc support or XPMEM presence (up to - maximum allowed message size). - -* **Inline** transportation - - - For especially small messages, payload data is inlined in the same cache - line as the control data. - - - Supported in all ops, regardless of smsc support or XPMEM presence (up to - maximum allowed message size). - -* Data-wise **pipelining** across all levels of the hierarchy. Allows for -lowering hierarchy-induced start-up overheads, and interleaving of operations -in applicable operations (e.g. reduce+bcast in allreduce). - -* **Lock-free** single-writer synchronization, with appropriate cache-line -separation where necessary. Consistency ensured via lightweight *read* or -*write* memory barriers. - -## Configuration options -- MCA params - -XHC can be customized via a number of standard Open MPI MCA parameters, though -defaults that should satisfy a wide number of systems are in place. - -The available parameters (also found in `coll_xhc_component.c`): - -#### *(prepend with "coll_xhc_")* - -* **priority** (default `0`): The priority of the coll/xhc component, used -during the component selection process. - -* **print_info** (default `false`): Print information about XHC's generated -hierarchy and its configuration. - -* **shmem_backing** (default `/dev/shm`): Backing directory for shmem files -used for XHC's synchronization fields and CICO buffers. 
- -* **dynamic_leader** (default `false`): Enables the feature that dynamically -elects an XHC-communicator leader at each collective (currently only applicable -for bcast). - -* **dynamic_reduce** (default `1`=`non-float`): Enables support for -out-of-order reduction. Ranks fetch data to reduce from multiple peers; -out-of-order reduction allows them to temporarily skip a peer when the expected -data is not yet prepared, instead of stalling. The default value auto-enables -it when the data is of non-float type; setting to `2`=`enabled for all types`, -might/will harm reproducibility of reductions with float types. - -* **reduce_load_balance** (default `0`=`non-leader`): Controls the -leader-to-member load balancing mode in reductions. Under `non-leader`, the -members, and not the leaders, perform reductions. With `top-level`, all members -as well as the leader of the top-most level perform reductions. With -`first-chunk`, leaders perform a single reduction on each level for a single -chunk at the beginning of the operation. `top+first` combines `top-level` and -`first-chunk`. Finally, with `all`, all ranks perform reductions equally. - -* **hierarchy** (default `"numa,socket"`): A comma separated list of -topological feature to which XHC's hierarchy-building algorithm should be -sensitive. `ompi_info` reports the possible values for the parameter. - - - In some ways, this is "just" a suggestion. The resulting hierarchy may - not exactly match the requested one. Reasons that this will occur: - - - A requested topological feature does not effectively segment the set - of ranks. (eg. `numa` was specified, but all ranks reside in the same - NUMA node) - - - No feature that all ranks have in common was provided. This a more - intrinsic detail, that you probably don't need to be aware of, but you - might come across if eg. you investigate the output of `print_info`. An - additional level will automatically be added in this case, no need to - worry about it. 
- - For all intents and purposes, a hierarchy of `numa,socket` is - interpreted as "segment the ranks according to NUMA node locality, - and then further segment them according to CPU socket locality". - - - The provided features will automatically be re-ordered when their - order does not match their order in the physical system. (unless a - virtual hierarchy is present in the list) - - - *Virtual Hierarchies*: The string may alternatively also contain "rank - lists" which specify exactly which ranks to group together, as well as some - other special modifiers. See - `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further - explanation as well as syntax information. - -* **chunk_size** (default `16K`): The chunk size for the pipelining process. -Data is processed (eg broadcast, reduced) in this-much sized pieces at once. - - - It's possible to have a different chunk size for each level of the - hierarchy, achieved via providing a comma-separated list of sizes (eg. - `"16K,16K,128K"`) instead of single one. The sizes in this list's *DO NOT* - correspond to the items on hierarchy list; the hierarchy keys might be - re-ordered or reduced to match the system, but the chunk sizes will be - consumed in the order they are given, left-to-right -> bottom-to-top. - -* **uniform_chunks** (default `true`): Automatically optimize the chunk size -in reduction collectives, according to the message size, so that all members -will perform equal work. - -* **uniform_chunks_min** (default `1K`): The lowest allowed value for the chunk -size when uniform chunks are enabled. - -* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will -be used for messages of *cico_max* or less bytes. - -*(Removed Parameters)* - -* **rcache_max**, **rcache_max_global** *(REMOVED with shift to opal/smsc)*: -Limit to number of attachments that the registration cache should hold. - - - A case can be made about their usefulness. 
If desired, shall be - re-implemented at smsc-level. - -## Limitations - -- *Intra-node support only* - - Define XHC as `coll/HAN`'s intra-node component to reap its benefits in - multi-node runs. - -- **Heterogeneity**: XHC does not support nodes with non-uniform (rank-wise) -datatype representations. (determined according to Open MPI's `proc_arch`) - -- **Non-commutative** operators are not supported by XHC's reduction -collectives. In past versions, they were, but only with a flat hierarchy; this -could make a return at some point. - -- **Derived Datatypes** are currently not supported. - -- XHC's Reduce currently only supports rank 0 as the root, and will -automatically fall back to another component for other cases. - -## Building - -This section describes how to compile the XHC component. - -XPMEM support in Open MPI is required to reap the full benefits of XHC. - -- The XHC component will build and work without XPMEM support, but for large -messages (i.e. ones above the CICO threshold) Allreduce/Reduce will be -disabled, and Broadcast will fall-back to less efficient mechanisms. - -- XPMEM can be obtained from , and then -compiled like a common kernel module. You might need to manually point Open -MPI's configure script to XPMEM's installation location, via the -`--with-xpmem=` parameter. - -- At run-time, you will need to insert the kernel module and obtain proper -access rights to `/dev/xpmem`. - -Apart from instructing Open MPI to include XPMEM support, the rest of the build -process is standard. General information on building Open MPI can be found in -its documentation. - - - - - -## Running - -General information on running Open MPI jobs can be found here: - - - -`mpirun`'s man page will also be useful: - - -In order for the XHC component to be chosen, its priority must be manually set -higher than other collectives components that implement the same primitives, -via the `coll_xhc_priority` MCA param. 
- - - Example: `--mca coll_xhc_priority 100` - -* Most likely, you will also want the `--bind-to core` param. Otherwise, the -reported process localities might be too general, preventing XHC from correctly -segmenting the system. (MCA `coll_xhc_print_info` will report the generated -hierarchy if you wish to experiment) - -### Tuning - -* Optional: You might wish to manually specify the topological features that -XHC's hierarchy should conform to. The default is `numa,socket`, which will -group the processes according to NUMA locality and then further group them -according to socket locality. See the `coll_xhc_hierarchy` param. - - - Example: `--mca coll_xhc_hierarchy numa,socket` - - Example: `--mca coll_xhc_hierarchy numa` - - Example: `--mca coll_xhc_hierarchy flat` - - In some systems, small-message Broadcast or the Barrier operation might - perform better with a flat tree instead of a hierarchical one. Currently, - manual benchmarking is required to accurately determine this. - -* Optional: You might wish to tune XHC's chunk size (default `16K`). Use the -`coll_xhc_chunk_size` param, and try values close to the default and see if -improvements are observed. - - - Example: `--mca coll_xhc_chunk_size 16K` - -* Optional: If you wish to focus on latencies of small/medium size messages, -you can try altering the cico-to-zcopy switchover point (MCA -`coll_xhc_cico_max`, default `1K`). - - - Example: `--mca coll_xhc_cico_max 1K` - -* Optional: If your application is heavy in Broadcast calls and you suspect -that specific ranks might be joining the collective with delay and causing -others to stall waiting for them, try enabling dynamic leadership (MCA -`coll_xhc_dynamic_leader`), and seeing if it makes an improvement. Please let -us know if it does :-). 
- - - Example: `--mca coll_xhc_dynamic_leader 1` - -### Example command lines - -*Assuming `PATH` and `LD_LIBRARY_PATH` have been set appropriately.* - -Default XHC configuration: -`$ mpirun --mca coll_xhc_priority 100 --bind-to core ` - -XHC w/ numa-sensitive hierarchy, chunk size @ 16K: -`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core ` - -XHC with flat hierarchy (ie. none at all): -`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] ` - -## Benchmarking - -This section outlines some tips for benchmarking XHC and intra-node MPI -collectives in general. - -### Micro-Benchmarks - -For our micro-benchmarking purposes, we have been using [OSU's microbenchmark -suite](https://mvapich.cse.ohio-state.edu/benchmarks/). However, when -micro-benchmarking intra-node collectives, there are some important details -that one needs to look out for. - -**CPU Cache** An issue with the OSU micro-benchmarks is that they use the same -buffer for each iteration without altering it. Since modern processors -implicitly cache data, this can lead to false/unrealistic/unrepresentative -results, given that actual real-world applications do not (usually/optimally!) -perform duplicate operations. - -Availability of collective operation source data on a processor's local cache -hierarchy will cause certain phenomenons (e.g. slow path memory transactions) -and their effects to remain hidden and undetected in the micro-benchmarking -process, even though they *will* negatively impact performance in actual -applications, - -We have created "data-varying" (`_dv` suffix) benchmarks to counter this -problem, which will alter the data before each iteration. - -**Microbenchmark's pre-op Barrier** One also needs to be aware how the barrier -that appears before each iteration in the OSU micro-benchmarks affects the -result, especially so when latencies of small messages are concerned. 
The -underlying implementation of this barrier and the speed/efficiency of its -"release stage" will affect how fast and how synchronized ranks will exit the -barrier, and therefore how fast/synchronized they will enter the benchmarked -collective operation. - -For as accurate/clean performance reporting as possible, use a barrier -implementation that has as low a latency as possible. Furthermore, ideally, -all ranks should exit the barrier at the exact same time -- this is more -complex to measure, but can make a difference. In order to have a common -baseline when benchmarking and comparing multiple collectives implementation, -use this same barrier implementation for all benchmark scenarios. - -In the environments we tested, XHC's barrier was the best performing one. To -make using this barrier easier, we have put together a small new collective -component, `XB` (= xhc barrier). - -XB creates a new nested (duplicate) communicator with a hint to prioritize XHC, -and delegates barrier operations to it. A slightly inconvenient side-effect is -that XHC needs to be on the coll list (MCA `--mca coll`); it doesn't need to -have a high priority, though it can't be less than 0. - -* To benchmark Open MPI's `coll/tuned` with XB: `--mca coll basic,libnbc,tuned,xb,xhc --mca coll_xhc_priority 0 --mca coll_xb_priority 95 --mca coll_tuned_priority 90` - -* Or XHC itself, with XB: `--mca coll basic,libnbc,xb,xhc --mca coll_xhc_priority 90 --mca coll_xb_priority 95` - -It is also possible to specify the hierarchy to be used for XB's barrier (the -request will be passed in string form to XHC, only for the nested communicator) -via the `coll_xb_hierarchy` MCA parameter. - -In our fork of the OSU micro-benchmarks, you will also find -"integrity-checking" variants (`_integrity` suffix). These can help verify that -collective operations complete successfully without data corruption. 
- -Our OSU micro-benchmarks fork: - - -The XB component: - - -### Applications - -We expect to see any meaningful performance improvement with XHC in actual -applications, only if they spend a non-insignificant percentage of their -runtime in the collective operations that XHC implements: Broadcast, Barrier, -Allreduce, Reduce. - -One known such application is [miniAMR](https://github.com/Mantevo/miniAMR). -The application parameters (e.g. the refine count and frequency) will affect -the amount of time spent in the Allreduce primitive. - -Another one is Microsoft's [CNTK](https://github.com/microsoft/CNTK), also -heavy in Allreduce, though it actually makes use of the non-blocking -`Iallreduce` variant. However, it can easily be converted to use the blocking -variant instead (contact for patch). Comparing the performance of the -unmodified CNTK with OpenMPI's `coll/libnbc`, versus that of the patched CNTK -with XHC reveals that this modification is sensible and beneficial. - -Finally, while we have not yet rigorously evaluated it, -[PiSvM](http://pisvm.sourceforge.net/) is another candidate, with intense use -of MPI Broadcast. 
- ---- - -Contact: George Katevenis (gkatev@ics.forth.gr), Manolis Ploumidis (ploumid@ics.forth.gr) -Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth +For additional info and resources about XHC, check the Open MPI docs: +https://docs.open-mpi.org/ diff --git a/ompi/mca/osc/osc.h b/ompi/mca/osc/osc.h index c8f77404c1c..b43f34ac3c5 100644 --- a/ompi/mca/osc/osc.h +++ b/ompi/mca/osc/osc.h @@ -216,6 +216,15 @@ typedef int (*ompi_osc_base_module_put_fn_t)(const void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_put_notify_fn_t)(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr, size_t origin_count, @@ -226,6 +235,23 @@ typedef int (*ompi_osc_base_module_get_fn_t)(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_get_notify_fn_t)(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_win_get_notify_value_fn_t)(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); + +typedef int (*ompi_osc_base_module_win_reset_notify_value_fn_t)(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); typedef int (*ompi_osc_base_module_accumulate_fn_t)(const void *origin_addr, size_t origin_count, @@ -276,6 +302,17 @@ typedef int (*ompi_osc_base_module_rput_fn_t)(const void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +typedef int (*ompi_osc_base_module_rput_notify_fn_t)(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + 
int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); + typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -286,6 +323,16 @@ typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +typedef int (*ompi_osc_base_module_rget_notify_fn_t)(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); typedef int (*ompi_osc_base_module_raccumulate_fn_t)(const void *origin_addr, size_t origin_count, @@ -371,7 +418,6 @@ typedef int (*ompi_osc_base_module_flush_local_all_fn_t)(struct ompi_win_t *win) * module structure. */ - // TODO: extend the struct and add pointers to put/get_with_notify functions struct ompi_osc_base_module_4_0_0_t { ompi_osc_base_module_win_shared_query_fn_t osc_win_shared_query; @@ -409,6 +455,12 @@ struct ompi_osc_base_module_4_0_0_t { ompi_osc_base_module_flush_all_fn_t osc_flush_all; ompi_osc_base_module_flush_local_fn_t osc_flush_local; ompi_osc_base_module_flush_local_all_fn_t osc_flush_local_all; + ompi_osc_base_module_put_notify_fn_t osc_put_notify; + ompi_osc_base_module_get_notify_fn_t osc_get_notify; + ompi_osc_base_module_win_get_notify_value_fn_t osc_win_get_notify_value; + ompi_osc_base_module_win_reset_notify_value_fn_t osc_win_reset_notify_value; + ompi_osc_base_module_rput_notify_fn_t osc_rput_notify; + ompi_osc_base_module_rget_notify_fn_t osc_rget_notify; }; typedef struct ompi_osc_base_module_4_0_0_t ompi_osc_base_module_4_0_0_t; typedef ompi_osc_base_module_4_0_0_t ompi_osc_base_module_t; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 
cc34c109683..14eeb928e40 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -1649,37 +1649,39 @@ int ompi_osc_rdma_shared_query( ptrdiff_t *disp_unit, void *baseptr) { int rc = OMPI_ERR_NOT_SUPPORTED; - ompi_osc_rdma_peer_t *peer; - int actual_rank = rank; + ompi_osc_rdma_peer_t *peer = NULL; ompi_osc_rdma_module_t *module = GET_MODULE(win); - peer = ompi_osc_module_get_peer (module, actual_rank); - if (NULL == peer) { - return OMPI_ERR_NOT_SUPPORTED; - } - /* currently only supported for allocated windows */ if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) { return OMPI_ERR_NOT_SUPPORTED; } - if (!ompi_osc_rdma_peer_local_base(peer)) { - return OMPI_ERR_NOT_SUPPORTED; - } - if (MPI_PROC_NULL == rank) { /* iterate until we find a rank that has a non-zero size */ for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { peer = ompi_osc_module_get_peer (module, i); - ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; - if (!ompi_osc_rdma_peer_local_base(peer)) { + if (NULL == peer) { + /* peer object not cached yet (typically non-local here since local peers are added eagerly) */ continue; - } else if (module->same_size && ex_peer->super.base) { - break; - } else if (ex_peer->size > 0) { - break; } + ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; + if (ompi_osc_rdma_peer_local_base(peer)) { + if (module->same_size && ex_peer->super.base) { + break; + } else if (ex_peer->size > 0) { + break; + } + } + // reset so we don't mistakenly use a peer without memory + peer = NULL; } + } else { + peer = ompi_osc_module_get_peer (module, rank); + } + + if (NULL == peer || !ompi_osc_rdma_peer_local_base(peer)) { + return OMPI_ERR_NOT_SUPPORTED; } if (module->same_size && module->same_disp_unit) { diff --git a/ompi/mca/osc/sm/osc_sm.h b/ompi/mca/osc/sm/osc_sm.h index 23afacd7d49..85d250bfa18 100644 --- a/ompi/mca/osc/sm/osc_sm.h +++ b/ompi/mca/osc/sm/osc_sm.h @@ -22,6 
+22,7 @@ typedef uint64_t osc_sm_post_type_t; typedef opal_atomic_uint64_t osc_sm_post_atomic_type_t; #define OSC_SM_POST_BITS 6 #define OSC_SM_POST_MASK 0x3f +#define OSC_SM_MAX_NOTIFY_COUNTERS 16 /* data shared across all peers */ struct ompi_osc_sm_global_state_t { @@ -47,6 +48,9 @@ struct ompi_osc_sm_node_state_t { opal_atomic_int32_t complete_count; ompi_osc_sm_lock_t lock; opal_atomic_lock_t accumulate_lock; + uint32_t notify_counter_count; + uint64_t notify_counter_offset; /* offset from segment_base, not raw pointer */ + }; typedef struct ompi_osc_sm_node_state_t ompi_osc_sm_node_state_t; @@ -79,7 +83,7 @@ struct ompi_osc_sm_module_t { size_t *sizes; void **bases; ptrdiff_t *disp_units; - uint64_t **notify_counters; + uint64_t *notify_counters; ompi_group_t *start_group; @@ -107,7 +111,6 @@ int ompi_osc_sm_detach(struct ompi_win_t *win, const void *base); int ompi_osc_sm_free(struct ompi_win_t *win); -// TODO: add put/get_with_notify prototypes int ompi_osc_sm_put(const void *origin_addr, size_t origin_count, @@ -118,6 +121,16 @@ int ompi_osc_sm_put(const void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + int ompi_osc_sm_put_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); + int ompi_osc_sm_get(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -127,6 +140,24 @@ int ompi_osc_sm_get(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); +int ompi_osc_sm_get_notify(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win); + +int ompi_osc_sm_win_get_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); + +int 
ompi_osc_sm_win_reset_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value); + int ompi_osc_sm_accumulate(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -176,6 +207,17 @@ int ompi_osc_sm_rput(const void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +int ompi_osc_sm_rput_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); + int ompi_osc_sm_rget(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, @@ -186,6 +228,17 @@ int ompi_osc_sm_rget(void *origin_addr, struct ompi_win_t *win, struct ompi_request_t **request); +int ompi_osc_sm_rget_notify(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **request); + int ompi_osc_sm_raccumulate(const void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, diff --git a/ompi/mca/osc/sm/osc_sm_comm.c b/ompi/mca/osc/sm/osc_sm_comm.c index f9bae370870..fbd4f17856c 100644 --- a/ompi/mca/osc/sm/osc_sm_comm.c +++ b/ompi/mca/osc/sm/osc_sm_comm.c @@ -17,9 +17,58 @@ #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "ompi/communicator/communicator.h" #include "osc_sm.h" +static inline uint64_t *osc_sm_target_notify_base(ompi_osc_sm_module_t *module, int target) +{ + if (NULL == module->segment_base) { + /* single-rank path: notify_counters is a regular local allocation */ + return module->notify_counters; + } + + return (uint64_t *) ((char *) module->segment_base + + module->node_states[target].notify_counter_offset); +} + +int 
+ompi_osc_sm_win_get_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value) +{ + ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t *) win->w_osc_module; + int rank = ompi_comm_rank(module->comm); + + if (notify < 0 || (uint32_t) notify >= module->node_states[rank].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + *value = (OMPI_MPI_COUNT_TYPE) osc_sm_target_notify_base(module, rank)[notify]; + opal_atomic_rmb(); + + return OMPI_SUCCESS; +} + +int +ompi_osc_sm_win_reset_notify_value(struct ompi_win_t *win, + int notify, + OMPI_MPI_COUNT_TYPE *value) +{ + ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t *) win->w_osc_module; + int rank = ompi_comm_rank(module->comm); + + if (notify < 0 || (uint32_t) notify >= module->node_states[rank].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + /* Atomically swap the counter to 0 and return the previous value */ + *value = (OMPI_MPI_COUNT_TYPE) opal_atomic_swap_64( + &osc_sm_target_notify_base(module, rank)[notify], 0); + + return OMPI_SUCCESS; +} + int ompi_osc_sm_rput(const void *origin_addr, size_t origin_count, @@ -59,6 +108,53 @@ ompi_osc_sm_rput(const void *origin_addr, return OMPI_SUCCESS; } +int +ompi_osc_sm_rput_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rput_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + notify, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * 
target_disp; + + ret = ompi_datatype_sndrcv((void *)origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + /* the only valid field of RMA request status is the MPI_ERROR field. + * ompi_request_empty has status MPI_SUCCESS and indicates the request is + * complete. */ + *ompi_req = &ompi_request_empty; + + if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + opal_atomic_wmb(); + opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1); + + return OMPI_SUCCESS; +} int ompi_osc_sm_rget(void *origin_addr, @@ -99,6 +195,53 @@ ompi_osc_sm_rget(void *origin_addr, return OMPI_SUCCESS; } +int +ompi_osc_sm_rget_notify(void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + notify, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, + origin_addr, origin_count, origin_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + /* the only valid field of RMA request status is the MPI_ERROR field. + * ompi_request_empty has status MPI_SUCCESS and indicates the request is + * complete. 
*/ + *ompi_req = &ompi_request_empty; + + if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + opal_atomic_rmb(); + opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1); + + return OMPI_SUCCESS; +} int ompi_osc_sm_raccumulate(const void *origin_addr, @@ -236,6 +379,48 @@ ompi_osc_sm_put(const void *origin_addr, } +int +ompi_osc_sm_put_notify(const void *origin_addr, + size_t origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + size_t target_count, + struct ompi_datatype_t *target_dt, + int notify, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "put_notify: 0x%lx, %zu, %s, %d, %d, %zu, %s, %d, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + notify, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv((void *)origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) { + return MPI_ERR_NOTIFY_IDX; + } + + opal_atomic_wmb(); + opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1); + + return ret; +} + int ompi_osc_sm_get(void *origin_addr, size_t origin_count, @@ -268,7 +453,7 @@ ompi_osc_sm_get(void *origin_addr, int -ompi_osc_sm_get_with_notify(void *origin_addr, +ompi_osc_sm_get_notify(void *origin_addr, size_t origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -294,9 +479,15 @@ ompi_osc_sm_get_with_notify(void *origin_addr, ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, 
origin_addr, origin_count, origin_dt);
-    // TODO: do the same for put_with_notify
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+    if (notify < 0 || (uint32_t) notify >= module->node_states[target].notify_counter_count) {
+        return MPI_ERR_NOTIFY_IDX;
+    }
+
     opal_atomic_rmb();
-    opal_atomic_add(&module->notify_counters[target][notify], 1);
+    opal_atomic_add(&osc_sm_target_notify_base(module, target)[notify], 1);
 
     return ret;
 }
@@ -472,5 +663,5 @@ ompi_osc_sm_fetch_and_op(const void *origin_addr,
 done:
     opal_atomic_unlock(&module->node_states[target].accumulate_lock);
 
-    return OMPI_SUCCESS;;
+    return OMPI_SUCCESS;
 }
diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c
index 1ad9a48cfd2..259c0826017 100644
--- a/ompi/mca/osc/sm/osc_sm_component.c
+++ b/ompi/mca/osc/sm/osc_sm_component.c
@@ -70,8 +70,6 @@ ompi_osc_sm_component_t mca_osc_sm_component = {
 
 MCA_BASE_COMPONENT_INIT(ompi, osc, sm)
 
-// TODO: extend the struct and add pointers to put/get_with_notify functions
-// TODO: extend it to rput/rget_with_notify as well
 ompi_osc_sm_module_t ompi_osc_sm_module_template = {
     {
         .osc_win_shared_query = ompi_osc_sm_shared_query,
@@ -81,14 +79,20 @@ ompi_osc_sm_module_t ompi_osc_sm_module_template = {
         .osc_free = ompi_osc_sm_free,
 
         .osc_put = ompi_osc_sm_put,
+        .osc_put_notify = ompi_osc_sm_put_notify,
         .osc_get = ompi_osc_sm_get,
+        .osc_get_notify = ompi_osc_sm_get_notify,
+        .osc_win_get_notify_value = ompi_osc_sm_win_get_notify_value,
+        .osc_win_reset_notify_value = ompi_osc_sm_win_reset_notify_value,
         .osc_accumulate = ompi_osc_sm_accumulate,
         .osc_compare_and_swap = ompi_osc_sm_compare_and_swap,
         .osc_fetch_and_op = ompi_osc_sm_fetch_and_op,
         .osc_get_accumulate = ompi_osc_sm_get_accumulate,
 
         .osc_rput = ompi_osc_sm_rput,
+        .osc_rput_notify = ompi_osc_sm_rput_notify,
         .osc_rget = ompi_osc_sm_rget,
+        .osc_rget_notify = ompi_osc_sm_rget_notify,
         .osc_raccumulate = ompi_osc_sm_raccumulate,
         .osc_rget_accumulate = ompi_osc_sm_rget_accumulate,
 
@@ -253,12
+257,19 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis module->posts = calloc (1, sizeof(module->posts[0]) + sizeof (module->posts[0][0])); if (NULL == module->posts) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; module->posts[0] = (osc_sm_post_atomic_type_t *) (module->posts + 1); + + /* allocate notify counters for single process case */ + module->notify_counters = calloc(OSC_SM_MAX_NOTIFY_COUNTERS, sizeof(uint64_t)); + if (NULL == module->notify_counters) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + module->node_states[0].notify_counter_count = OSC_SM_MAX_NOTIFY_COUNTERS; + module->node_states[0].notify_counter_offset = 0; } else { - unsigned long total, *rbuf; + unsigned long total, total_counters, gather_values[2], *rbuf; int i, flag; size_t pagesize; size_t state_size; size_t posts_size, post_size = (comm_size + OSC_SM_POST_MASK) / (OSC_SM_POST_MASK + 1); + size_t notify_counters_size; size_t data_base_size; opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, @@ -267,7 +278,7 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis /* get the pagesize */ pagesize = opal_getpagesize(); - rbuf = malloc(sizeof(unsigned long) * comm_size); + rbuf = malloc(sizeof(unsigned long) * comm_size * 2 ); if (NULL == rbuf) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; /* Note that the alloc_shared_noncontig info key only has @@ -291,9 +302,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis "allocating window using contiguous strategy"); } - total = size; - ret = module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG, - rbuf, 1, MPI_UNSIGNED_LONG, + gather_values[0] = size; + gather_values[1] = OSC_SM_MAX_NOTIFY_COUNTERS; + ret = module->comm->c_coll->coll_allgather(gather_values, 2, MPI_UNSIGNED_LONG, + rbuf, 2, MPI_UNSIGNED_LONG, module->comm, module->comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { @@ -302,8 +314,10 @@ 
component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis } total = 0; + total_counters = 0; for (i = 0 ; i < comm_size ; ++i) { - total += rbuf[i]; + total += rbuf[2 * i]; + total_counters += rbuf[2 * i + 1]; if (module->noncontig) { total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize); } @@ -314,7 +328,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis state_size += OPAL_ALIGN_PAD_AMOUNT(state_size, 64); posts_size = comm_size * post_size * sizeof (module->posts[0][0]); posts_size += OPAL_ALIGN_PAD_AMOUNT(posts_size, 64); - data_base_size = state_size + posts_size; + notify_counters_size = total_counters * sizeof(uint64_t); + notify_counters_size += OPAL_ALIGN_PAD_AMOUNT(notify_counters_size, 64); + data_base_size = state_size + posts_size + notify_counters_size; data_base_size += OPAL_ALIGN_PAD_AMOUNT(data_base_size, pagesize); if (0 == ompi_comm_rank (module->comm)) { char *data_file; @@ -375,15 +391,27 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis module->global_state = (ompi_osc_sm_global_state_t *) (module->posts[0] + comm_size * post_size); module->node_states = (ompi_osc_sm_node_state_t *) (module->global_state + 1); - for (i = 0, total = data_base_size ; i < comm_size ; ++i) { + /* set up notify counters in shared memory after node_states */ + module->notify_counters = (uint64_t *) ((char *)(module->node_states + comm_size) + + OPAL_ALIGN_PAD_AMOUNT((uintptr_t)(module->node_states + comm_size), 64)); + /* zero out notify counters */ + memset(module->notify_counters, 0, total_counters * sizeof(uint64_t)); + + for (i = 0, total = data_base_size, total_counters = 0 ; i < comm_size ; ++i) { if (i > 0) { module->posts[i] = module->posts[i - 1] + post_size; } - module->sizes[i] = rbuf[i]; + module->node_states[i].notify_counter_count = (uint32_t) rbuf[2 * i + 1]; + module->node_states[i].notify_counter_offset = + (uint64_t) ((char *) (module->notify_counters + 
total_counters) - + (char *) module->segment_base); + total_counters += rbuf[2 * i + 1]; + + module->sizes[i] = rbuf[2 * i]; if (module->sizes[i] || !module->noncontig) { module->bases[i] = ((char *) module->segment_base) + total; - total += rbuf[i]; + total += rbuf[2 * i]; if (module->noncontig) { total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize); } @@ -397,7 +425,8 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis /* initialize my state shared */ module->my_node_state = &module->node_states[ompi_comm_rank(module->comm)]; - memset (module->my_node_state, 0, sizeof(*module->my_node_state)); + module->my_node_state->complete_count = 0; + memset (&module->my_node_state->lock, 0, sizeof(module->my_node_state->lock)); *base = module->bases[ompi_comm_rank(module->comm)]; @@ -553,6 +582,7 @@ ompi_osc_sm_free(struct ompi_win_t *win) module->comm->c_coll->coll_barrier_module); opal_shmem_segment_detach (&module->seg_ds); + /* notify_counters points into shared memory segment, no separate free needed */ } else { free(module->node_states); free(module->global_state); @@ -560,6 +590,8 @@ ompi_osc_sm_free(struct ompi_win_t *win) mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->bases[0]); } + /* free notify_counters for single process case */ + free(module->notify_counters); } free(module->disp_units); free(module->outstanding_locks); diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c index ab122e67263..0354edb71c0 100644 --- a/ompi/mca/osc/ucx/osc_ucx_comm.c +++ b/ompi/mca/osc/ucx/osc_ucx_comm.c @@ -944,7 +944,7 @@ static inline int ompi_osc_ucx_check_ops_and_flush (ompi_osc_ucx_module_t *modul uint64_t base_tmp, tail_tmp; int ret = OMPI_SUCCESS; - if (module->ctx->num_incomplete_req_ops > ompi_osc_ucx_outstanding_ops_flush_threshold) { + if ((size_t)module->ctx->num_incomplete_req_ops > ompi_osc_ucx_outstanding_ops_flush_threshold) { ret = opal_common_ucx_ctx_flush(module->ctx, 
OPAL_COMMON_UCX_SCOPE_WORKER, 0); if (ret != OPAL_SUCCESS) { ret = OMPI_ERROR; diff --git a/ompi/mca/part/persist/part_persist.h b/ompi/mca/part/persist/part_persist.h index ccc8f8f1971..86fb9bac42d 100644 --- a/ompi/mca/part/persist/part_persist.h +++ b/ompi/mca/part/persist/part_persist.h @@ -490,7 +490,7 @@ mca_part_persist_psend_init(const void* buf, return err; } -__opal_attribute_always_inline__ static inline int +static inline int mca_part_persist_start(size_t count, ompi_request_t** requests) { int err = OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1_iprobe.c b/ompi/mca/pml/ob1/pml_ob1_iprobe.c index 4d6a0eb8dfd..97744cce5dc 100644 --- a/ompi/mca/pml/ob1/pml_ob1_iprobe.c +++ b/ompi/mca/pml/ob1/pml_ob1_iprobe.c @@ -47,6 +47,11 @@ int mca_pml_ob1_iprobe(int src, *matched = 1; } else { *matched = 0; +#if OPAL_ENABLE_FT_MPI + if( ompi_request_is_failed((ompi_request_t*)&recvreq) ) { + rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR; + } +#endif opal_progress(); } MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); @@ -119,6 +124,11 @@ mca_pml_ob1_improbe(int src, (*message)->count = recvreq->req_recv.req_base.req_ompi.req_status._ucount; } else { *matched = 0; +#if OPAL_ENABLE_FT_MPI + if( ompi_request_is_failed((ompi_request_t*)recvreq) ) { + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + } +#endif /* we only free if we didn't match, because we're going to translate the request into a receive request later on if it diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 57aba677a8a..a6a2866f2a2 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -108,16 +108,19 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, } if( !request->req_match_received ) { /* the match has not been already done */ assert( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ); /* not matched isn't it */ + 
if(OPAL_LIKELY(request->req_recv.req_base.req_type != MCA_PML_REQUEST_IPROBE &&
+                   request->req_recv.req_base.req_type != MCA_PML_REQUEST_IMPROBE)) {
 #if MCA_PML_OB1_CUSTOM_MATCH
-        custom_match_prq_cancel(ob1_comm->prq, request);
+            custom_match_prq_cancel(ob1_comm->prq, request);
 #else
-        if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
-            opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request );
-        } else {
-            mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer);
-            opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
-        }
+            if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
+                opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request );
+            } else {
+                mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer);
+                opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
+            }
 #endif
+        }
         PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
                                  &(request->req_recv.req_base), PERUSE_RECV );
         OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock);
diff --git a/ompi/mpi/bindings/ompi_bindings/consts.py b/ompi/mpi/bindings/ompi_bindings/consts.py
index 43bca486b57..759b342f64a 100644
--- a/ompi/mpi/bindings/ompi_bindings/consts.py
+++ b/ompi/mpi/bindings/ompi_bindings/consts.py
@@ -23,6 +23,7 @@
     'MPI_SUCCESS',
     'MPI_ERR_BUFFER',
     'MPI_ERR_COUNT',
+    'MPI_ERR_NOTIFY_IDX',
     'MPI_ERR_TYPE',
     'MPI_ERR_TAG',
     'MPI_ERR_COMM',
diff --git a/ompi/mpi/c/Makefile.am b/ompi/mpi/c/Makefile.am
index 25b871fa7d4..49619694d0b 100644
--- a/ompi/mpi/c/Makefile.am
+++ b/ompi/mpi/c/Makefile.am
@@ -223,6 +223,7 @@ prototype_sources = \
         get_accumulate.c.in \
         get_address.c.in \
         get.c.in \
+        get_notify.c.in \
         get_count.c.in \
         get_elements.c.in \
         get_elements_x.c.in \
@@ -341,6 +342,7 @@ prototype_sources = \
         psend_init.c.in \
         publish_name.c.in \
         put.c.in \
+        put_notify.c.in \
         query_thread.c.in \
         raccumulate.c.in \
         recv.c.in 
\ @@ -484,6 +486,8 @@ prototype_sources = \ win_get_group.c.in \ win_get_info.c.in \ win_get_name.c.in \ + win_get_notify_value.c.in \ + win_reset_notify_value.c.in \ win_lock_all.c.in \ win_lock.c.in \ win_post.c.in \ @@ -954,6 +958,8 @@ interface_profile_sources = \ win_get_group_generated.c \ win_get_info_generated.c \ win_get_name_generated.c \ + win_get_notify_value_generated.c \ + win_reset_notify_value_generated.c \ win_lock_all_generated.c \ win_lock_generated.c \ win_post_generated.c \ diff --git a/ompi/mpi/c/get_notify.c.in b/ompi/mpi/c/get_notify.c.in new file mode 100644 index 00000000000..1bad16944ab --- /dev/null +++ b/ompi/mpi/c/get_notify.c.in @@ -0,0 +1,77 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 Triad National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include <stdio.h> + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/runtime/ompi_spc.h" + +PROTOTYPE ERROR_CLASS get_notify(BUFFER_OUT origin_addr, COUNT origin_count, + DATATYPE origin_datatype, INT target_rank, + AINT target_disp, COUNT target_count, + DATATYPE target_datatype, INT notification_idx, WIN win) +{ + int rc; + + SPC_RECORD(OMPI_SPC_GET_NOTIFY, 1); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS; + + rc = win->w_osc_module->osc_get_notify(origin_addr, origin_count, origin_datatype, + target_rank, target_disp, target_count, + target_datatype, notification_idx, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/put_notify.c.in b/ompi/mpi/c/put_notify.c.in new file mode 100644 index 00000000000..14ee5c7e365 --- /dev/null +++ b/ompi/mpi/c/put_notify.c.in @@ -0,0 +1,80 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* 
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 Triad National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include <stdio.h> + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/runtime/ompi_spc.h" + +PROTOTYPE ERROR_CLASS put_notify(BUFFER origin_addr, COUNT origin_count, DATATYPE origin_datatype, + INT target_rank, AINT target_disp, COUNT target_count, + DATATYPE target_datatype, INT notification_idx, WIN win) +{ + int rc; + + SPC_RECORD(OMPI_SPC_PUT_NOTIFY, 1); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if (NULL == target_datatype || + MPI_DATATYPE_NULL == target_datatype) { + rc = 
MPI_ERR_TYPE; + } else if ( MPI_WIN_FLAVOR_DYNAMIC != win->w_flavor && target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS; + + rc = win->w_osc_module->osc_put_notify(origin_addr, origin_count, origin_datatype, + target_rank, target_disp, target_count, + target_datatype, notification_idx, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_get_notify_value.c.in b/ompi/mpi/c/win_get_notify_value.c.in new file mode 100644 index 00000000000..228999c13ea --- /dev/null +++ b/ompi/mpi/c/win_get_notify_value.c.in @@ -0,0 +1,41 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2026 Triad National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +PROTOTYPE ERROR_CLASS win_get_notify_value(WIN win, INT notification_idx, ELEMENT_COUNT value) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else if (NULL == value) { + rc = MPI_ERR_ARG; + } + + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + rc = win->w_osc_module->osc_win_get_notify_value(win, notification_idx, value); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_reset_notify_value.c.in b/ompi/mpi/c/win_reset_notify_value.c.in new file mode 100644 index 00000000000..99aa1755a76 --- /dev/null +++ b/ompi/mpi/c/win_reset_notify_value.c.in @@ -0,0 +1,41 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2026 Triad National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +PROTOTYPE ERROR_CLASS win_reset_notify_value(WIN win, INT notification_idx, ELEMENT_COUNT value) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_WIN, FUNC_NAME); + } else if (notification_idx < 0) { + rc = MPI_ERR_NOTIFY_IDX; + } else if (NULL == value) { + rc = MPI_ERR_ARG; + } + + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + rc = win->w_osc_module->osc_win_reset_notify_value(win, notification_idx, value); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/fortran/mpif-h/request_get_status_f.c b/ompi/mpi/fortran/mpif-h/request_get_status_f.c index 7a5c9d57716..7fac2b2e051 100644 --- a/ompi/mpi/fortran/mpif-h/request_get_status_f.c +++ b/ompi/mpi/fortran/mpif-h/request_get_status_f.c @@ -12,6 +12,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -75,16 +76,11 @@ void ompi_request_get_status_f(MPI_Fint *request, ompi_fortran_logical_t *flag, MPI_Request c_req = PMPI_Request_f2c( *request ); OMPI_LOGICAL_NAME_DECL(flag); - /* This seems silly, but someone will do it */ - - if (OMPI_IS_FORTRAN_STATUS_IGNORE(status)) { - *flag = OMPI_INT_2_LOGICAL(0); - c_ierr = MPI_SUCCESS; - } else { - c_ierr = PMPI_Request_get_status(c_req, - OMPI_LOGICAL_SINGLE_NAME_CONVERT(flag), - &c_status); - OMPI_SINGLE_INT_2_LOGICAL(flag); + c_ierr = PMPI_Request_get_status(c_req, + OMPI_LOGICAL_SINGLE_NAME_CONVERT(flag), + &c_status); + OMPI_SINGLE_INT_2_LOGICAL(flag); + if (!OMPI_IS_FORTRAN_STATUS_IGNORE(status)) { PMPI_Status_c2f( &c_status, status ); } if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr); diff --git a/ompi/request/req_ft.c b/ompi/request/req_ft.c index 2c53ce076b0..e855afc59fd 100644 --- a/ompi/request/req_ft.c +++ b/ompi/request/req_ft.c @@ -128,7 +128,9 @@ bool ompi_request_is_failed_fn(ompi_request_t *req) req->req_status.MPI_ERROR = MPI_ERR_PROC_FAILED_PENDING; /* If it is a probe/mprobe, escalate the error */ if( (MCA_PML_REQUEST_MPROBE == pml_req->req_type) || - (MCA_PML_REQUEST_PROBE == pml_req->req_type) ) { + (MCA_PML_REQUEST_IMPROBE == pml_req->req_type) || + (MCA_PML_REQUEST_PROBE == pml_req->req_type) || + (MCA_PML_REQUEST_IPROBE == pml_req->req_type) ) { req->req_status.MPI_ERROR = MPI_ERR_PROC_FAILED; } opal_output_verbose(10, ompi_ftmpi_output_handle, diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index ad8a328dc55..08c6efaa616 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -24,6 +24,7 @@ * reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2026 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -281,14 +282,25 @@ int ompi_mpi_finalize(void) * communications/actions to complete. See * https://github.com/open-mpi/ompi/issues/1576 for the * original bug report. */ - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - OMPI_ERROR_LOG(ret); + rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active); + if (PMIX_SUCCESS != rc) { /* Reset the active flag to false, to avoid waiting for * completion when the fence was failed. */ active = false; + // can return operation_succeeded if atomically completed + if (PMIX_OPERATION_SUCCEEDED == rc) { + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + OMPI_ERROR_LOG(ret); + } + } else { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + /* NOTE: we lose the fence return status here. This can be + * a problem as the fence CAN fail. Might consider retrieving + * the returned status so you can respond if it doesn't + * successfully complete? */ } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } ompi_mpi_instance_finalize (&ompi_mpi_instance_default); diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index c7e61c5bf94..deea53cb02e 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -26,7 +26,7 @@ * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2026 Nanook Consulting All rights reserved. * Copyright (c) 2021-2022 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
@@ -464,12 +464,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); - if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, - fence_release, - (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; + rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + error = "PMIx_Fence_nb() failed"; + goto error; + } } } } else { @@ -482,12 +487,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL); rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); if( PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence() failed"; - goto error; + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + error = "PMIx_Fence_nb() failed"; + goto error; + } + } else { + /* cannot just wait on thread as we need to call opal_progress */ + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - /* cannot just wait on thread as we need to call opal_progress */ - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } @@ -537,7 +549,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, * we have to wait here for it to complete. However, there * is no reason to do two barriers! 
*/ if (background_fence) { - OMPI_LAZY_WAIT_FOR_COMPLETION(active); + if (active) { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); + } } else if (!ompi_async_mpi_init) { /* wait for everyone to reach this point - this is a hard * barrier requirement at this time, though we hope to relax @@ -546,13 +560,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, active = true; OPAL_POST_OBJECT(&active); PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL); - if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1, - fence_release, (void*)&active))) { - ret = opal_pmix_convert_status(rc); - error = "PMIx_Fence_nb() failed"; - goto error; + rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active); + if (PMIX_SUCCESS != rc) { + active = false; + if (PMIX_OPERATION_SUCCEEDED == rc) { + // can return operation_succeeded if atomically completed + ret = MPI_SUCCESS; + } else { + ret = opal_pmix_convert_status(rc); + error = "PMIx_Fence_nb() failed"; + goto error; + } + } else { + OMPI_LAZY_WAIT_FOR_COMPLETION(active); } - OMPI_LAZY_WAIT_FOR_COMPLETION(active); } } diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c747d55ee7d..7b5d1f3c55e 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -104,11 +104,12 @@ bool ompi_ftmpi_enabled = false; #endif /* OPAL_ENABLE_FT_MPI */ static int ompi_stream_buffering_mode = -1; +static int ompi_mpi_ft_verbose = 0; int ompi_comm_verbose_level = 0; int ompi_mpi_register_params(void) { - int value; + int value = 0; #if OPAL_ENABLE_FT_MPI mca_base_var_scope_t ftscope = MCA_BASE_VAR_SCOPE_READONLY; @@ -121,15 +122,14 @@ int ompi_mpi_register_params(void) "Enable UFLM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_4, ftscope, &ompi_ftmpi_enabled); - value = 0; (void) mca_base_var_register ("ompi", "mpi", "ft", "verbose", "Verbosity level of the ULFM MPI Fault Tolerance framework", MCA_BASE_VAR_TYPE_INT, NULL, 0, 
MCA_BASE_VAR_FLAG_SETTABLE, - OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &value); + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_mpi_ft_verbose); #if OPAL_ENABLE_FT_MPI - if( 0 < value ) { + if( 0 < ompi_mpi_ft_verbose ) { ompi_ftmpi_output_handle = opal_output_open(NULL); - opal_output_set_verbosity(ompi_ftmpi_output_handle, value); + opal_output_set_verbosity(ompi_ftmpi_output_handle, ompi_mpi_ft_verbose); } (void) ompi_comm_rbcast_register_params(); diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index 651cf9d0b5a..f94df4fbd5d 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -85,56 +85,20 @@ static int _setup_proc_session_dir(char **sdir); #define OPAL_PRINT_NAME_ARGS_MAX_SIZE 50 #define OPAL_PRINT_NAME_ARG_NUM_BUFS 16 -static bool fns_init=false; -static opal_tsd_tracked_key_t print_args_tsd_key; static char* opal_print_args_null = "NULL"; typedef struct { - char *buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS]; + char buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS][OPAL_PRINT_NAME_ARGS_MAX_SIZE + 1]; int cntr; } opal_print_args_buffers_t; -static void -buffer_cleanup(void *value) -{ - int i; - opal_print_args_buffers_t *ptr; - - if (NULL != value) { - ptr = (opal_print_args_buffers_t*)value; - for (i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) { - free(ptr->buffers[i]); - } - free (ptr); - } - fns_init = false; -} - static opal_print_args_buffers_t* get_print_name_buffer(void) { - opal_print_args_buffers_t *ptr; - int ret, i; - - if (!fns_init) { - /* setup the print_args function */ - OBJ_CONSTRUCT(&print_args_tsd_key, opal_tsd_tracked_key_t); - opal_tsd_tracked_key_set_destructor(&print_args_tsd_key, buffer_cleanup); - fns_init = true; - } - - ret = opal_tsd_tracked_key_get(&print_args_tsd_key, (void**)&ptr); - if (OPAL_SUCCESS != ret) return NULL; + static opal_thread_local opal_print_args_buffers_t name_buffer = { + .cntr = 0 + }; - if (NULL == ptr) { - ptr = (opal_print_args_buffers_t*)malloc(sizeof(opal_print_args_buffers_t)); - for 
(i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) { - ptr->buffers[i] = (char *) malloc((OPAL_PRINT_NAME_ARGS_MAX_SIZE+1) * sizeof(char)); - } - ptr->cntr = 0; - ret = opal_tsd_tracked_key_set(&print_args_tsd_key, (void*)ptr); - } - - return (opal_print_args_buffers_t*) ptr; + return &name_buffer; } static char* ompi_pmix_print_jobids(const opal_jobid_t job) @@ -1043,10 +1007,6 @@ int ompi_rte_finalize(void) opal_process_info.initial_errhandler = NULL; } - if (fns_init) { - OBJ_DESTRUCT(&print_args_tsd_key); - } - /* cleanup our internal nspace hack */ opal_pmix_finalize_nspace_tracker(); diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 6f1d8aa7d6a..dcbbe04b256 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -71,8 +71,10 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV, "The number of times MPI_Sendrecv was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_SENDRECV_REPLACE, "The number of times MPI_Sendrecv_replace was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_PUT, "The number of times MPI_Put was called.", false, false), + SET_COUNTER_ARRAY(OMPI_SPC_PUT_NOTIFY, "The number of times MPI_Put_notify was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_RPUT, "The number of times MPI_Rput was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_GET, "The number of times MPI_Get was called.", false, false), + SET_COUNTER_ARRAY(OMPI_SPC_GET_NOTIFY, "The number of times MPI_Get_notify was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_RGET, "The number of times MPI_Rget was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_PROBE, "The number of times MPI_Probe was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_IPROBE, "The number of times MPI_Iprobe was called.", false, false), diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 76ec7f25f16..3d0efd257b3 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ 
-58,8 +58,10 @@ typedef enum ompi_spc_counters { OMPI_SPC_SENDRECV, OMPI_SPC_SENDRECV_REPLACE, OMPI_SPC_PUT, + OMPI_SPC_PUT_NOTIFY, OMPI_SPC_RPUT, OMPI_SPC_GET, + OMPI_SPC_GET_NOTIFY, OMPI_SPC_RGET, OMPI_SPC_PROBE, OMPI_SPC_IPROBE, diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am index c0cdf788e8d..9aed69bfb7f 100644 --- a/opal/mca/btl/smcuda/Makefile.am +++ b/opal/mca/btl/smcuda/Makefile.am @@ -46,15 +46,11 @@ component_noinst = libmca_btl_smcuda.la component_install = endif -# See opal/mca/common/cuda/Makefile.am for an explanation of -# libmca_common_sm.la. - mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources) mca_btl_smcuda_la_LDFLAGS = -module -avoid-version $(btl_smcuda_LDFLAGS) mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \ $(btl_smcuda_LIBS) mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 1ce2b966ece..e832c8ed81e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -235,7 +235,6 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s free(loc); } else { /* If we have hwloc support, then get accurate information */ - loc = NULL; if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) { rc = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); @@ -249,6 +248,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s mca_btl_smcuda_component.num_mem_nodes = rc; } } + loc = NULL; /* see if we were given our location */ OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &OPAL_PROC_MY_NAME, &loc, PMIX_STRING); if (OPAL_SUCCESS == rc) { @@ -267,6 +267,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, 
int32_t my_s free(mynuma); } free(loc); + loc = NULL; } } else { /* If we have hwloc support, then get accurate information */ diff --git a/opal/mca/btl/smcuda/configure.m4 b/opal/mca/btl/smcuda/configure.m4 new file mode 100644 index 00000000000..e9cb2df2996 --- /dev/null +++ b/opal/mca/btl/smcuda/configure.m4 @@ -0,0 +1,29 @@ +# Copyright (c) 2024 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator BTL. This assumes the discovery has already been done. +# +# Beware: unlike what the name seems to indicate, this BTL is generic and used by +# all accelerators. + +AC_DEFUN([MCA_opal_btl_smcuda_CONFIG],[ + AC_CONFIG_FILES([opal/mca/btl/smcuda/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. + AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/opal/mca/rcache/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4 new file mode 100644 index 00000000000..d721910500e --- /dev/null +++ b/opal/mca/rcache/gpusm/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator rcache component. +# +AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[ + + AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. 
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/opal/mca/rcache/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4 new file mode 100644 index 00000000000..f5e3eda0154 --- /dev/null +++ b/opal/mca/rcache/rgpusm/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator rcache component. +# +AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[ + + AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. + AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/opal/mca/smsc/accelerator/configure.m4 b/opal/mca/smsc/accelerator/configure.m4 new file mode 100644 index 00000000000..9fa993e9cf5 --- /dev/null +++ b/opal/mca/smsc/accelerator/configure.m4 @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If any accelerators have been discovered, then build support for the +# accelerator SMSC component. +# +AC_DEFUN([MCA_opal_smsc_accelerator_CONFIG],[ + + AC_CONFIG_FILES([opal/mca/smsc/accelerator/Makefile]) + + # This component shall be configured only after the accelerator discovery + # has been completed. This discovery is part of the OPAL accelerator framework. 
+ AC_MSG_CHECKING([if any accelerator components were found (cuda, rocm, ze)]) + AS_IF([test "x$OMPI_HAVE_ACCELERATOR_SUPPORT" = "x1"], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + +])dnl diff --git a/oshmem/mca/memheap/base/memheap_base_frame.c b/oshmem/mca/memheap/base/memheap_base_frame.c index 53a71b27a9e..82658e09791 100644 --- a/oshmem/mca/memheap/base/memheap_base_frame.c +++ b/oshmem/mca/memheap/base/memheap_base_frame.c @@ -33,9 +33,9 @@ int mca_memheap_base_output = -1; int mca_memheap_base_key_exchange = 1; -opal_list_t mca_memheap_base_components_opened = {{0}}; +opal_list_t mca_memheap_base_components_opened = {}; int mca_memheap_base_already_opened = 0; -mca_memheap_map_t mca_memheap_base_map = {{{{0}}}}; +mca_memheap_map_t mca_memheap_base_map = {}; int mca_memheap_num_segments_warn = 32; static int mca_memheap_base_register(mca_base_register_flag_t flags) diff --git a/oshmem/shmem/c/shmem_put_nb.c b/oshmem/shmem/c/shmem_put_nb.c index 89e4bf18240..cef6abcc40b 100644 --- a/oshmem/shmem/c/shmem_put_nb.c +++ b/oshmem/shmem/c/shmem_put_nb.c @@ -11,6 +11,7 @@ #include "oshmem/constants.h" #include "oshmem/include/shmem.h" +#include "oshmem/include/shmemx.h" #include "oshmem/runtime/runtime.h"