From 0ec1e957ba6bf98fbc99c36757a1a4087d488d07 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Tue, 17 Feb 2026 11:28:23 +0900 Subject: [PATCH] DAOS-18552 pool: Fix a PS start-stop race The following race happened during a pool create operation, triggered by abnormally slow VMs: ds_rsvc_start start pool_svc_alloc_cb ds_pool_lookup: OK ....VM slowness causes start timeout, which triggers stop.... ds_pool_stop pool->sp_stopping = 1 ds_pool_svc_stop: none insert wait for ds_pool references: hang This patch is a quick fix that prevents ds_rsvc_start from inserting a PS into the hash table if the ds_pool is stopping, so that ds_pool_stop won't hang. Manual testing shows that such a pool create operation will now retry and succeed transparently. Signed-off-by: Li Wei --- src/include/daos_srv/rsvc.h | 5 ++++- src/pool/srv_pool.c | 17 +++++++++++++++++ src/rsvc/srv.c | 11 ++++++++++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/include/daos_srv/rsvc.h b/src/include/daos_srv/rsvc.h index 7f66d66b329..d2105222380 100644 --- a/src/include/daos_srv/rsvc.h +++ b/src/include/daos_srv/rsvc.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -53,6 +53,9 @@ struct ds_rsvc_class { */ void (*sc_free)(struct ds_rsvc *svc); + /** Prepare for being inserted into the hash table. */ + int (*sc_insert)(struct ds_rsvc *svc); + /** * Bootstrap (i.e., initialize) the DB with the argument passed to * ds_rsvc_start. If supplied, this is called on a self-only service. 
diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index b7ada57c251..c2a59cec0a9 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1910,6 +1910,20 @@ pool_svc_free_cb(struct ds_rsvc *rsvc) D_FREE(svc); } +static int +pool_svc_insert_cb(struct ds_rsvc *rsvc) +{ + struct pool_svc *svc = pool_svc_obj(rsvc); + + /* + * While we were starting svc, a ds_pool_stop call may have begun + * waiting for us to put svc->ps_pool; if so, return -DER_CANCELED. + */ + if (svc->ps_pool->sp_stopping) + return -DER_CANCELED; + return 0; +} + /* * Update svc->ps_pool with map_buf and map_version. This ensures that * svc->ps_pool matches the latest pool map. @@ -2727,16 +2741,19 @@ pool_svc_map_dist_cb(struct ds_rsvc *rsvc, uint32_t *version) return rc; } +/* clang-format off */ static struct ds_rsvc_class pool_svc_rsvc_class = { .sc_name = pool_svc_name_cb, .sc_locate = pool_svc_locate_cb, .sc_alloc = pool_svc_alloc_cb, .sc_free = pool_svc_free_cb, + .sc_insert = pool_svc_insert_cb, .sc_step_up = pool_svc_step_up_cb, .sc_step_down = pool_svc_step_down_cb, .sc_drain = pool_svc_drain_cb, .sc_map_dist = pool_svc_map_dist_cb }; +/* clang-format on */ void ds_pool_rsvc_class_register(void) diff --git a/src/rsvc/srv.c b/src/rsvc/srv.c index 3f2b599eb2b..6cedab28ba8 100644 --- a/src/rsvc/srv.c +++ b/src/rsvc/srv.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1042,10 +1042,19 @@ ds_rsvc_start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t if (rc != 0) goto out; + if (rsvc_class(class)->sc_insert != NULL) { + rc = rsvc_class(class)->sc_insert(svc); + if (rc != 0) { + D_DEBUG(DB_MD, "%s: sc_insert: " DF_RC "\n", svc->s_name, DP_RC(rc)); + goto err_svc_started; + } + } + rc = d_hash_rec_insert(&rsvc_hash, svc->s_id.iov_buf, svc->s_id.iov_len, &svc->s_entry, true /* exclusive */); if (rc != 0) { D_DEBUG(DB_MD, "%s: insert: "DF_RC"\n", svc->s_name, DP_RC(rc)); +err_svc_started: stop(svc, mode == DS_RSVC_CREATE /* destroy */); goto out; }