diff --git a/src/control/lib/control/system.go b/src/control/lib/control/system.go index 5c5fd8e4eb0..3c77374d932 100644 --- a/src/control/lib/control/system.go +++ b/src/control/lib/control/system.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -1318,6 +1318,7 @@ func SystemRebuildManage(ctx context.Context, rpcClient UnaryInvoker, req *Syste type SystemSelfHealEvalReq struct { unaryRequest msRequest + retryableRequest } // SystemSelfHealEvalResp contains the response. @@ -1341,6 +1342,10 @@ func SystemSelfHealEval(ctx context.Context, rpcClient UnaryInvoker, req *System req.setRPC(func(ctx context.Context, conn *grpc.ClientConn) (proto.Message, error) { return mgmtpb.NewMgmtSvcClient(conn).SystemSelfHealEval(ctx, pbReq) }) + req.retryTestFn = func(err error, _ uint) bool { + return (system.IsUnavailable(err) || IsRetryableConnErr(err) || + system.IsNotLeader(err) || system.IsNotReplica(err)) + } rpcClient.Debugf("DAOS system self-heal eval request: %s", pbUtil.Debug(pbReq)) ur, err := rpcClient.InvokeUnaryRPC(ctx, req) diff --git a/src/control/system/errors.go b/src/control/system/errors.go index 509bee13906..335a255bf2f 100644 --- a/src/control/system/errors.go +++ b/src/control/system/errors.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -17,6 +17,8 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" "github.com/daos-stack/daos/src/control/lib/ranklist" ) @@ -39,8 +41,10 @@ func IsUnavailable(err error) bool { if err == nil { return false } - cause := errors.Cause(err).Error() - return strings.Contains(cause, ErrRaftUnavail.Error()) || strings.Contains(cause, ErrLeaderStepUpInProgress.Error()) + cause := errors.Cause(err) + return strings.Contains(cause.Error(), ErrRaftUnavail.Error()) || + strings.Contains(cause.Error(), ErrLeaderStepUpInProgress.Error()) || + fault.IsFaultCode(cause, code.ServerDataPlaneNotStarted) } // IsEmptyGroupMap returns a boolean indicating whether or not the