From b951002cb85c7df87ac8e77e9fe41d43e07396ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arnaud=20He=CC=81ritier?= <aheritier@apache.org>
Date: Sun, 15 Feb 2026 23:47:20 +0100
Subject: [PATCH] fix(#1743): retry primary model on transient errors even
 without fallback models

getEffectiveRetries returned 0 when no fallback models were configured,
meaning retryable errors (5xx, timeouts) got zero retries. This caused
Anthropic streaming 'Internal server error' to immediately surface as
'all models failed' instead of being retried with backoff.

Changed getEffectiveRetries to always return DefaultFallbackRetries (2)
when no explicit retry count is configured, regardless of whether
fallback models exist. Retrying the same model on transient errors is
always valuable.

Fixes #1743

Assisted-By: cagent
---
 pkg/runtime/fallback.go      | 10 +++----
 pkg/runtime/fallback_test.go | 52 ++++++++++++++++++++++++++++++++++--
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/pkg/runtime/fallback.go b/pkg/runtime/fallback.go
index 863d666b1..51159e598 100644
--- a/pkg/runtime/fallback.go
+++ b/pkg/runtime/fallback.go
@@ -369,9 +369,10 @@ func getEffectiveCooldown(a *agent.Agent) time.Duration {
 }
 
 // getEffectiveRetries returns the number of retries to use for the agent.
-// If no retries are explicitly configured (retries == 0) and fallback models
-// are configured, returns DefaultFallbackRetries to provide sensible retry
-// behavior out of the box.
+// If no retries are explicitly configured (retries == 0), returns
+// DefaultFallbackRetries to provide sensible retry behavior out of the box.
+// Retries apply to retryable errors (5xx, timeouts) on the same model,
+// regardless of whether fallback models are configured.
 //
 // Note: Users who explicitly want 0 retries can set retries: -1 in their config
 // (though this is an edge case - most users want some retries for resilience).
@@ -381,8 +382,7 @@ func getEffectiveRetries(a *agent.Agent) int {
 	if retries < 0 {
 		return 0
 	}
-	// 0 means "use default" when fallback models are configured
-	if retries == 0 && len(a.FallbackModels()) > 0 {
+	if retries == 0 {
 		return DefaultFallbackRetries
 	}
 	return retries
diff --git a/pkg/runtime/fallback_test.go b/pkg/runtime/fallback_test.go
index 31d133fff..658c0bbc0 100644
--- a/pkg/runtime/fallback_test.go
+++ b/pkg/runtime/fallback_test.go
@@ -181,6 +181,11 @@ func TestIsRetryableModelError(t *testing.T) {
 			err:      errors.New("something weird happened"),
 			expected: false,
 		},
+		{
+			name:     "anthropic streaming internal server error",
+			err:      fmt.Errorf("error receiving from stream: %w", errors.New(`received error while streaming: {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"},"request_id":"req_test"}`)),
+			expected: true,
+		},
 	}
 
 	for _, tt := range tests {
@@ -719,12 +724,12 @@ func TestGetEffectiveRetries(t *testing.T) {
 	mockModel := &mockProvider{id: "test/model", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
 	mockFallback := &mockProvider{id: "test/fallback", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
 
-	// Agent with no retries configured and no fallback models should return 0
+	// Agent with no retries configured and no fallback models should still get default retries
 	agentNoFallback := agent.New("no-fallback", "test",
 		agent.WithModel(mockModel),
 	)
 	retries := getEffectiveRetries(agentNoFallback)
-	assert.Equal(t, 0, retries, "no fallback models = no retries (nothing to retry to)")
+	assert.Equal(t, DefaultFallbackRetries, retries, "no fallback models should still get default retries for transient errors")
 
 	// Agent with no retries configured but with fallback models should use default
 	agentWithFallback := agent.New("with-fallback", "test",
@@ -877,6 +882,49 @@ func TestFallbackModelsClonedWithThinkingEnabled(t *testing.T) {
 	})
 }
 
+func TestPrimaryRetriesWithoutFallbackModels(t *testing.T) {
+	synctest.Test(t, func(t *testing.T) {
+		// Primary fails twice with retryable error (mimics Anthropic streaming internal
+		// server error), then succeeds. No fallback models are configured.
+		successStream := newStreamBuilder().
+			AddContent("Success after transient failures").
+			AddStopWithUsage(10, 5).
+			Build()
+		primary := &countingProvider{
+			id:        "primary/counting",
+			failCount: 2,
+			err:       errors.New(`error receiving from stream: received error while streaming: {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}`),
+			stream:    successStream,
+		}
+
+		root := agent.New("root", "test",
+			agent.WithModel(primary),
+			// No fallback models
+		)
+
+		tm := team.New(team.WithAgents(root))
+		rt, err := NewLocalRuntime(tm, WithSessionCompaction(false), WithModelStore(mockModelStore{}))
+		require.NoError(t, err)
+
+		sess := session.New(session.WithUserMessage("test"))
+		sess.Title = "No Fallback Retry Test"
+
+		events := rt.RunStream(t.Context(), sess)
+
+		var gotContent bool
+		for ev := range events {
+			if choice, ok := ev.(*AgentChoiceEvent); ok {
+				if choice.Content == "Success after transient failures" {
+					gotContent = true
+				}
+			}
+		}
+
+		assert.True(t, gotContent, "should recover from transient errors even without fallback models")
+		assert.Equal(t, 3, primary.callCount, "primary should be called 3 times (2 failures + 1 success)")
+	})
+}
+
 // Verify interface compliance
 var (
 	_ provider.Provider = (*mockProvider)(nil)