From 17363edf253499751ce4aba1f4b9ce2a45b7438d Mon Sep 17 00:00:00 2001 From: Luis Pater Date: Mon, 30 Mar 2026 22:22:42 +0800 Subject: [PATCH] fix(auth): skip downtime for request-scoped 404 errors in model state management --- sdk/cliproxy/auth/conductor.go | 157 ++++++++++-------- sdk/cliproxy/auth/conductor_overrides_test.go | 113 +++++++++++++ 2 files changed, 204 insertions(+), 66 deletions(-) diff --git a/sdk/cliproxy/auth/conductor.go b/sdk/cliproxy/auth/conductor.go index 64c110dc..61f32278 100644 --- a/sdk/cliproxy/auth/conductor.go +++ b/sdk/cliproxy/auth/conductor.go @@ -1734,77 +1734,79 @@ func (m *Manager) MarkResult(ctx context.Context, result Result) { } } else { if result.Model != "" { - state := ensureModelState(auth, result.Model) - state.Unavailable = true - state.Status = StatusError - state.UpdatedAt = now - if result.Error != nil { - state.LastError = cloneError(result.Error) - state.StatusMessage = result.Error.Message - auth.LastError = cloneError(result.Error) - auth.StatusMessage = result.Error.Message - } + if !isRequestScopedNotFoundResultError(result.Error) { + state := ensureModelState(auth, result.Model) + state.Unavailable = true + state.Status = StatusError + state.UpdatedAt = now + if result.Error != nil { + state.LastError = cloneError(result.Error) + state.StatusMessage = result.Error.Message + auth.LastError = cloneError(result.Error) + auth.StatusMessage = result.Error.Message + } - statusCode := statusCodeFromResult(result.Error) - if isModelSupportResultError(result.Error) { - next := now.Add(12 * time.Hour) - state.NextRetryAfter = next - suspendReason = "model_not_supported" - shouldSuspendModel = true - } else { - switch statusCode { - case 401: - next := now.Add(30 * time.Minute) - state.NextRetryAfter = next - suspendReason = "unauthorized" - shouldSuspendModel = true - case 402, 403: - next := now.Add(30 * time.Minute) - state.NextRetryAfter = next - suspendReason = "payment_required" - shouldSuspendModel = true - case 404: + statusCode := statusCodeFromResult(result.Error) + if isModelSupportResultError(result.Error) { next := now.Add(12 * time.Hour) state.NextRetryAfter = next - suspendReason = "not_found" + suspendReason = "model_not_supported" shouldSuspendModel = true - case 429: - var next time.Time - backoffLevel := state.Quota.BackoffLevel - if result.RetryAfter != nil { - next = now.Add(*result.RetryAfter) - } else { - cooldown, nextLevel := nextQuotaCooldown(backoffLevel, quotaCooldownDisabledForAuth(auth)) - if cooldown > 0 { - next = now.Add(cooldown) - } - backoffLevel = nextLevel - } - state.NextRetryAfter = next - state.Quota = QuotaState{ - Exceeded: true, - Reason: "quota", - NextRecoverAt: next, - BackoffLevel: backoffLevel, - } - suspendReason = "quota" - shouldSuspendModel = true - setModelQuota = true - case 408, 500, 502, 503, 504: - if quotaCooldownDisabledForAuth(auth) { - state.NextRetryAfter = time.Time{} - } else { - next := now.Add(1 * time.Minute) + } else { + switch statusCode { + case 401: + next := now.Add(30 * time.Minute) state.NextRetryAfter = next + suspendReason = "unauthorized" + shouldSuspendModel = true + case 402, 403: + next := now.Add(30 * time.Minute) + state.NextRetryAfter = next + suspendReason = "payment_required" + shouldSuspendModel = true + case 404: + next := now.Add(12 * time.Hour) + state.NextRetryAfter = next + suspendReason = "not_found" + shouldSuspendModel = true + case 429: + var next time.Time + backoffLevel := state.Quota.BackoffLevel + if result.RetryAfter != nil { + next = now.Add(*result.RetryAfter) + } else { + cooldown, nextLevel := nextQuotaCooldown(backoffLevel, quotaCooldownDisabledForAuth(auth)) + if cooldown > 0 { + next = now.Add(cooldown) + } + backoffLevel = nextLevel + } + state.NextRetryAfter = next + state.Quota = QuotaState{ + Exceeded: true, + Reason: "quota", + NextRecoverAt: next, + BackoffLevel: backoffLevel, + } + suspendReason = "quota" + shouldSuspendModel = true + setModelQuota = true + case 408, 500, 502, 503, 504: + if quotaCooldownDisabledForAuth(auth) { + state.NextRetryAfter = time.Time{} + } else { + next := now.Add(1 * time.Minute) + state.NextRetryAfter = next + } + default: + state.NextRetryAfter = time.Time{} } - default: - state.NextRetryAfter = time.Time{} } - } - auth.Status = StatusError - auth.UpdatedAt = now - updateAggregatedAvailability(auth, now) + auth.Status = StatusError + auth.UpdatedAt = now + updateAggregatedAvailability(auth, now) + } } else { applyAuthFailureState(auth, result.Error, result.RetryAfter, now) } @@ -2056,11 +2058,29 @@ func isModelSupportResultError(err *Error) bool { return isModelSupportErrorMessage(err.Message) } +func isRequestScopedNotFoundMessage(message string) bool { + if message == "" { + return false + } + lower := strings.ToLower(message) + return strings.Contains(lower, "item with id") && + strings.Contains(lower, "not found") && + strings.Contains(lower, "items are not persisted when `store` is set to false") +} + +func isRequestScopedNotFoundResultError(err *Error) bool { + if err == nil || statusCodeFromResult(err) != http.StatusNotFound { + return false + } + return isRequestScopedNotFoundMessage(err.Message) +} + // isRequestInvalidError returns true if the error represents a client request // error that should not be retried. Specifically, it treats 400 responses with -// "invalid_request_error" and all 422 responses as request-shape failures, -// where switching auths or pooled upstream models will not help. Model-support -// errors are excluded so routing can fall through to another auth or upstream. +// "invalid_request_error", request-scoped 404 item misses caused by `store=false`, +// and all 422 responses as request-shape failures, where switching auths or +// pooled upstream models will not help. Model-support errors are excluded so +// routing can fall through to another auth or upstream. func isRequestInvalidError(err error) bool { if err == nil { return false @@ -2072,6 +2092,8 @@ func isRequestInvalidError(err error) bool { switch status { case http.StatusBadRequest: return strings.Contains(err.Error(), "invalid_request_error") + case http.StatusNotFound: + return isRequestScopedNotFoundMessage(err.Error()) case http.StatusUnprocessableEntity: return true default: @@ -2083,6 +2105,9 @@ func applyAuthFailureState(auth *Auth, resultErr *Error, retryAfter *time.Durati if auth == nil { return } + if isRequestScopedNotFoundResultError(resultErr) { + return + } auth.Unavailable = true auth.Status = StatusError auth.UpdatedAt = now diff --git a/sdk/cliproxy/auth/conductor_overrides_test.go b/sdk/cliproxy/auth/conductor_overrides_test.go index 3ad0ce67..50915ce0 100644 --- a/sdk/cliproxy/auth/conductor_overrides_test.go +++ b/sdk/cliproxy/auth/conductor_overrides_test.go @@ -12,6 +12,8 @@ import ( cliproxyexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor" ) +const requestScopedNotFoundMessage = "Item with id 'rs_0b5f3eb6f51f175c0169ca74e4a85881998539920821603a74' not found. Items are not persisted when `store` is set to false. Try again with `store` set to true, or remove this item from your input." + func TestManager_ShouldRetryAfterError_RespectsAuthRequestRetryOverride(t *testing.T) { m := NewManager(nil, nil, nil) m.SetRetryConfig(3, 30*time.Second, 0) @@ -447,3 +449,114 @@ func TestManager_MarkResult_RespectsAuthDisableCoolingOverride(t *testing.T) { t.Fatalf("expected NextRetryAfter to be zero when disable_cooling=true, got %v", state.NextRetryAfter) } } + +func TestManager_MarkResult_RequestScopedNotFoundDoesNotCooldownAuth(t *testing.T) { + m := NewManager(nil, nil, nil) + + auth := &Auth{ + ID: "auth-1", + Provider: "openai", + } + if _, errRegister := m.Register(context.Background(), auth); errRegister != nil { + t.Fatalf("register auth: %v", errRegister) + } + + model := "gpt-4.1" + m.MarkResult(context.Background(), Result{ + AuthID: auth.ID, + Provider: auth.Provider, + Model: model, + Success: false, + Error: &Error{ + HTTPStatus: http.StatusNotFound, + Message: requestScopedNotFoundMessage, + }, + }) + + updated, ok := m.GetByID(auth.ID) + if !ok || updated == nil { + t.Fatalf("expected auth to be present") + } + if updated.Unavailable { + t.Fatalf("expected request-scoped 404 to keep auth available") + } + if !updated.NextRetryAfter.IsZero() { + t.Fatalf("expected request-scoped 404 to keep auth cooldown unset, got %v", updated.NextRetryAfter) + } + if state := updated.ModelStates[model]; state != nil { + t.Fatalf("expected request-scoped 404 to avoid model cooldown state, got %#v", state) + } +} + +func TestManager_RequestScopedNotFoundStopsRetryWithoutSuspendingAuth(t *testing.T) { + m := NewManager(nil, nil, nil) + executor := &authFallbackExecutor{ + id: "openai", + executeErrors: map[string]error{ + "aa-bad-auth": &Error{ + HTTPStatus: http.StatusNotFound, + Message: requestScopedNotFoundMessage, + }, + }, + } + m.RegisterExecutor(executor) + + model := "gpt-4.1" + badAuth := &Auth{ID: "aa-bad-auth", Provider: "openai"} + goodAuth := &Auth{ID: "bb-good-auth", Provider: "openai"} + + reg := registry.GetGlobalRegistry() + reg.RegisterClient(badAuth.ID, "openai", []*registry.ModelInfo{{ID: model}}) + reg.RegisterClient(goodAuth.ID, "openai", []*registry.ModelInfo{{ID: model}}) + t.Cleanup(func() { + reg.UnregisterClient(badAuth.ID) + reg.UnregisterClient(goodAuth.ID) + }) + + if _, errRegister := m.Register(context.Background(), badAuth); errRegister != nil { + t.Fatalf("register bad auth: %v", errRegister) + } + if _, errRegister := m.Register(context.Background(), goodAuth); errRegister != nil { + t.Fatalf("register good auth: %v", errRegister) + } + + _, errExecute := m.Execute(context.Background(), []string{"openai"}, cliproxyexecutor.Request{Model: model}, cliproxyexecutor.Options{}) + if errExecute == nil { + t.Fatal("expected request-scoped not-found error") + } + errResult, ok := errExecute.(*Error) + if !ok { + t.Fatalf("expected *Error, got %T", errExecute) + } + if errResult.HTTPStatus != http.StatusNotFound { + t.Fatalf("status = %d, want %d", errResult.HTTPStatus, http.StatusNotFound) + } + if errResult.Message != requestScopedNotFoundMessage { + t.Fatalf("message = %q, want %q", errResult.Message, requestScopedNotFoundMessage) + } + + got := executor.ExecuteCalls() + want := []string{badAuth.ID} + if len(got) != len(want) { + t.Fatalf("execute calls = %v, want %v", got, want) + } + for i := range want { + if got[i] != want[i] { + t.Fatalf("execute call %d auth = %q, want %q", i, got[i], want[i]) + } + } + + updatedBad, ok := m.GetByID(badAuth.ID) + if !ok || updatedBad == nil { + t.Fatalf("expected bad auth to remain registered") + } + if updatedBad.Unavailable { + t.Fatalf("expected request-scoped 404 to keep bad auth available") + } + if !updatedBad.NextRetryAfter.IsZero() { + t.Fatalf("expected request-scoped 404 to keep bad auth cooldown unset, got %v", updatedBad.NextRetryAfter) + } + if state := updatedBad.ModelStates[model]; state != nil { + t.Fatalf("expected request-scoped 404 to avoid bad auth model cooldown state, got %#v", state) + } +}