fix(auth): tighten registry model reconciliation
This commit is contained in:
@@ -233,23 +233,19 @@ func (m *Manager) RefreshSchedulerEntry(authID string) {
|
|||||||
m.scheduler.upsertAuth(snapshot)
|
m.scheduler.upsertAuth(snapshot)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReconcileRegistryModelStates clears stale per-model runtime failures for
|
// ReconcileRegistryModelStates aligns per-model runtime state with the current
|
||||||
// models that are currently registered for the auth in the global model registry.
|
// registry snapshot for one auth.
|
||||||
//
|
//
|
||||||
// This keeps the scheduler and the global registry aligned after model
|
// Supported models are reset to a clean state because re-registration already
|
||||||
// re-registration. Without this reconciliation, a model can reappear in
|
// cleared the registry-side cooldown/suspension snapshot. ModelStates for
|
||||||
// /v1/models after registry refresh while the scheduler still blocks it because
|
// models that are no longer present in the registry are pruned entirely so
|
||||||
// auth.ModelStates retained an older failure such as not_found or quota.
|
// renamed/removed models cannot keep auth-level status stale.
|
||||||
func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID string) {
|
func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID string) {
|
||||||
if m == nil || authID == "" {
|
if m == nil || authID == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
supportedModels := registry.GetGlobalRegistry().GetModelsForClient(authID)
|
supportedModels := registry.GetGlobalRegistry().GetModelsForClient(authID)
|
||||||
if len(supportedModels) == 0 {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
supported := make(map[string]struct{}, len(supportedModels))
|
supported := make(map[string]struct{}, len(supportedModels))
|
||||||
for _, model := range supportedModels {
|
for _, model := range supportedModels {
|
||||||
if model == nil {
|
if model == nil {
|
||||||
@@ -261,9 +257,6 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
|
|||||||
}
|
}
|
||||||
supported[modelKey] = struct{}{}
|
supported[modelKey] = struct{}{}
|
||||||
}
|
}
|
||||||
if len(supported) == 0 {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var snapshot *Auth
|
var snapshot *Auth
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
@@ -273,14 +266,19 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
|
|||||||
if ok && auth != nil && len(auth.ModelStates) > 0 {
|
if ok && auth != nil && len(auth.ModelStates) > 0 {
|
||||||
changed := false
|
changed := false
|
||||||
for modelKey, state := range auth.ModelStates {
|
for modelKey, state := range auth.ModelStates {
|
||||||
if state == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
baseModel := canonicalModelKey(modelKey)
|
baseModel := canonicalModelKey(modelKey)
|
||||||
if baseModel == "" {
|
if baseModel == "" {
|
||||||
baseModel = strings.TrimSpace(modelKey)
|
baseModel = strings.TrimSpace(modelKey)
|
||||||
}
|
}
|
||||||
if _, supportedModel := supported[baseModel]; !supportedModel {
|
if _, supportedModel := supported[baseModel]; !supportedModel {
|
||||||
|
// Drop state for models that disappeared from the current registry
|
||||||
|
// snapshot. Keeping them around leaks stale errors into auth-level
|
||||||
|
// status, management output, and websocket fallback checks.
|
||||||
|
delete(auth.ModelStates, modelKey)
|
||||||
|
changed = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if state == nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if modelStateIsClean(state) {
|
if modelStateIsClean(state) {
|
||||||
@@ -289,6 +287,9 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
|
|||||||
resetModelState(state, now)
|
resetModelState(state, now)
|
||||||
changed = true
|
changed = true
|
||||||
}
|
}
|
||||||
|
if len(auth.ModelStates) == 0 {
|
||||||
|
auth.ModelStates = nil
|
||||||
|
}
|
||||||
if changed {
|
if changed {
|
||||||
updateAggregatedAvailability(auth, now)
|
updateAggregatedAvailability(auth, now)
|
||||||
if !hasModelError(auth, now) {
|
if !hasModelError(auth, now) {
|
||||||
@@ -297,7 +298,9 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
|
|||||||
auth.Status = StatusActive
|
auth.Status = StatusActive
|
||||||
}
|
}
|
||||||
auth.UpdatedAt = now
|
auth.UpdatedAt = now
|
||||||
_ = m.persist(ctx, auth)
|
if errPersist := m.persist(ctx, auth); errPersist != nil {
|
||||||
|
logEntryWithRequestID(ctx).WithField("auth_id", auth.ID).Warnf("failed to persist auth changes during model state reconciliation: %v", errPersist)
|
||||||
|
}
|
||||||
snapshot = auth.Clone()
|
snapshot = auth.Clone()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1827,7 +1830,11 @@ func modelStateIsClean(state *ModelState) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func updateAggregatedAvailability(auth *Auth, now time.Time) {
|
func updateAggregatedAvailability(auth *Auth, now time.Time) {
|
||||||
if auth == nil || len(auth.ModelStates) == 0 {
|
if auth == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(auth.ModelStates) == 0 {
|
||||||
|
clearAggregatedAvailability(auth)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
allUnavailable := true
|
allUnavailable := true
|
||||||
@@ -1835,10 +1842,12 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
|
|||||||
quotaExceeded := false
|
quotaExceeded := false
|
||||||
quotaRecover := time.Time{}
|
quotaRecover := time.Time{}
|
||||||
maxBackoffLevel := 0
|
maxBackoffLevel := 0
|
||||||
|
hasState := false
|
||||||
for _, state := range auth.ModelStates {
|
for _, state := range auth.ModelStates {
|
||||||
if state == nil {
|
if state == nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
hasState = true
|
||||||
stateUnavailable := false
|
stateUnavailable := false
|
||||||
if state.Status == StatusDisabled {
|
if state.Status == StatusDisabled {
|
||||||
stateUnavailable = true
|
stateUnavailable = true
|
||||||
@@ -1868,6 +1877,10 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if !hasState {
|
||||||
|
clearAggregatedAvailability(auth)
|
||||||
|
return
|
||||||
|
}
|
||||||
auth.Unavailable = allUnavailable
|
auth.Unavailable = allUnavailable
|
||||||
if allUnavailable {
|
if allUnavailable {
|
||||||
auth.NextRetryAfter = earliestRetry
|
auth.NextRetryAfter = earliestRetry
|
||||||
@@ -1887,6 +1900,15 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func clearAggregatedAvailability(auth *Auth) {
|
||||||
|
if auth == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
auth.Unavailable = false
|
||||||
|
auth.NextRetryAfter = time.Time{}
|
||||||
|
auth.Quota = QuotaState{}
|
||||||
|
}
|
||||||
|
|
||||||
func hasModelError(auth *Auth, now time.Time) bool {
|
func hasModelError(auth *Auth, now time.Time) bool {
|
||||||
if auth == nil || len(auth.ModelStates) == 0 {
|
if auth == nil || len(auth.ModelStates) == 0 {
|
||||||
return false
|
return false
|
||||||
|
|||||||
@@ -0,0 +1,182 @@
|
|||||||
|
package auth
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"net/http"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cliproxyexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestManager_ReconcileRegistryModelStates_ClearsStaleSupportedModelErrors(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
manager := NewManager(nil, &RoundRobinSelector{}, nil)
|
||||||
|
|
||||||
|
auth := &Auth{
|
||||||
|
ID: "reconcile-auth",
|
||||||
|
Provider: "codex",
|
||||||
|
ModelStates: map[string]*ModelState{
|
||||||
|
"gpt-5.4": {
|
||||||
|
Status: StatusError,
|
||||||
|
StatusMessage: "not_found",
|
||||||
|
Unavailable: true,
|
||||||
|
NextRetryAfter: time.Now().Add(12 * time.Hour),
|
||||||
|
LastError: &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
|
||||||
|
t.Fatalf("register auth: %v", errRegister)
|
||||||
|
}
|
||||||
|
|
||||||
|
registerSchedulerModels(t, "codex", "gpt-5.4", auth.ID)
|
||||||
|
manager.RefreshSchedulerEntry(auth.ID)
|
||||||
|
|
||||||
|
got, errPick := manager.scheduler.pickSingle(ctx, "codex", "gpt-5.4", cliproxyexecutor.Options{}, nil)
|
||||||
|
var authErr *Error
|
||||||
|
if !errors.As(errPick, &authErr) || authErr == nil {
|
||||||
|
t.Fatalf("pickSingle() before reconcile error = %v, want auth_unavailable", errPick)
|
||||||
|
}
|
||||||
|
if authErr.Code != "auth_unavailable" {
|
||||||
|
t.Fatalf("pickSingle() before reconcile code = %q, want %q", authErr.Code, "auth_unavailable")
|
||||||
|
}
|
||||||
|
if got != nil {
|
||||||
|
t.Fatalf("pickSingle() before reconcile auth = %v, want nil", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
manager.ReconcileRegistryModelStates(ctx, auth.ID)
|
||||||
|
|
||||||
|
got, errPick = manager.scheduler.pickSingle(ctx, "codex", "gpt-5.4", cliproxyexecutor.Options{}, nil)
|
||||||
|
if errPick != nil {
|
||||||
|
t.Fatalf("pickSingle() after reconcile error = %v", errPick)
|
||||||
|
}
|
||||||
|
if got == nil || got.ID != auth.ID {
|
||||||
|
t.Fatalf("pickSingle() after reconcile auth = %v, want %q", got, auth.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
reconciled, ok := manager.GetByID(auth.ID)
|
||||||
|
if !ok || reconciled == nil {
|
||||||
|
t.Fatalf("expected auth to still exist")
|
||||||
|
}
|
||||||
|
state := reconciled.ModelStates["gpt-5.4"]
|
||||||
|
if state == nil {
|
||||||
|
t.Fatalf("expected reconciled model state to exist")
|
||||||
|
}
|
||||||
|
if state.Unavailable {
|
||||||
|
t.Fatalf("state.Unavailable = true, want false")
|
||||||
|
}
|
||||||
|
if state.Status != StatusActive {
|
||||||
|
t.Fatalf("state.Status = %q, want %q", state.Status, StatusActive)
|
||||||
|
}
|
||||||
|
if !state.NextRetryAfter.IsZero() {
|
||||||
|
t.Fatalf("state.NextRetryAfter = %v, want zero", state.NextRetryAfter)
|
||||||
|
}
|
||||||
|
if state.LastError != nil {
|
||||||
|
t.Fatalf("state.LastError = %v, want nil", state.LastError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestManager_ReconcileRegistryModelStates_PrunesUnsupportedModelStates(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
manager := NewManager(nil, &RoundRobinSelector{}, nil)
|
||||||
|
|
||||||
|
nextRetry := time.Now().Add(30 * time.Minute)
|
||||||
|
auth := &Auth{
|
||||||
|
ID: "reconcile-unsupported-auth",
|
||||||
|
Provider: "codex",
|
||||||
|
Status: StatusError,
|
||||||
|
Unavailable: true,
|
||||||
|
StatusMessage: "payment_required",
|
||||||
|
LastError: &Error{HTTPStatus: http.StatusPaymentRequired, Message: "payment_required"},
|
||||||
|
ModelStates: map[string]*ModelState{
|
||||||
|
"gpt-5.4": {
|
||||||
|
Status: StatusError,
|
||||||
|
StatusMessage: "payment_required",
|
||||||
|
Unavailable: true,
|
||||||
|
NextRetryAfter: nextRetry,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
|
||||||
|
t.Fatalf("register auth: %v", errRegister)
|
||||||
|
}
|
||||||
|
|
||||||
|
registerSchedulerModels(t, "codex", "gpt-5.5", auth.ID)
|
||||||
|
manager.ReconcileRegistryModelStates(ctx, auth.ID)
|
||||||
|
|
||||||
|
reconciled, ok := manager.GetByID(auth.ID)
|
||||||
|
if !ok || reconciled == nil {
|
||||||
|
t.Fatalf("expected auth to still exist")
|
||||||
|
}
|
||||||
|
if len(reconciled.ModelStates) != 0 {
|
||||||
|
t.Fatalf("expected stale unsupported model state to be pruned, got %+v", reconciled.ModelStates)
|
||||||
|
}
|
||||||
|
if reconciled.Unavailable {
|
||||||
|
t.Fatalf("auth.Unavailable = true, want false")
|
||||||
|
}
|
||||||
|
if reconciled.Status != StatusActive {
|
||||||
|
t.Fatalf("auth.Status = %q, want %q", reconciled.Status, StatusActive)
|
||||||
|
}
|
||||||
|
if reconciled.StatusMessage != "" {
|
||||||
|
t.Fatalf("auth.StatusMessage = %q, want empty", reconciled.StatusMessage)
|
||||||
|
}
|
||||||
|
if reconciled.LastError != nil {
|
||||||
|
t.Fatalf("auth.LastError = %v, want nil", reconciled.LastError)
|
||||||
|
}
|
||||||
|
if !reconciled.NextRetryAfter.IsZero() {
|
||||||
|
t.Fatalf("auth.NextRetryAfter = %v, want zero", reconciled.NextRetryAfter)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestManager_ReconcileRegistryModelStates_ClearsRemovedModelStateWhenRegistryIsEmpty(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
manager := NewManager(nil, &RoundRobinSelector{}, nil)
|
||||||
|
|
||||||
|
auth := &Auth{
|
||||||
|
ID: "reconcile-empty-registry-auth",
|
||||||
|
Provider: "codex",
|
||||||
|
Status: StatusError,
|
||||||
|
Unavailable: true,
|
||||||
|
StatusMessage: "not_found",
|
||||||
|
LastError: &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
|
||||||
|
ModelStates: map[string]*ModelState{
|
||||||
|
"gpt-5.4": {
|
||||||
|
Status: StatusError,
|
||||||
|
StatusMessage: "not_found",
|
||||||
|
Unavailable: true,
|
||||||
|
NextRetryAfter: time.Now().Add(12 * time.Hour),
|
||||||
|
LastError: &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
|
||||||
|
t.Fatalf("register auth: %v", errRegister)
|
||||||
|
}
|
||||||
|
|
||||||
|
manager.ReconcileRegistryModelStates(ctx, auth.ID)
|
||||||
|
|
||||||
|
reconciled, ok := manager.GetByID(auth.ID)
|
||||||
|
if !ok || reconciled == nil {
|
||||||
|
t.Fatalf("expected auth to still exist")
|
||||||
|
}
|
||||||
|
if len(reconciled.ModelStates) != 0 {
|
||||||
|
t.Fatalf("expected stale model state to be pruned when registry is empty, got %+v", reconciled.ModelStates)
|
||||||
|
}
|
||||||
|
if reconciled.Unavailable {
|
||||||
|
t.Fatalf("auth.Unavailable = true, want false")
|
||||||
|
}
|
||||||
|
if reconciled.Status != StatusActive {
|
||||||
|
t.Fatalf("auth.Status = %q, want %q", reconciled.Status, StatusActive)
|
||||||
|
}
|
||||||
|
if reconciled.StatusMessage != "" {
|
||||||
|
t.Fatalf("auth.StatusMessage = %q, want empty", reconciled.StatusMessage)
|
||||||
|
}
|
||||||
|
if reconciled.LastError != nil {
|
||||||
|
t.Fatalf("auth.LastError = %v, want nil", reconciled.LastError)
|
||||||
|
}
|
||||||
|
if !reconciled.NextRetryAfter.IsZero() {
|
||||||
|
t.Fatalf("auth.NextRetryAfter = %v, want zero", reconciled.NextRetryAfter)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user