From 782bba0bc473674f16c1ea0cb092f8518f7bd8b1 Mon Sep 17 00:00:00 2001 From: Ben Vargas Date: Wed, 19 Nov 2025 09:13:03 -0700 Subject: [PATCH 1/5] feat(registry): enable gemini-3-pro-preview for gemini-cli provider Add gemini-3-pro-preview model to GetGeminiCLIModels() to make it available for OAuth-based Gemini CLI users, matching the model already available in AI Studio provider. Model spec: - ID: gemini-3-pro-preview - Version: 3.0 - Input: 1M tokens - Output: 64K tokens - Thinking: 128-32K tokens (dynamic) --- internal/registry/model_definitions.go | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go index eea00daf..f7c41224 100644 --- a/internal/registry/model_definitions.go +++ b/internal/registry/model_definitions.go @@ -170,21 +170,21 @@ func GetGeminiCLIModels() []*ModelInfo { SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"}, Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}, }, - // { - // ID: "gemini-3-pro-preview-11-2025", - // Object: "model", - // Created: time.Now().Unix(), - // OwnedBy: "google", - // Type: "gemini", - // Name: "models/gemini-3-pro-preview-11-2025", - // Version: "3", - // DisplayName: "Gemini 3 Pro Preview 11-2025", - // Description: "Latest preview of Gemini Pro", - // InputTokenLimit: 1048576, - // OutputTokenLimit: 65536, - // SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"}, - // Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}, - // }, + { + ID: "gemini-3-pro-preview", + Object: "model", + Created: time.Now().Unix(), + OwnedBy: "google", + Type: "gemini", + Name: "models/gemini-3-pro-preview", + Version: "3.0", + DisplayName: "Gemini 3 Pro Preview", + Description: "Gemini 3 Pro Preview", + 
InputTokenLimit: 1048576, + OutputTokenLimit: 65536, + SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"}, + Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}, + }, } } From 6a3de3a89c6af66f4c84ea4f1de31aeec754b630 Mon Sep 17 00:00:00 2001 From: Ben Vargas Date: Wed, 19 Nov 2025 12:45:59 -0700 Subject: [PATCH 2/5] feat(executor): add intelligent retry logic for 429 rate limits Implement Google RetryInfo.retryDelay support for handling 429 rate limit errors. Retries same model up to 3 times using exact delays from Google's API before trying fallback models. - Add parseRetryDelay() to extract Google's retry guidance - Implement inner retry loop in Execute() and ExecuteStream() - Context-aware waiting with cancellation support - Cap delays at 60s maximum for safety --- .../runtime/executor/gemini_cli_executor.go | 410 ++++++++++++------ 1 file changed, 266 insertions(+), 144 deletions(-) diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go index 9a2c2602..5e932fbd 100644 --- a/internal/runtime/executor/gemini_cli_executor.go +++ b/internal/runtime/executor/gemini_cli_executor.go @@ -99,89 +99,123 @@ func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth var lastStatus int var lastBody []byte + // Get max retry count from config, default to 3 if not set + maxRetries := e.cfg.RequestRetry + if maxRetries <= 0 { + maxRetries = 3 + } + for idx, attemptModel := range models { - payload := append([]byte(nil), basePayload...) 
- if action == "countTokens" { - payload = deleteJSONField(payload, "project") - payload = deleteJSONField(payload, "model") - } else { - payload = setJSONField(payload, "project", projectID) - payload = setJSONField(payload, "model", attemptModel) - } - - tok, errTok := tokenSource.Token() - if errTok != nil { - err = errTok - return resp, err - } - updateGeminiCLITokenMetadata(auth, baseTokenData, tok) - - url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, action) - if opts.Alt != "" && action != "countTokens" { - url = url + fmt.Sprintf("?$alt=%s", opts.Alt) - } - - reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) - if errReq != nil { - err = errReq - return resp, err - } - reqHTTP.Header.Set("Content-Type", "application/json") - reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) - applyGeminiCLIHeaders(reqHTTP) - reqHTTP.Header.Set("Accept", "application/json") - recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ - URL: url, - Method: http.MethodPost, - Headers: reqHTTP.Header.Clone(), - Body: payload, - Provider: e.Identifier(), - AuthID: authID, - AuthLabel: authLabel, - AuthType: authType, - AuthValue: authValue, - }) - - httpResp, errDo := httpClient.Do(reqHTTP) - if errDo != nil { - recordAPIResponseError(ctx, e.cfg, errDo) - err = errDo - return resp, err - } - - data, errRead := io.ReadAll(httpResp.Body) - if errClose := httpResp.Body.Close(); errClose != nil { - log.Errorf("gemini cli executor: close response body error: %v", errClose) - } - recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) - if errRead != nil { - recordAPIResponseError(ctx, e.cfg, errRead) - err = errRead - return resp, err - } - appendAPIResponseChunk(ctx, e.cfg, data) - if httpResp.StatusCode >= 200 && httpResp.StatusCode < 300 { - reporter.publish(ctx, parseGeminiCLIUsage(data)) - var param any - out := sdktranslator.TranslateNonStream(respCtx, to, from, attemptModel, 
bytes.Clone(opts.OriginalRequest), payload, data, &param) - resp = cliproxyexecutor.Response{Payload: []byte(out)} - return resp, nil - } - - lastStatus = httpResp.StatusCode - lastBody = append([]byte(nil), data...) - log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) - if httpResp.StatusCode == 429 { - if idx+1 < len(models) { - log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1]) + // Inner retry loop for 429 errors on the same model + for retryCount := 0; retryCount <= maxRetries; retryCount++ { + payload := append([]byte(nil), basePayload...) + if action == "countTokens" { + payload = deleteJSONField(payload, "project") + payload = deleteJSONField(payload, "model") } else { - log.Debug("gemini cli executor: rate limited, no additional fallback model") + payload = setJSONField(payload, "project", projectID) + payload = setJSONField(payload, "model", attemptModel) } - continue - } - err = statusErr{code: httpResp.StatusCode, msg: string(data)} - return resp, err + tok, errTok := tokenSource.Token() + if errTok != nil { + err = errTok + return resp, err + } + updateGeminiCLITokenMetadata(auth, baseTokenData, tok) + + url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, action) + if opts.Alt != "" && action != "countTokens" { + url = url + fmt.Sprintf("?$alt=%s", opts.Alt) + } + + reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) + if errReq != nil { + err = errReq + return resp, err + } + reqHTTP.Header.Set("Content-Type", "application/json") + reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) + applyGeminiCLIHeaders(reqHTTP) + reqHTTP.Header.Set("Accept", "application/json") + recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ + URL: url, + Method: http.MethodPost, + Headers: reqHTTP.Header.Clone(), + Body: payload, + Provider: e.Identifier(), + AuthID:
authID, + AuthLabel: authLabel, + AuthType: authType, + AuthValue: authValue, + }) + + httpResp, errDo := httpClient.Do(reqHTTP) + if errDo != nil { + recordAPIResponseError(ctx, e.cfg, errDo) + err = errDo + return resp, err + } + + data, errRead := io.ReadAll(httpResp.Body) + if errClose := httpResp.Body.Close(); errClose != nil { + log.Errorf("gemini cli executor: close response body error: %v", errClose) + } + recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) + if errRead != nil { + recordAPIResponseError(ctx, e.cfg, errRead) + err = errRead + return resp, err + } + appendAPIResponseChunk(ctx, e.cfg, data) + if httpResp.StatusCode >= 200 && httpResp.StatusCode < 300 { + reporter.publish(ctx, parseGeminiCLIUsage(data)) + var param any + out := sdktranslator.TranslateNonStream(respCtx, to, from, attemptModel, bytes.Clone(opts.OriginalRequest), payload, data, &param) + resp = cliproxyexecutor.Response{Payload: []byte(out)} + return resp, nil + } + + lastStatus = httpResp.StatusCode + lastBody = append([]byte(nil), data...)
+ log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) + + // Handle 429 rate limit errors with retry + if httpResp.StatusCode == 429 { + if retryCount < maxRetries { + // Parse retry delay from Google's response + retryDelay := parseRetryDelay(data) + log.Infof("gemini cli executor: rate limited (429), retrying model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries) + + // Wait for the specified delay + select { + case <-time.After(retryDelay): + // Continue to next retry iteration + continue + case <-ctx.Done(): + // Context cancelled, return immediately + err = ctx.Err() + return resp, err + } + } else { + // Exhausted retries for this model, try next model if available + if idx+1 < len(models) { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1]) + break // Break inner loop to try next model + } else { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, no additional fallback model", maxRetries, attemptModel) + // No more models to try, will return error below + } + } + } else { + // Non-429 error, don't retry this model + err = statusErr{code: httpResp.StatusCode, msg: string(data)} + return resp, err + } + + // Break inner loop if we hit this point (no retry needed or exhausted retries) + break + } } if len(lastBody) > 0 { @@ -235,77 +269,120 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut var lastStatus int var lastBody []byte + // Get max retry count from config, default to 3 if not set + maxRetries := e.cfg.RequestRetry + if maxRetries <= 0 { + maxRetries = 3 + } + for idx, attemptModel := range models { - payload := append([]byte(nil), basePayload...) 
- payload = setJSONField(payload, "project", projectID) - payload = setJSONField(payload, "model", attemptModel) + var httpResp *http.Response + var payload []byte + var errDo error - tok, errTok := tokenSource.Token() - if errTok != nil { - err = errTok - return nil, err - } - updateGeminiCLITokenMetadata(auth, baseTokenData, tok) + // Inner retry loop for 429 errors on the same model + for retryCount := 0; retryCount <= maxRetries; retryCount++ { + payload = append([]byte(nil), basePayload...) + payload = setJSONField(payload, "project", projectID) + payload = setJSONField(payload, "model", attemptModel) - url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, "streamGenerateContent") - if opts.Alt == "" { - url = url + "?alt=sse" - } else { - url = url + fmt.Sprintf("?$alt=%s", opts.Alt) - } - - reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) - if errReq != nil { - err = errReq - return nil, err - } - reqHTTP.Header.Set("Content-Type", "application/json") - reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) - applyGeminiCLIHeaders(reqHTTP) - reqHTTP.Header.Set("Accept", "text/event-stream") - recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ - URL: url, - Method: http.MethodPost, - Headers: reqHTTP.Header.Clone(), - Body: payload, - Provider: e.Identifier(), - AuthID: authID, - AuthLabel: authLabel, - AuthType: authType, - AuthValue: authValue, - }) - - httpResp, errDo := httpClient.Do(reqHTTP) - if errDo != nil { - recordAPIResponseError(ctx, e.cfg, errDo) - err = errDo - return nil, err - } - recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) - if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { - data, errRead := io.ReadAll(httpResp.Body) - if errClose := httpResp.Body.Close(); errClose != nil { - log.Errorf("gemini cli executor: close response body error: %v", errClose) - } - if errRead != nil { - recordAPIResponseError(ctx, e.cfg, errRead) 
- err = errRead + tok, errTok := tokenSource.Token() + if errTok != nil { + err = errTok return nil, err } - appendAPIResponseChunk(ctx, e.cfg, data) - lastStatus = httpResp.StatusCode - lastBody = append([]byte(nil), data...) - log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) - if httpResp.StatusCode == 429 { - if idx+1 < len(models) { - log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1]) - } else { - log.Debug("gemini cli executor: rate limited, no additional fallback model") - } - continue + updateGeminiCLITokenMetadata(auth, baseTokenData, tok) + + url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, "streamGenerateContent") + if opts.Alt == "" { + url = url + "?alt=sse" + } else { + url = url + fmt.Sprintf("?$alt=%s", opts.Alt) } - err = statusErr{code: httpResp.StatusCode, msg: string(data)} - return nil, err + + reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) + if errReq != nil { + err = errReq + return nil, err + } + reqHTTP.Header.Set("Content-Type", "application/json") + reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) + applyGeminiCLIHeaders(reqHTTP) + reqHTTP.Header.Set("Accept", "text/event-stream") + recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ + URL: url, + Method: http.MethodPost, + Headers: reqHTTP.Header.Clone(), + Body: payload, + Provider: e.Identifier(), + AuthID: authID, + AuthLabel: authLabel, + AuthType: authType, + AuthValue: authValue, + }) + + httpResp, errDo = httpClient.Do(reqHTTP) + if errDo != nil { + recordAPIResponseError(ctx, e.cfg, errDo) + err = errDo + return nil, err + } + recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) + if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { + data, errRead := io.ReadAll(httpResp.Body) + if errClose := httpResp.Body.Close(); 
errClose != nil { + log.Errorf("gemini cli executor: close response body error: %v", errClose) + } + if errRead != nil { + recordAPIResponseError(ctx, e.cfg, errRead) + err = errRead + return nil, err + } + appendAPIResponseChunk(ctx, e.cfg, data) + lastStatus = httpResp.StatusCode + lastBody = append([]byte(nil), data...) + log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) + + // Handle 429 rate limit errors with retry + if httpResp.StatusCode == 429 { + if retryCount < maxRetries { + // Parse retry delay from Google's response + retryDelay := parseRetryDelay(data) + log.Infof("gemini cli executor: rate limited (429), retrying stream model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries) + + // Wait for the specified delay + select { + case <-time.After(retryDelay): + // Continue to next retry iteration + continue + case <-ctx.Done(): + // Context cancelled, return immediately + err = ctx.Err() + return nil, err + } + } else { + // Exhausted retries for this model, try next model if available + if idx+1 < len(models) { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1]) + break // Break inner loop to try next model + } else { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, no additional fallback model", maxRetries, attemptModel) + // No more models to try, will return error below + } + } + } else { + // Non-429 error, don't retry this model + err = statusErr{code: httpResp.StatusCode, msg: string(data)} + return nil, err + } + + // Break inner loop if we hit this point (no retry needed or exhausted retries) + break + } + + // Success - httpResp.StatusCode is 2xx, break out of retry loop + // and proceed to streaming logic below + break } out := make(chan cliproxyexecutor.StreamChunk) @@ -769,3 
+846,48 @@ func fixGeminiCLIImageAspectRatio(modelName string, rawJSON []byte) []byte { } return rawJSON } + +// parseRetryDelay extracts the retry delay from a Google API 429 error response. +// The error response contains a RetryInfo.retryDelay field in the format "0.847655010s". +// Returns the duration to wait, or a default duration if parsing fails. +func parseRetryDelay(errorBody []byte) time.Duration { + const defaultDelay = 1 * time.Second + const maxDelay = 60 * time.Second + + // Try to parse the retryDelay from the error response + // Format: error.details[].retryDelay where @type == "type.googleapis.com/google.rpc.RetryInfo" + details := gjson.GetBytes(errorBody, "error.details") + if !details.Exists() || !details.IsArray() { + log.Debugf("parseRetryDelay: no error.details found, using default delay %v", defaultDelay) + return defaultDelay + } + + for _, detail := range details.Array() { + typeVal := detail.Get("@type").String() + if typeVal == "type.googleapis.com/google.rpc.RetryInfo" { + retryDelay := detail.Get("retryDelay").String() + if retryDelay != "" { + // Parse duration string like "0.847655010s" + duration, err := time.ParseDuration(retryDelay) + if err != nil { + log.Debugf("parseRetryDelay: failed to parse duration %q: %v, using default", retryDelay, err) + return defaultDelay + } + // Cap at maxDelay to prevent excessive waits + if duration > maxDelay { + log.Debugf("parseRetryDelay: capping delay from %v to %v", duration, maxDelay) + return maxDelay + } + if duration < 0 { + log.Debugf("parseRetryDelay: negative delay %v, using default", duration) + return defaultDelay + } + log.Debugf("parseRetryDelay: using delay %v from API response", duration) + return duration + } + } + } + + log.Debugf("parseRetryDelay: no RetryInfo found, using default delay %v", defaultDelay) + return defaultDelay +} From ede4471b84bffc11fe78781c54ea710831acc6fb Mon Sep 17 00:00:00 2001 From: Ben Vargas Date: Wed, 19 Nov 2025 12:46:13 -0700 Subject: [PATCH 3/5] 
feat(translator): add default thinkingConfig for gemini-3-pro-preview Match official Gemini CLI behavior by always sending default thinkingConfig when client doesn't specify reasoning parameters. - Set thinkingBudget=-1 (dynamic) for gemini-3-pro-preview - Set include_thoughts=true to return thinking process - Apply to both /v1/chat/completions and /v1/responses endpoints - See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts --- .../chat-completions/gemini-cli_openai_request.go | 9 +++++++++ .../responses/gemini_openai-responses_request.go | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go index 6f7ac724..99b50366 100644 --- a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go +++ b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go @@ -88,6 +88,15 @@ func ConvertOpenAIRequestToGeminiCLI(modelName string, inputRawJSON []byte, _ bo } } + // For gemini-3-pro-preview, always send default thinkingConfig when none specified. 
+ // This matches the official Gemini CLI behavior which always sends: + // { thinkingBudget: -1, includeThoughts: true } + // See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts + if !gjson.GetBytes(out, "request.generationConfig.thinkingConfig").Exists() && modelName == "gemini-3-pro-preview" { + out, _ = sjson.SetBytes(out, "request.generationConfig.thinkingConfig.thinkingBudget", -1) + out, _ = sjson.SetBytes(out, "request.generationConfig.thinkingConfig.include_thoughts", true) + } + // Temperature/top_p/top_k if tr := gjson.GetBytes(rawJSON, "temperature"); tr.Exists() && tr.Type == gjson.Number { out, _ = sjson.SetBytes(out, "request.generationConfig.temperature", tr.Num) diff --git a/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go b/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go index 4eeebf3c..981fafc1 100644 --- a/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go +++ b/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go @@ -6,6 +6,7 @@ import ( "github.com/router-for-me/CLIProxyAPI/v6/internal/translator/gemini/common" "github.com/router-for-me/CLIProxyAPI/v6/internal/util" + log "github.com/sirupsen/logrus" "github.com/tidwall/gjson" "github.com/tidwall/sjson" ) @@ -294,6 +295,17 @@ func ConvertOpenAIResponsesRequestToGemini(modelName string, inputRawJSON []byte } } } + + // For gemini-3-pro-preview, always send default thinkingConfig when none specified. 
+ // This matches the official Gemini CLI behavior which always sends: + // { thinkingBudget: -1, includeThoughts: true } + // See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts + if !gjson.Get(out, "generationConfig.thinkingConfig").Exists() && modelName == "gemini-3-pro-preview" { + out, _ = sjson.Set(out, "generationConfig.thinkingConfig.thinkingBudget", -1) + out, _ = sjson.Set(out, "generationConfig.thinkingConfig.include_thoughts", true) + log.Debugf("Applied default thinkingConfig for gemini-3-pro-preview (matches Gemini CLI): thinkingBudget=-1, include_thoughts=true") + } + result := []byte(out) result = common.AttachDefaultSafetySettings(result, "safetySettings") return result From ed23472d9427d55a68656a898902797b62a15a7e Mon Sep 17 00:00:00 2001 From: Ben Vargas Date: Wed, 19 Nov 2025 13:05:38 -0700 Subject: [PATCH 4/5] fix(executor): prevent streaming from 429 response when fallback available Fix critical bug where ExecuteStream would create a streaming channel using a 429 error response instead of continuing to the next fallback model after exhausting retries. When 429 retries were exhausted and a fallback model was available, the inner retry loop would break but immediately fall through to the streaming channel creation, attempting to stream from the failed 429 response instead of trying the next model. Solution: Add shouldContinueToNextModel flag to explicitly skip the streaming logic and continue the outer model loop when appropriate. 
Identified by: codex-bot review Ref: https://github.com/router-for-me/CLIProxyAPI/pull/280#pullrequestreview-3484479106 --- internal/runtime/executor/gemini_cli_executor.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go index 5e932fbd..294761c8 100644 --- a/internal/runtime/executor/gemini_cli_executor.go +++ b/internal/runtime/executor/gemini_cli_executor.go @@ -279,6 +279,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut var httpResp *http.Response var payload []byte var errDo error + shouldContinueToNextModel := false // Inner retry loop for 429 errors on the same model for retryCount := 0; retryCount <= maxRetries; retryCount++ { @@ -364,6 +365,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut // Exhausted retries for this model, try next model if available if idx+1 < len(models) { log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1]) + shouldContinueToNextModel = true break // Break inner loop to try next model } else { log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, no additional fallback model", maxRetries, attemptModel) @@ -385,6 +387,11 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut break } + // If we need to try the next fallback model, skip streaming logic + if shouldContinueToNextModel { + continue + } + out := make(chan cliproxyexecutor.StreamChunk) stream = out go func(resp *http.Response, reqBody []byte, attempt string) { From 0ff094b87f9b20b965ba45542eae302b4006e23f Mon Sep 17 00:00:00 2001 From: Ben Vargas Date: Wed, 19 Nov 2025 13:14:40 -0700 Subject: [PATCH 5/5] fix(executor): prevent streaming on failed response when no fallback Fix critical bug where ExecuteStream would create a streaming 
channel from a failed (non-2xx) response after exhausting all retries with no fallback models available. When retries were exhausted on the last model, the code would break from the inner loop but fall through to streaming channel creation (line 401), immediately returning at line 461. This made the error handling code at lines 464-471 unreachable, causing clients to receive an empty/closed stream instead of a proper error response. Solution: Check if httpResp is non-2xx before creating the streaming channel. If failed, continue the outer loop to reach error handling. Identified by: codex-bot review Ref: https://github.com/router-for-me/CLIProxyAPI/pull/280#pullrequestreview-3484560423 --- internal/runtime/executor/gemini_cli_executor.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go index 294761c8..2f48871b 100644 --- a/internal/runtime/executor/gemini_cli_executor.go +++ b/internal/runtime/executor/gemini_cli_executor.go @@ -392,6 +392,12 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut continue } + // If we have a failed response (non-2xx), don't attempt streaming + // Continue outer loop to try next model or return error + if httpResp == nil || httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { + continue + } + out := make(chan cliproxyexecutor.StreamChunk) stream = out go func(resp *http.Response, reqBody []byte, attempt string) {