diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go index eea00daf..f7c41224 100644 --- a/internal/registry/model_definitions.go +++ b/internal/registry/model_definitions.go @@ -170,21 +170,21 @@ func GetGeminiCLIModels() []*ModelInfo { SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"}, Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}, }, - // { - // ID: "gemini-3-pro-preview-11-2025", - // Object: "model", - // Created: time.Now().Unix(), - // OwnedBy: "google", - // Type: "gemini", - // Name: "models/gemini-3-pro-preview-11-2025", - // Version: "3", - // DisplayName: "Gemini 3 Pro Preview 11-2025", - // Description: "Latest preview of Gemini Pro", - // InputTokenLimit: 1048576, - // OutputTokenLimit: 65536, - // SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"}, - // Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}, - // }, + { + ID: "gemini-3-pro-preview", + Object: "model", + Created: time.Now().Unix(), + OwnedBy: "google", + Type: "gemini", + Name: "models/gemini-3-pro-preview", + Version: "3.0", + DisplayName: "Gemini 3 Pro Preview", + Description: "Gemini 3 Pro Preview", + InputTokenLimit: 1048576, + OutputTokenLimit: 65536, + SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"}, + Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}, + }, } } diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go index 9a2c2602..2f48871b 100644 --- a/internal/runtime/executor/gemini_cli_executor.go +++ b/internal/runtime/executor/gemini_cli_executor.go @@ -99,89 +99,123 @@ func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth var lastStatus int var lastBody []byte + // Get max retry count from config, default to 3 if not set + maxRetries := e.cfg.RequestRetry + if maxRetries <= 0 { + maxRetries = 3 + } + for idx, attemptModel := range models { - payload := append([]byte(nil), basePayload...) - if action == "countTokens" { - payload = deleteJSONField(payload, "project") - payload = deleteJSONField(payload, "model") - } else { - payload = setJSONField(payload, "project", projectID) - payload = setJSONField(payload, "model", attemptModel) - } - - tok, errTok := tokenSource.Token() - if errTok != nil { - err = errTok - return resp, err - } - updateGeminiCLITokenMetadata(auth, baseTokenData, tok) - - url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, action) - if opts.Alt != "" && action != "countTokens" { - url = url + fmt.Sprintf("?$alt=%s", opts.Alt) - } - - reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) - if errReq != nil { - err = errReq - return resp, err - } - reqHTTP.Header.Set("Content-Type", "application/json") - reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) - applyGeminiCLIHeaders(reqHTTP) - reqHTTP.Header.Set("Accept", "application/json") - recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ - URL: url, - Method: http.MethodPost, - Headers: reqHTTP.Header.Clone(), - Body: payload, - Provider: e.Identifier(), - AuthID: authID, - AuthLabel: authLabel, - AuthType: authType, - AuthValue: authValue, - }) - - httpResp, errDo := httpClient.Do(reqHTTP) - if errDo != nil { - recordAPIResponseError(ctx, e.cfg, errDo) - err = errDo - return resp, err - } - - data, errRead := io.ReadAll(httpResp.Body) - if errClose := httpResp.Body.Close(); errClose != nil { - log.Errorf("gemini cli executor: close response body error: %v", errClose) - } - recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) - if errRead != nil { - recordAPIResponseError(ctx, e.cfg, errRead) - err = errRead - return resp, err - } - appendAPIResponseChunk(ctx, e.cfg, data) - if httpResp.StatusCode >= 200 && httpResp.StatusCode < 300 { - reporter.publish(ctx, parseGeminiCLIUsage(data)) - var param any - out := sdktranslator.TranslateNonStream(respCtx, to, from, attemptModel, bytes.Clone(opts.OriginalRequest), payload, data, ¶m) - resp = cliproxyexecutor.Response{Payload: []byte(out)} - return resp, nil - } - - lastStatus = httpResp.StatusCode - lastBody = append([]byte(nil), data...) - log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) - if httpResp.StatusCode == 429 { - if idx+1 < len(models) { - log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1]) + // Inner retry loop for 429 errors on the same model + for retryCount := 0; retryCount <= maxRetries; retryCount++ { + payload := append([]byte(nil), basePayload...) + if action == "countTokens" { + payload = deleteJSONField(payload, "project") + payload = deleteJSONField(payload, "model") } else { - log.Debug("gemini cli executor: rate limited, no additional fallback model") + payload = setJSONField(payload, "project", projectID) + payload = setJSONField(payload, "model", attemptModel) } - continue - } - err = statusErr{code: httpResp.StatusCode, msg: string(data)} - return resp, err + tok, errTok := tokenSource.Token() + if errTok != nil { + err = errTok + return resp, err + } + updateGeminiCLITokenMetadata(auth, baseTokenData, tok) + + url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, action) + if opts.Alt != "" && action != "countTokens" { + url = url + fmt.Sprintf("?$alt=%s", opts.Alt) + } + + reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) + if errReq != nil { + err = errReq + return resp, err + } + reqHTTP.Header.Set("Content-Type", "application/json") + reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) + applyGeminiCLIHeaders(reqHTTP) + reqHTTP.Header.Set("Accept", "application/json") + recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ + URL: url, + Method: http.MethodPost, + Headers: reqHTTP.Header.Clone(), + Body: payload, + Provider: e.Identifier(), + AuthID: authID, + AuthLabel: authLabel, + AuthType: authType, + AuthValue: authValue, + }) + + httpResp, errDo := httpClient.Do(reqHTTP) + if errDo != nil { + recordAPIResponseError(ctx, e.cfg, errDo) + err = errDo + return resp, err + } + + data, errRead := io.ReadAll(httpResp.Body) + if errClose := httpResp.Body.Close(); errClose != nil { + log.Errorf("gemini cli executor: close response body error: %v", errClose) + } + recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) + if errRead != nil { + recordAPIResponseError(ctx, e.cfg, errRead) + err = errRead + return resp, err + } + appendAPIResponseChunk(ctx, e.cfg, data) + if httpResp.StatusCode >= 200 && httpResp.StatusCode < 300 { + reporter.publish(ctx, parseGeminiCLIUsage(data)) + var param any + out := sdktranslator.TranslateNonStream(respCtx, to, from, attemptModel, bytes.Clone(opts.OriginalRequest), payload, data, ¶m) + resp = cliproxyexecutor.Response{Payload: []byte(out)} + return resp, nil + } + + lastStatus = httpResp.StatusCode + lastBody = append([]byte(nil), data...) + log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) + + // Handle 429 rate limit errors with retry + if httpResp.StatusCode == 429 { + if retryCount < maxRetries { + // Parse retry delay from Google's response + retryDelay := parseRetryDelay(data) + log.Infof("gemini cli executor: rate limited (429), retrying model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries) + + // Wait for the specified delay + select { + case <-time.After(retryDelay): + // Continue to next retry iteration + continue + case <-ctx.Done(): + // Context cancelled, return immediately + err = ctx.Err() + return resp, err + } + } else { + // Exhausted retries for this model, try next model if available + if idx+1 < len(models) { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1]) + break // Break inner loop to try next model + } else { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, no additional fallback model", maxRetries, attemptModel) + // No more models to try, will return error below + } + } + } else { + // Non-429 error, don't retry this model + err = statusErr{code: httpResp.StatusCode, msg: string(data)} + return resp, err + } + + // Break inner loop if we hit this point (no retry needed or exhausted retries) + break + } } if len(lastBody) > 0 { @@ -235,77 +269,133 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut var lastStatus int var lastBody []byte + // Get max retry count from config, default to 3 if not set + maxRetries := e.cfg.RequestRetry + if maxRetries <= 0 { + maxRetries = 3 + } + for idx, attemptModel := range models { - payload := append([]byte(nil), basePayload...) - payload = setJSONField(payload, "project", projectID) - payload = setJSONField(payload, "model", attemptModel) + var httpResp *http.Response + var payload []byte + var errDo error + shouldContinueToNextModel := false - tok, errTok := tokenSource.Token() - if errTok != nil { - err = errTok - return nil, err - } - updateGeminiCLITokenMetadata(auth, baseTokenData, tok) + // Inner retry loop for 429 errors on the same model + for retryCount := 0; retryCount <= maxRetries; retryCount++ { + payload = append([]byte(nil), basePayload...) + payload = setJSONField(payload, "project", projectID) + payload = setJSONField(payload, "model", attemptModel) - url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, "streamGenerateContent") - if opts.Alt == "" { - url = url + "?alt=sse" - } else { - url = url + fmt.Sprintf("?$alt=%s", opts.Alt) - } - - reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) - if errReq != nil { - err = errReq - return nil, err - } - reqHTTP.Header.Set("Content-Type", "application/json") - reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) - applyGeminiCLIHeaders(reqHTTP) - reqHTTP.Header.Set("Accept", "text/event-stream") - recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ - URL: url, - Method: http.MethodPost, - Headers: reqHTTP.Header.Clone(), - Body: payload, - Provider: e.Identifier(), - AuthID: authID, - AuthLabel: authLabel, - AuthType: authType, - AuthValue: authValue, - }) - - httpResp, errDo := httpClient.Do(reqHTTP) - if errDo != nil { - recordAPIResponseError(ctx, e.cfg, errDo) - err = errDo - return nil, err - } - recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) - if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { - data, errRead := io.ReadAll(httpResp.Body) - if errClose := httpResp.Body.Close(); errClose != nil { - log.Errorf("gemini cli executor: close response body error: %v", errClose) - } - if errRead != nil { - recordAPIResponseError(ctx, e.cfg, errRead) - err = errRead + tok, errTok := tokenSource.Token() + if errTok != nil { + err = errTok return nil, err } - appendAPIResponseChunk(ctx, e.cfg, data) - lastStatus = httpResp.StatusCode - lastBody = append([]byte(nil), data...) - log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) - if httpResp.StatusCode == 429 { - if idx+1 < len(models) { - log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1]) - } else { - log.Debug("gemini cli executor: rate limited, no additional fallback model") - } - continue + updateGeminiCLITokenMetadata(auth, baseTokenData, tok) + + url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, "streamGenerateContent") + if opts.Alt == "" { + url = url + "?alt=sse" + } else { + url = url + fmt.Sprintf("?$alt=%s", opts.Alt) } - err = statusErr{code: httpResp.StatusCode, msg: string(data)} - return nil, err + + reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload)) + if errReq != nil { + err = errReq + return nil, err + } + reqHTTP.Header.Set("Content-Type", "application/json") + reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken) + applyGeminiCLIHeaders(reqHTTP) + reqHTTP.Header.Set("Accept", "text/event-stream") + recordAPIRequest(ctx, e.cfg, upstreamRequestLog{ + URL: url, + Method: http.MethodPost, + Headers: reqHTTP.Header.Clone(), + Body: payload, + Provider: e.Identifier(), + AuthID: authID, + AuthLabel: authLabel, + AuthType: authType, + AuthValue: authValue, + }) + + httpResp, errDo = httpClient.Do(reqHTTP) + if errDo != nil { + recordAPIResponseError(ctx, e.cfg, errDo) + err = errDo + return nil, err + } + recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone()) + if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { + data, errRead := io.ReadAll(httpResp.Body) + if errClose := httpResp.Body.Close(); errClose != nil { + log.Errorf("gemini cli executor: close response body error: %v", errClose) + } + if errRead != nil { + recordAPIResponseError(ctx, e.cfg, errRead) + err = errRead + return nil, err + } + appendAPIResponseChunk(ctx, e.cfg, data) + lastStatus = httpResp.StatusCode + lastBody = append([]byte(nil), data...) + log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data)) + + // Handle 429 rate limit errors with retry + if httpResp.StatusCode == 429 { + if retryCount < maxRetries { + // Parse retry delay from Google's response + retryDelay := parseRetryDelay(data) + log.Infof("gemini cli executor: rate limited (429), retrying stream model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries) + + // Wait for the specified delay + select { + case <-time.After(retryDelay): + // Continue to next retry iteration + continue + case <-ctx.Done(): + // Context cancelled, return immediately + err = ctx.Err() + return nil, err + } + } else { + // Exhausted retries for this model, try next model if available + if idx+1 < len(models) { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1]) + shouldContinueToNextModel = true + break // Break inner loop to try next model + } else { + log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, no additional fallback model", maxRetries, attemptModel) + // No more models to try, will return error below + } + } + } else { + // Non-429 error, don't retry this model + err = statusErr{code: httpResp.StatusCode, msg: string(data)} + return nil, err + } + + // Break inner loop if we hit this point (no retry needed or exhausted retries) + break + } + + // Success - httpResp.StatusCode is 2xx, break out of retry loop + // and proceed to streaming logic below + break + } + + // If we need to try the next fallback model, skip streaming logic + if shouldContinueToNextModel { + continue + } + + // If we have a failed response (non-2xx), don't attempt streaming + // Continue outer loop to try next model or return error + if httpResp == nil || httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { + continue } out := make(chan cliproxyexecutor.StreamChunk) @@ -769,3 +859,48 @@ func fixGeminiCLIImageAspectRatio(modelName string, rawJSON []byte) []byte { } return rawJSON } + +// parseRetryDelay extracts the retry delay from a Google API 429 error response. +// The error response contains a RetryInfo.retryDelay field in the format "0.847655010s". +// Returns the duration to wait, or a default duration if parsing fails. +func parseRetryDelay(errorBody []byte) time.Duration { + const defaultDelay = 1 * time.Second + const maxDelay = 60 * time.Second + + // Try to parse the retryDelay from the error response + // Format: error.details[].retryDelay where @type == "type.googleapis.com/google.rpc.RetryInfo" + details := gjson.GetBytes(errorBody, "error.details") + if !details.Exists() || !details.IsArray() { + log.Debugf("parseRetryDelay: no error.details found, using default delay %v", defaultDelay) + return defaultDelay + } + + for _, detail := range details.Array() { + typeVal := detail.Get("@type").String() + if typeVal == "type.googleapis.com/google.rpc.RetryInfo" { + retryDelay := detail.Get("retryDelay").String() + if retryDelay != "" { + // Parse duration string like "0.847655010s" + duration, err := time.ParseDuration(retryDelay) + if err != nil { + log.Debugf("parseRetryDelay: failed to parse duration %q: %v, using default", retryDelay, err) + return defaultDelay + } + // Cap at maxDelay to prevent excessive waits + if duration > maxDelay { + log.Debugf("parseRetryDelay: capping delay from %v to %v", duration, maxDelay) + return maxDelay + } + if duration < 0 { + log.Debugf("parseRetryDelay: negative delay %v, using default", duration) + return defaultDelay + } + log.Debugf("parseRetryDelay: using delay %v from API response", duration) + return duration + } + } + } + + log.Debugf("parseRetryDelay: no RetryInfo found, using default delay %v", defaultDelay) + return defaultDelay +} diff --git a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go index 6f7ac724..99b50366 100644 --- a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go +++ b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go @@ -88,6 +88,15 @@ func ConvertOpenAIRequestToGeminiCLI(modelName string, inputRawJSON []byte, _ bo } } + // For gemini-3-pro-preview, always send default thinkingConfig when none specified. + // This matches the official Gemini CLI behavior which always sends: + // { thinkingBudget: -1, includeThoughts: true } + // See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts + if !gjson.GetBytes(out, "request.generationConfig.thinkingConfig").Exists() && modelName == "gemini-3-pro-preview" { + out, _ = sjson.SetBytes(out, "request.generationConfig.thinkingConfig.thinkingBudget", -1) + out, _ = sjson.SetBytes(out, "request.generationConfig.thinkingConfig.include_thoughts", true) + } + // Temperature/top_p/top_k if tr := gjson.GetBytes(rawJSON, "temperature"); tr.Exists() && tr.Type == gjson.Number { out, _ = sjson.SetBytes(out, "request.generationConfig.temperature", tr.Num) diff --git a/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go b/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go index 4eeebf3c..981fafc1 100644 --- a/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go +++ b/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go @@ -6,6 +6,7 @@ import ( "github.com/router-for-me/CLIProxyAPI/v6/internal/translator/gemini/common" "github.com/router-for-me/CLIProxyAPI/v6/internal/util" + log "github.com/sirupsen/logrus" "github.com/tidwall/gjson" "github.com/tidwall/sjson" ) @@ -294,6 +295,17 @@ func ConvertOpenAIResponsesRequestToGemini(modelName string, inputRawJSON []byte } } } + + // For gemini-3-pro-preview, always send default thinkingConfig when none specified. + // This matches the official Gemini CLI behavior which always sends: + // { thinkingBudget: -1, includeThoughts: true } + // See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts + if !gjson.Get(out, "generationConfig.thinkingConfig").Exists() && modelName == "gemini-3-pro-preview" { + out, _ = sjson.Set(out, "generationConfig.thinkingConfig.thinkingBudget", -1) + out, _ = sjson.Set(out, "generationConfig.thinkingConfig.include_thoughts", true) + log.Debugf("Applied default thinkingConfig for gemini-3-pro-preview (matches Gemini CLI): thinkingBudget=-1, include_thoughts=true") + } + result := []byte(out) result = common.AttachDefaultSafetySettings(result, "safetySettings") return result