From 782bba0bc473674f16c1ea0cb092f8518f7bd8b1 Mon Sep 17 00:00:00 2001
From: Ben Vargas <ben@vargas.com>
Date: Wed, 19 Nov 2025 09:13:03 -0700
Subject: [PATCH 1/5] feat(registry): enable gemini-3-pro-preview for
 gemini-cli provider

Replace the commented-out gemini-3-pro-preview-11-2025 entry in
GetGeminiCLIModels() with an active gemini-3-pro-preview entry to make
the model available for OAuth-based Gemini CLI users, matching the
model already available in the AI Studio provider.

Model spec:
- ID: gemini-3-pro-preview
- Version: 3.0
- Input: 1M tokens
- Output: 64K tokens
- Thinking budget: 128 to 32768 tokens (dynamic allowed, zero not allowed)
---
 internal/registry/model_definitions.go | 30 +++++++++++++-------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go
index eea00daf..f7c41224 100644
--- a/internal/registry/model_definitions.go
+++ b/internal/registry/model_definitions.go
@@ -170,21 +170,21 @@ func GetGeminiCLIModels() []*ModelInfo {
 			SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"},
 			Thinking:                   &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true},
 		},
-		// {
-		// 	ID:                         "gemini-3-pro-preview-11-2025",
-		// 	Object:                     "model",
-		// 	Created:                    time.Now().Unix(),
-		// 	OwnedBy:                    "google",
-		// 	Type:                       "gemini",
-		// 	Name:                       "models/gemini-3-pro-preview-11-2025",
-		// 	Version:                    "3",
-		// 	DisplayName:                "Gemini 3 Pro Preview 11-2025",
-		// 	Description:                "Latest preview of Gemini Pro",
-		// 	InputTokenLimit:            1048576,
-		// 	OutputTokenLimit:           65536,
-		// 	SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"},
-		// 	Thinking:                   &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true},
-		// },
+		{
+			ID:                         "gemini-3-pro-preview",
+			Object:                     "model",
+			Created:                    time.Now().Unix(),
+			OwnedBy:                    "google",
+			Type:                       "gemini",
+			Name:                       "models/gemini-3-pro-preview",
+			Version:                    "3.0",
+			DisplayName:                "Gemini 3 Pro Preview",
+			Description:                "Gemini 3 Pro Preview",
+			InputTokenLimit:            1048576,
+			OutputTokenLimit:           65536,
+			SupportedGenerationMethods: []string{"generateContent", "countTokens", "createCachedContent", "batchGenerateContent"},
+			Thinking:                   &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true},
+		},
 	}
 }
 

From 6a3de3a89c6af66f4c84ea4f1de31aeec754b630 Mon Sep 17 00:00:00 2001
From: Ben Vargas <ben@vargas.com>
Date: Wed, 19 Nov 2025 12:45:59 -0700
Subject: [PATCH 2/5] feat(executor): add intelligent retry logic for 429 rate
 limits

Implement Google RetryInfo.retryDelay support for handling 429 rate
limit errors. Retries the same model up to RequestRetry times (default
3 when unset), waiting for the delay reported by Google's API (capped
at 60s, 1s when absent or unparsable) before trying fallback models.

- Add parseRetryDelay() to extract Google's retry guidance
- Implement inner retry loop in Execute() and ExecuteStream()
- Context-aware waiting with cancellation support
- Cap delays at 60s maximum for safety
---
 .../runtime/executor/gemini_cli_executor.go   | 410 ++++++++++++------
 1 file changed, 266 insertions(+), 144 deletions(-)

diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go
index 9a2c2602..5e932fbd 100644
--- a/internal/runtime/executor/gemini_cli_executor.go
+++ b/internal/runtime/executor/gemini_cli_executor.go
@@ -99,89 +99,123 @@ func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth
 	var lastStatus int
 	var lastBody []byte
 
+	// Get max retry count from config, default to 3 if not set
+	maxRetries := e.cfg.RequestRetry
+	if maxRetries <= 0 {
+		maxRetries = 3
+	}
+
 	for idx, attemptModel := range models {
-		payload := append([]byte(nil), basePayload...)
-		if action == "countTokens" {
-			payload = deleteJSONField(payload, "project")
-			payload = deleteJSONField(payload, "model")
-		} else {
-			payload = setJSONField(payload, "project", projectID)
-			payload = setJSONField(payload, "model", attemptModel)
-		}
-
-		tok, errTok := tokenSource.Token()
-		if errTok != nil {
-			err = errTok
-			return resp, err
-		}
-		updateGeminiCLITokenMetadata(auth, baseTokenData, tok)
-
-		url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, action)
-		if opts.Alt != "" && action != "countTokens" {
-			url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
-		}
-
-		reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload))
-		if errReq != nil {
-			err = errReq
-			return resp, err
-		}
-		reqHTTP.Header.Set("Content-Type", "application/json")
-		reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken)
-		applyGeminiCLIHeaders(reqHTTP)
-		reqHTTP.Header.Set("Accept", "application/json")
-		recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
-			URL:       url,
-			Method:    http.MethodPost,
-			Headers:   reqHTTP.Header.Clone(),
-			Body:      payload,
-			Provider:  e.Identifier(),
-			AuthID:    authID,
-			AuthLabel: authLabel,
-			AuthType:  authType,
-			AuthValue: authValue,
-		})
-
-		httpResp, errDo := httpClient.Do(reqHTTP)
-		if errDo != nil {
-			recordAPIResponseError(ctx, e.cfg, errDo)
-			err = errDo
-			return resp, err
-		}
-
-		data, errRead := io.ReadAll(httpResp.Body)
-		if errClose := httpResp.Body.Close(); errClose != nil {
-			log.Errorf("gemini cli executor: close response body error: %v", errClose)
-		}
-		recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
-		if errRead != nil {
-			recordAPIResponseError(ctx, e.cfg, errRead)
-			err = errRead
-			return resp, err
-		}
-		appendAPIResponseChunk(ctx, e.cfg, data)
-		if httpResp.StatusCode >= 200 && httpResp.StatusCode < 300 {
-			reporter.publish(ctx, parseGeminiCLIUsage(data))
-			var param any
-			out := sdktranslator.TranslateNonStream(respCtx, to, from, attemptModel, bytes.Clone(opts.OriginalRequest), payload, data, &param)
-			resp = cliproxyexecutor.Response{Payload: []byte(out)}
-			return resp, nil
-		}
-
-		lastStatus = httpResp.StatusCode
-		lastBody = append([]byte(nil), data...)
-		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
-		if httpResp.StatusCode == 429 {
-			if idx+1 < len(models) {
-				log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1])
+		// Inner retry loop for 429 errors on the same model
+		for retryCount := 0; retryCount <= maxRetries; retryCount++ {
+			payload := append([]byte(nil), basePayload...)
+			if action == "countTokens" {
+				payload = deleteJSONField(payload, "project")
+				payload = deleteJSONField(payload, "model")
 			} else {
-				log.Debug("gemini cli executor: rate limited, no additional fallback model")
+				payload = setJSONField(payload, "project", projectID)
+				payload = setJSONField(payload, "model", attemptModel)
 			}
-			continue
-		}
 
-		err = statusErr{code: httpResp.StatusCode, msg: string(data)}
-		return resp, err
+			tok, errTok := tokenSource.Token()
+			if errTok != nil {
+				err = errTok
+				return resp, err
+			}
+			updateGeminiCLITokenMetadata(auth, baseTokenData, tok)
+
+			url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, action)
+			if opts.Alt != "" && action != "countTokens" {
+				url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
+			}
+
+			reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload))
+			if errReq != nil {
+				err = errReq
+				return resp, err
+			}
+			reqHTTP.Header.Set("Content-Type", "application/json")
+			reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken)
+			applyGeminiCLIHeaders(reqHTTP)
+			reqHTTP.Header.Set("Accept", "application/json")
+			recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
+				URL:       url,
+				Method:    http.MethodPost,
+				Headers:   reqHTTP.Header.Clone(),
+				Body:      payload,
+				Provider:  e.Identifier(),
+				AuthID:    authID,
+				AuthLabel: authLabel,
+				AuthType:  authType,
+				AuthValue: authValue,
+			})
+
+			httpResp, errDo := httpClient.Do(reqHTTP)
+			if errDo != nil {
+				recordAPIResponseError(ctx, e.cfg, errDo)
+				err = errDo
+				return resp, err
+			}
+
+			data, errRead := io.ReadAll(httpResp.Body)
+			if errClose := httpResp.Body.Close(); errClose != nil {
+				log.Errorf("gemini cli executor: close response body error: %v", errClose)
+			}
+			recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
+			if errRead != nil {
+				recordAPIResponseError(ctx, e.cfg, errRead)
+				err = errRead
+				return resp, err
+			}
+			appendAPIResponseChunk(ctx, e.cfg, data)
+			if httpResp.StatusCode >= 200 && httpResp.StatusCode < 300 {
+				reporter.publish(ctx, parseGeminiCLIUsage(data))
+				var param any
+				out := sdktranslator.TranslateNonStream(respCtx, to, from, attemptModel, bytes.Clone(opts.OriginalRequest), payload, data, &param)
+				resp = cliproxyexecutor.Response{Payload: []byte(out)}
+				return resp, nil
+			}
+
+			lastStatus = httpResp.StatusCode
+			lastBody = append([]byte(nil), data...)
+			log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
+
+			// Handle 429 rate limit errors with retry
+			if httpResp.StatusCode == 429 {
+				if retryCount < maxRetries {
+					// Parse retry delay from Google's response
+					retryDelay := parseRetryDelay(data)
+					log.Infof("gemini cli executor: rate limited (429), retrying model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries)
+
+					// Wait for the specified delay
+					select {
+					case <-time.After(retryDelay):
+						// Continue to next retry iteration
+						continue
+					case <-ctx.Done():
+						// Context cancelled, return immediately
+						err = ctx.Err()
+						return resp, err
+					}
+				} else {
+					// Exhausted retries for this model, try next model if available
+					if idx+1 < len(models) {
+						log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1])
+						break // Break inner loop to try next model
+					} else {
+						log.Infof("gemini cli executor: rate limited, exhausted %d retries for model %s, no additional fallback model", maxRetries, attemptModel)
+						// No more models to try, will return error below
+					}
+				}
+			} else {
+				// Non-429 error, don't retry this model
+				err = statusErr{code: httpResp.StatusCode, msg: string(data)}
+				return resp, err
+			}
+
+			// Break inner loop if we hit this point (no retry needed or exhausted retries)
+			break
+		}
 	}
 
 	if len(lastBody) > 0 {
@@ -235,77 +269,120 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 	var lastStatus int
 	var lastBody []byte
 
+	// Get max retry count from config, default to 3 if not set
+	maxRetries := e.cfg.RequestRetry
+	if maxRetries <= 0 {
+		maxRetries = 3
+	}
+
 	for idx, attemptModel := range models {
-		payload := append([]byte(nil), basePayload...)
-		payload = setJSONField(payload, "project", projectID)
-		payload = setJSONField(payload, "model", attemptModel)
+		var httpResp *http.Response
+		var payload []byte
+		var errDo error
 
-		tok, errTok := tokenSource.Token()
-		if errTok != nil {
-			err = errTok
-			return nil, err
-		}
-		updateGeminiCLITokenMetadata(auth, baseTokenData, tok)
+		// Inner retry loop for 429 errors on the same model
+		for retryCount := 0; retryCount <= maxRetries; retryCount++ {
+			payload = append([]byte(nil), basePayload...)
+			payload = setJSONField(payload, "project", projectID)
+			payload = setJSONField(payload, "model", attemptModel)
 
-		url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, "streamGenerateContent")
-		if opts.Alt == "" {
-			url = url + "?alt=sse"
-		} else {
-			url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
-		}
-
-		reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload))
-		if errReq != nil {
-			err = errReq
-			return nil, err
-		}
-		reqHTTP.Header.Set("Content-Type", "application/json")
-		reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken)
-		applyGeminiCLIHeaders(reqHTTP)
-		reqHTTP.Header.Set("Accept", "text/event-stream")
-		recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
-			URL:       url,
-			Method:    http.MethodPost,
-			Headers:   reqHTTP.Header.Clone(),
-			Body:      payload,
-			Provider:  e.Identifier(),
-			AuthID:    authID,
-			AuthLabel: authLabel,
-			AuthType:  authType,
-			AuthValue: authValue,
-		})
-
-		httpResp, errDo := httpClient.Do(reqHTTP)
-		if errDo != nil {
-			recordAPIResponseError(ctx, e.cfg, errDo)
-			err = errDo
-			return nil, err
-		}
-		recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
-		if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-			data, errRead := io.ReadAll(httpResp.Body)
-			if errClose := httpResp.Body.Close(); errClose != nil {
-				log.Errorf("gemini cli executor: close response body error: %v", errClose)
-			}
-			if errRead != nil {
-				recordAPIResponseError(ctx, e.cfg, errRead)
-				err = errRead
+			tok, errTok := tokenSource.Token()
+			if errTok != nil {
+				err = errTok
 				return nil, err
 			}
-			appendAPIResponseChunk(ctx, e.cfg, data)
-			lastStatus = httpResp.StatusCode
-			lastBody = append([]byte(nil), data...)
-			log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
-			if httpResp.StatusCode == 429 {
-				if idx+1 < len(models) {
-					log.Debugf("gemini cli executor: rate limited, retrying with next model: %s", models[idx+1])
-				} else {
-					log.Debug("gemini cli executor: rate limited, no additional fallback model")
-				}
-				continue
+			updateGeminiCLITokenMetadata(auth, baseTokenData, tok)
+
+			url := fmt.Sprintf("%s/%s:%s", codeAssistEndpoint, codeAssistVersion, "streamGenerateContent")
+			if opts.Alt == "" {
+				url = url + "?alt=sse"
+			} else {
+				url = url + fmt.Sprintf("?$alt=%s", opts.Alt)
 			}
-			err = statusErr{code: httpResp.StatusCode, msg: string(data)}
-			return nil, err
+
+			reqHTTP, errReq := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload))
+			if errReq != nil {
+				err = errReq
+				return nil, err
+			}
+			reqHTTP.Header.Set("Content-Type", "application/json")
+			reqHTTP.Header.Set("Authorization", "Bearer "+tok.AccessToken)
+			applyGeminiCLIHeaders(reqHTTP)
+			reqHTTP.Header.Set("Accept", "text/event-stream")
+			recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
+				URL:       url,
+				Method:    http.MethodPost,
+				Headers:   reqHTTP.Header.Clone(),
+				Body:      payload,
+				Provider:  e.Identifier(),
+				AuthID:    authID,
+				AuthLabel: authLabel,
+				AuthType:  authType,
+				AuthValue: authValue,
+			})
+
+			httpResp, errDo = httpClient.Do(reqHTTP)
+			if errDo != nil {
+				recordAPIResponseError(ctx, e.cfg, errDo)
+				err = errDo
+				return nil, err
+			}
+			recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
+			if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
+				data, errRead := io.ReadAll(httpResp.Body)
+				if errClose := httpResp.Body.Close(); errClose != nil {
+					log.Errorf("gemini cli executor: close response body error: %v", errClose)
+				}
+				if errRead != nil {
+					recordAPIResponseError(ctx, e.cfg, errRead)
+					err = errRead
+					return nil, err
+				}
+				appendAPIResponseChunk(ctx, e.cfg, data)
+				lastStatus = httpResp.StatusCode
+				lastBody = append([]byte(nil), data...)
+				log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
+
+				// Handle 429 rate limit errors with retry
+				if httpResp.StatusCode == 429 {
+					if retryCount < maxRetries {
+						// Parse retry delay from Google's response
+						retryDelay := parseRetryDelay(data)
+						log.Infof("gemini cli executor: rate limited (429), retrying stream model %s in %v (attempt %d/%d)", attemptModel, retryDelay, retryCount+1, maxRetries)
+
+						// Wait for the specified delay
+						select {
+						case <-time.After(retryDelay):
+							// Continue to next retry iteration
+							continue
+						case <-ctx.Done():
+							// Context cancelled, return immediately
+							err = ctx.Err()
+							return nil, err
+						}
+					} else {
+						// Exhausted retries for this model, try next model if available
+						if idx+1 < len(models) {
+							log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1])
+							break // Break inner loop to try next model
+						} else {
+							log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, no additional fallback model", maxRetries, attemptModel)
+							// No more models to try, will return error below
+						}
+					}
+				} else {
+					// Non-429 error, don't retry this model
+					err = statusErr{code: httpResp.StatusCode, msg: string(data)}
+					return nil, err
+				}
+
+				// Break inner loop if we hit this point (no retry needed or exhausted retries)
+				break
+			}
+
+			// Success - httpResp.StatusCode is 2xx, break out of retry loop
+			// and proceed to streaming logic below
+			break
 		}
 
 		out := make(chan cliproxyexecutor.StreamChunk)
@@ -769,3 +846,48 @@ func fixGeminiCLIImageAspectRatio(modelName string, rawJSON []byte) []byte {
 	}
 	return rawJSON
 }
+
+// parseRetryDelay extracts the wait time from a Google API 429 error body by scanning
+// error.details for a google.rpc.RetryInfo entry and parsing its retryDelay (e.g. "0.847655010s").
+// Delays above 60s are capped at 60s; a missing, unparsable, or negative delay yields the 1s default.
+func parseRetryDelay(errorBody []byte) time.Duration {
+	const defaultDelay = 1 * time.Second
+	const maxDelay = 60 * time.Second
+
+	// Locate the details array of the error response; each entry is identified by its @type.
+	// Wanted: error.details[].retryDelay where @type == "type.googleapis.com/google.rpc.RetryInfo"
+	details := gjson.GetBytes(errorBody, "error.details")
+	if !details.Exists() || !details.IsArray() {
+		log.Debugf("parseRetryDelay: no error.details found, using default delay %v", defaultDelay)
+		return defaultDelay
+	}
+
+	for _, detail := range details.Array() {
+		typeVal := detail.Get("@type").String()
+		if typeVal == "type.googleapis.com/google.rpc.RetryInfo" {
+			retryDelay := detail.Get("retryDelay").String()
+			if retryDelay != "" {
+				// The value is a Go-parsable duration string such as "0.847655010s".
+				duration, err := time.ParseDuration(retryDelay)
+				if err != nil {
+					log.Debugf("parseRetryDelay: failed to parse duration %q: %v, using default", retryDelay, err)
+					return defaultDelay
+				}
+				// Cap at maxDelay so a large server-supplied delay cannot hold the request for long.
+				if duration > maxDelay {
+					log.Debugf("parseRetryDelay: capping delay from %v to %v", duration, maxDelay)
+					return maxDelay
+				}
+				if duration < 0 {
+					log.Debugf("parseRetryDelay: negative delay %v, using default", duration)
+					return defaultDelay
+				}
+				log.Debugf("parseRetryDelay: using delay %v from API response", duration)
+				return duration
+			}
+		}
+	}
+
+	log.Debugf("parseRetryDelay: no RetryInfo found, using default delay %v", defaultDelay)
+	return defaultDelay
+}

From ede4471b84bffc11fe78781c54ea710831acc6fb Mon Sep 17 00:00:00 2001
From: Ben Vargas <ben@vargas.com>
Date: Wed, 19 Nov 2025 12:46:13 -0700
Subject: [PATCH 3/5] feat(translator): add default thinkingConfig for
 gemini-3-pro-preview

Match official Gemini CLI behavior by always sending default
thinkingConfig when client doesn't specify reasoning parameters.

- Set thinkingBudget=-1 (dynamic) for gemini-3-pro-preview
- Set include_thoughts=true to return thinking process
- Apply to both /v1/chat/completions and /v1/responses endpoints
- See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts
---
 .../chat-completions/gemini-cli_openai_request.go    |  9 +++++++++
 .../responses/gemini_openai-responses_request.go     | 12 ++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go
index 6f7ac724..99b50366 100644
--- a/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go
+++ b/internal/translator/gemini-cli/openai/chat-completions/gemini-cli_openai_request.go
@@ -88,6 +88,15 @@ func ConvertOpenAIRequestToGeminiCLI(modelName string, inputRawJSON []byte, _ bo
 		}
 	}
 
+	// For gemini-3-pro-preview, always send default thinkingConfig when none specified.
+	// This matches the official Gemini CLI behavior which always sends:
+	// { thinkingBudget: -1, includeThoughts: true }
+	// See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts
+	if !gjson.GetBytes(out, "request.generationConfig.thinkingConfig").Exists() && modelName == "gemini-3-pro-preview" {
+		out, _ = sjson.SetBytes(out, "request.generationConfig.thinkingConfig.thinkingBudget", -1)
+		out, _ = sjson.SetBytes(out, "request.generationConfig.thinkingConfig.include_thoughts", true)
+	}
+
 	// Temperature/top_p/top_k
 	if tr := gjson.GetBytes(rawJSON, "temperature"); tr.Exists() && tr.Type == gjson.Number {
 		out, _ = sjson.SetBytes(out, "request.generationConfig.temperature", tr.Num)
diff --git a/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go b/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go
index 4eeebf3c..981fafc1 100644
--- a/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go
+++ b/internal/translator/gemini/openai/responses/gemini_openai-responses_request.go
@@ -6,6 +6,7 @@ import (
 
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/translator/gemini/common"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/util"
+	log "github.com/sirupsen/logrus"
 	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"
 )
@@ -294,6 +295,17 @@ func ConvertOpenAIResponsesRequestToGemini(modelName string, inputRawJSON []byte
 			}
 		}
 	}
+
+	// For gemini-3-pro-preview, always send default thinkingConfig when none specified.
+	// This matches the official Gemini CLI behavior which always sends:
+	// { thinkingBudget: -1, includeThoughts: true }
+	// See: ai-gemini-cli/packages/core/src/config/defaultModelConfigs.ts
+	if !gjson.Get(out, "generationConfig.thinkingConfig").Exists() && modelName == "gemini-3-pro-preview" {
+		out, _ = sjson.Set(out, "generationConfig.thinkingConfig.thinkingBudget", -1)
+		out, _ = sjson.Set(out, "generationConfig.thinkingConfig.include_thoughts", true)
+		log.Debugf("Applied default thinkingConfig for gemini-3-pro-preview (matches Gemini CLI): thinkingBudget=-1, include_thoughts=true")
+	}
+
 	result := []byte(out)
 	result = common.AttachDefaultSafetySettings(result, "safetySettings")
 	return result

From ed23472d9427d55a68656a898902797b62a15a7e Mon Sep 17 00:00:00 2001
From: Ben Vargas <ben@vargas.com>
Date: Wed, 19 Nov 2025 13:05:38 -0700
Subject: [PATCH 4/5] fix(executor): prevent streaming from 429 response when
 fallback available

Fix critical bug where ExecuteStream would create a streaming channel
using a 429 error response instead of continuing to the next fallback
model after exhausting retries.

When 429 retries were exhausted and a fallback model was available,
the inner retry loop would break but immediately fall through to the
streaming channel creation, attempting to stream from the failed 429
response instead of trying the next model.

Solution: Add shouldContinueToNextModel flag to explicitly skip the
streaming logic and continue the outer model loop when appropriate.

Identified by: codex-bot review
Ref: https://github.com/router-for-me/CLIProxyAPI/pull/280#pullrequestreview-3484479106
---
 internal/runtime/executor/gemini_cli_executor.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go
index 5e932fbd..294761c8 100644
--- a/internal/runtime/executor/gemini_cli_executor.go
+++ b/internal/runtime/executor/gemini_cli_executor.go
@@ -279,6 +279,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 		var httpResp *http.Response
 		var payload []byte
 		var errDo error
+		shouldContinueToNextModel := false
 
 		// Inner retry loop for 429 errors on the same model
 		for retryCount := 0; retryCount <= maxRetries; retryCount++ {
@@ -364,6 +365,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 						// Exhausted retries for this model, try next model if available
 						if idx+1 < len(models) {
 							log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, trying fallback model: %s", maxRetries, attemptModel, models[idx+1])
+							shouldContinueToNextModel = true
 							break // Break inner loop to try next model
 						} else {
 							log.Infof("gemini cli executor: rate limited, exhausted %d retries for stream model %s, no additional fallback model", maxRetries, attemptModel)
@@ -385,6 +387,11 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 			break
 		}
 
+		// If we need to try the next fallback model, skip streaming logic
+		if shouldContinueToNextModel {
+			continue
+		}
+
 		out := make(chan cliproxyexecutor.StreamChunk)
 		stream = out
 		go func(resp *http.Response, reqBody []byte, attempt string) {

From 0ff094b87f9b20b965ba45542eae302b4006e23f Mon Sep 17 00:00:00 2001
From: Ben Vargas <ben@vargas.com>
Date: Wed, 19 Nov 2025 13:14:40 -0700
Subject: [PATCH 5/5] fix(executor): prevent streaming on failed response when
 no fallback

Fix critical bug where ExecuteStream would create a streaming channel
from a failed (non-2xx) response after exhausting all retries with no
fallback models available.

When retries were exhausted on the last model, the code would break from
the inner loop but fall through to streaming channel creation (line 401),
immediately returning at line 461. This made the error handling code at
lines 464-471 unreachable, causing clients to receive an empty/closed
stream instead of a proper error response.

Solution: Check if httpResp is non-2xx before creating the streaming
channel. If failed, continue the outer loop to reach error handling.

Identified by: codex-bot review
Ref: https://github.com/router-for-me/CLIProxyAPI/pull/280#pullrequestreview-3484560423
---
 internal/runtime/executor/gemini_cli_executor.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/internal/runtime/executor/gemini_cli_executor.go b/internal/runtime/executor/gemini_cli_executor.go
index 294761c8..2f48871b 100644
--- a/internal/runtime/executor/gemini_cli_executor.go
+++ b/internal/runtime/executor/gemini_cli_executor.go
@@ -392,6 +392,12 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 			continue
 		}
 
+		// If we have a failed response (non-2xx), don't attempt streaming
+		// Continue outer loop to try next model or return error
+		if httpResp == nil || httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
+			continue
+		}
+
 		out := make(chan cliproxyexecutor.StreamChunk)
 		stream = out
 		go func(resp *http.Response, reqBody []byte, attempt string) {