feat(translator): add partial and full image generation support in Codex-GPT and Codex-Gemini flows

- Introduced `LastImageHashByItemID` in Codex-GPT and `LastImageHashByID` in Codex-Gemini for deduplication of generated images.
- Added support for handling `partial_image` and `image_generation_call` types, with inline data embedding for Gemini and URL payload conversion for GPT.
- Extended unit tests to verify image handling in both streaming and non-streaming modes.
This commit is contained in:
Luis Pater
2026-04-19 03:21:59 +08:00
parent c6baa64b4e
commit 86c856f56f
4 changed files with 351 additions and 1 deletions
@@ -7,6 +7,8 @@ package gemini
import (
"bytes"
"context"
"crypto/sha256"
"strings"
"time"
translatorcommon "github.com/router-for-me/CLIProxyAPI/v6/internal/translator/common"
@@ -25,6 +27,7 @@ type ConvertCodexResponseToGeminiParams struct {
ResponseID string
LastStorageOutput []byte
HasOutputTextDelta bool
LastImageHashByID map[string][32]byte
}
// ConvertCodexResponseToGemini converts Codex streaming response format to Gemini format.
@@ -48,6 +51,7 @@ func ConvertCodexResponseToGemini(_ context.Context, modelName string, originalR
ResponseID: "",
LastStorageOutput: nil,
HasOutputTextDelta: false,
LastImageHashByID: make(map[string][32]byte),
}
}
@@ -74,10 +78,63 @@ func ConvertCodexResponseToGemini(_ context.Context, modelName string, originalR
template, _ = sjson.SetBytes(template, "responseId", params.ResponseID)
}
if typeStr == "response.image_generation_call.partial_image" {
itemID := rootResult.Get("item_id").String()
b64 := rootResult.Get("partial_image_b64").String()
if b64 == "" {
return [][]byte{}
}
if itemID != "" {
if params.LastImageHashByID == nil {
params.LastImageHashByID = make(map[string][32]byte)
}
hash := sha256.Sum256([]byte(b64))
if last, ok := params.LastImageHashByID[itemID]; ok && last == hash {
return [][]byte{}
}
params.LastImageHashByID[itemID] = hash
}
outputFormat := rootResult.Get("output_format").String()
mimeType := mimeTypeFromCodexOutputFormat(outputFormat)
part := []byte(`{"inlineData":{"data":"","mimeType":""}}`)
part, _ = sjson.SetBytes(part, "inlineData.data", b64)
part, _ = sjson.SetBytes(part, "inlineData.mimeType", mimeType)
template, _ = sjson.SetRawBytes(template, "candidates.0.content.parts.-1", part)
return [][]byte{template}
}
// Handle function call completion
if typeStr == "response.output_item.done" {
itemResult := rootResult.Get("item")
itemType := itemResult.Get("type").String()
if itemType == "image_generation_call" {
itemID := itemResult.Get("id").String()
b64 := itemResult.Get("result").String()
if b64 == "" {
return [][]byte{}
}
if itemID != "" {
if params.LastImageHashByID == nil {
params.LastImageHashByID = make(map[string][32]byte)
}
hash := sha256.Sum256([]byte(b64))
if last, ok := params.LastImageHashByID[itemID]; ok && last == hash {
return [][]byte{}
}
params.LastImageHashByID[itemID] = hash
}
outputFormat := itemResult.Get("output_format").String()
mimeType := mimeTypeFromCodexOutputFormat(outputFormat)
part := []byte(`{"inlineData":{"data":"","mimeType":""}}`)
part, _ = sjson.SetBytes(part, "inlineData.data", b64)
part, _ = sjson.SetBytes(part, "inlineData.mimeType", mimeType)
template, _ = sjson.SetRawBytes(template, "candidates.0.content.parts.-1", part)
return [][]byte{template}
}
if itemType == "function_call" {
// Create function call part
functionCall := []byte(`{"functionCall":{"name":"","args":{}}}`)
@@ -270,6 +327,20 @@ func ConvertCodexResponseToGeminiNonStream(_ context.Context, modelName string,
})
}
case "image_generation_call":
flushPendingFunctionCalls()
b64 := value.Get("result").String()
if b64 == "" {
break
}
outputFormat := value.Get("output_format").String()
mimeType := mimeTypeFromCodexOutputFormat(outputFormat)
part := []byte(`{"inlineData":{"data":"","mimeType":""}}`)
part, _ = sjson.SetBytes(part, "inlineData.data", b64)
part, _ = sjson.SetBytes(part, "inlineData.mimeType", mimeType)
template, _ = sjson.SetRawBytes(template, "candidates.0.content.parts.-1", part)
case "function_call":
// Collect function call for potential merging with consecutive ones
hasToolCall = true
@@ -342,3 +413,24 @@ func buildReverseMapFromGeminiOriginal(original []byte) map[string]string {
func GeminiTokenCount(ctx context.Context, count int64) []byte {
// ctx is accepted but not read in this body; the count is handed straight to
// the shared translator helper and its result is returned unchanged.
return translatorcommon.GeminiTokenCountJSON(count)
}
// mimeTypeFromCodexOutputFormat turns a Codex image output_format value into a
// MIME type string.
//
// An empty or unrecognised format yields "image/png". A value that already
// contains a slash is returned untouched. Bare extensions ("png", "jpg",
// "jpeg", "webp", "gif") are matched case-insensitively.
func mimeTypeFromCodexOutputFormat(outputFormat string) string {
	const fallback = "image/png"

	if outputFormat == "" {
		return fallback
	}
	// Anything with a slash is passed through verbatim, without validation.
	if strings.IndexByte(outputFormat, '/') >= 0 {
		return outputFormat
	}

	ext := strings.ToLower(outputFormat)
	if ext == "jpg" || ext == "jpeg" {
		return "image/jpeg"
	}
	if ext == "webp" {
		return "image/webp"
	}
	if ext == "gif" {
		return "image/gif"
	}
	// "png" and every unknown extension share the same result.
	return fallback
}
@@ -33,3 +33,79 @@ func TestConvertCodexResponseToGemini_StreamEmptyOutputUsesOutputItemDoneMessage
t.Fatalf("expected fallback content from response.output_item.done message; outputs=%q", outputs)
}
}
// TestConvertCodexResponseToGemini_StreamPartialImageEmitsInlineData feeds one
// partial_image stream event through the converter twice with shared state.
// The first pass must produce a single chunk carrying the base64 payload and
// MIME type as inlineData; the identical second pass must produce nothing.
func TestConvertCodexResponseToGemini_StreamPartialImageEmitsInlineData(t *testing.T) {
	const (
		wantData = "aGVsbG8="
		wantMIME = "image/png"
	)

	ctx := context.Background()
	request := []byte(`{"tools":[]}`)
	event := []byte(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`)

	// The converter keeps its per-stream state behind this pointer, so both
	// calls below must share it.
	var state any

	first := ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", request, nil, event, &state)
	if n := len(first); n != 1 {
		t.Fatalf("expected 1 chunk, got %d", n)
	}

	emitted := first[0]
	if data := gjson.GetBytes(emitted, "candidates.0.content.parts.0.inlineData.data").String(); data != wantData {
		t.Fatalf("expected inlineData.data %q, got %q; chunk=%s", wantData, data, string(emitted))
	}
	if mime := gjson.GetBytes(emitted, "candidates.0.content.parts.0.inlineData.mimeType").String(); mime != wantMIME {
		t.Fatalf("expected inlineData.mimeType %q, got %q; chunk=%s", wantMIME, mime, string(emitted))
	}

	// Replaying the very same event must be dropped as a duplicate.
	second := ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", request, nil, event, &state)
	if n := len(second); n != 0 {
		t.Fatalf("expected duplicate image chunk to be suppressed, got %d", n)
	}
}
// TestConvertCodexResponseToGemini_StreamImageGenerationCallDoneEmitsInlineData
// drives three events for the same item through one stream state:
//  1. a partial image, which must be emitted;
//  2. an output_item.done with the same payload, which must be suppressed;
//  3. an output_item.done with a different payload and format, which must be
//     emitted with the new data and MIME type.
func TestConvertCodexResponseToGemini_StreamImageGenerationCallDoneEmitsInlineData(t *testing.T) {
	const (
		partialEvent   = `data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`
		sameDoneEvent  = `data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"png","result":"aGVsbG8="}}`
		freshDoneEvent = `data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"jpeg","result":"Ymll"}}`

		wantData = "Ymll"
		wantMIME = "image/jpeg"
	)

	ctx := context.Background()
	request := []byte(`{"tools":[]}`)
	var state any

	// feed pushes one raw stream line through the converter using the shared state.
	feed := func(line string) [][]byte {
		return ConvertCodexResponseToGemini(ctx, "gemini-2.5-pro", request, nil, []byte(line), &state)
	}

	if chunks := feed(partialEvent); len(chunks) != 1 {
		t.Fatalf("expected 1 chunk, got %d", len(chunks))
	}

	if chunks := feed(sameDoneEvent); len(chunks) != 0 {
		t.Fatalf("expected output_item.done to be suppressed when identical to last partial image, got %d", len(chunks))
	}

	chunks := feed(freshDoneEvent)
	if len(chunks) != 1 {
		t.Fatalf("expected 1 chunk, got %d", len(chunks))
	}

	emitted := chunks[0]
	if data := gjson.GetBytes(emitted, "candidates.0.content.parts.0.inlineData.data").String(); data != wantData {
		t.Fatalf("expected inlineData.data %q, got %q; chunk=%s", wantData, data, string(emitted))
	}
	if mime := gjson.GetBytes(emitted, "candidates.0.content.parts.0.inlineData.mimeType").String(); mime != wantMIME {
		t.Fatalf("expected inlineData.mimeType %q, got %q; chunk=%s", wantMIME, mime, string(emitted))
	}
}
// TestConvertCodexResponseToGemini_NonStreamImageGenerationCallAddsInlineDataPart
// converts a completed response whose output holds a text message followed by
// an image_generation_call, and checks that the image appears as the second
// part (index 1) with the expected inlineData payload and MIME type.
func TestConvertCodexResponseToGemini_NonStreamImageGenerationCallAddsInlineDataPart(t *testing.T) {
	const (
		wantData = "aGVsbG8="
		wantMIME = "image/png"
	)

	request := []byte(`{"tools":[]}`)
	completed := []byte(`{"type":"response.completed","response":{"id":"resp_123","created_at":1700000000,"usage":{"input_tokens":1,"output_tokens":1},"output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]},{"type":"image_generation_call","output_format":"png","result":"aGVsbG8="}]}}`)

	converted := ConvertCodexResponseToGeminiNonStream(context.Background(), "gemini-2.5-pro", request, nil, completed, nil)

	if data := gjson.GetBytes(converted, "candidates.0.content.parts.1.inlineData.data").String(); data != wantData {
		t.Fatalf("expected inlineData.data %q, got %q; chunk=%s", wantData, data, string(converted))
	}
	if mime := gjson.GetBytes(converted, "candidates.0.content.parts.1.inlineData.mimeType").String(); mime != wantMIME {
		t.Fatalf("expected inlineData.mimeType %q, got %q; chunk=%s", wantMIME, mime, string(converted))
	}
}