feat(translator): add partial and full image generation support in Codex-GPT and Codex-Gemini flows

- Introduced `LastImageHashByItemID` in Codex-GPT and `LastImageHashByID` in Codex-Gemini for deduplication of generated images.
- Added support for handling `partial_image` and `image_generation_call` types, with inline data embedding for Gemini and conversion to base64 data-URL `image_url` payloads for GPT.
- Extended unit tests to verify image handling in both streaming and non-streaming modes.
This commit is contained in:
Luis Pater
2026-04-19 03:21:59 +08:00
parent c6baa64b4e
commit 86c856f56f
4 changed files with 351 additions and 1 deletions
@@ -8,6 +8,8 @@ package chat_completions
import (
"bytes"
"context"
"crypto/sha256"
"strings"
"time"
"github.com/tidwall/gjson"
@@ -26,6 +28,7 @@ type ConvertCliToOpenAIParams struct {
FunctionCallIndex int
HasReceivedArgumentsDelta bool
HasToolCallAnnounced bool
LastImageHashByItemID map[string][32]byte
}
// ConvertCodexResponseToOpenAI translates a single chunk of a streaming response from the
@@ -51,6 +54,7 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR
FunctionCallIndex: -1,
HasReceivedArgumentsDelta: false,
HasToolCallAnnounced: false,
LastImageHashByItemID: make(map[string][32]byte),
}
}
@@ -70,6 +74,9 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR
(*param).(*ConvertCliToOpenAIParams).ResponseID = rootResult.Get("response.id").String()
(*param).(*ConvertCliToOpenAIParams).CreatedAt = rootResult.Get("response.created_at").Int()
(*param).(*ConvertCliToOpenAIParams).Model = rootResult.Get("response.model").String()
if (*param).(*ConvertCliToOpenAIParams).LastImageHashByItemID == nil {
(*param).(*ConvertCliToOpenAIParams).LastImageHashByItemID = make(map[string][32]byte)
}
return [][]byte{}
}
@@ -120,6 +127,39 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR
template, _ = sjson.SetBytes(template, "choices.0.delta.role", "assistant")
template, _ = sjson.SetBytes(template, "choices.0.delta.content", deltaResult.String())
}
} else if dataType == "response.image_generation_call.partial_image" {
itemID := rootResult.Get("item_id").String()
b64 := rootResult.Get("partial_image_b64").String()
if b64 == "" {
return [][]byte{}
}
if itemID != "" {
p := (*param).(*ConvertCliToOpenAIParams)
if p.LastImageHashByItemID == nil {
p.LastImageHashByItemID = make(map[string][32]byte)
}
hash := sha256.Sum256([]byte(b64))
if last, ok := p.LastImageHashByItemID[itemID]; ok && last == hash {
return [][]byte{}
}
p.LastImageHashByItemID[itemID] = hash
}
outputFormat := rootResult.Get("output_format").String()
mimeType := mimeTypeFromCodexOutputFormat(outputFormat)
imageURL := "data:" + mimeType + ";base64," + b64
imagesResult := gjson.GetBytes(template, "choices.0.delta.images")
if !imagesResult.Exists() || !imagesResult.IsArray() {
template, _ = sjson.SetRawBytes(template, "choices.0.delta.images", []byte(`[]`))
}
imageIndex := len(gjson.GetBytes(template, "choices.0.delta.images").Array())
imagePayload := []byte(`{"type":"image_url","image_url":{"url":""}}`)
imagePayload, _ = sjson.SetBytes(imagePayload, "index", imageIndex)
imagePayload, _ = sjson.SetBytes(imagePayload, "image_url.url", imageURL)
template, _ = sjson.SetBytes(template, "choices.0.delta.role", "assistant")
template, _ = sjson.SetRawBytes(template, "choices.0.delta.images.-1", imagePayload)
} else if dataType == "response.completed" {
finishReason := "stop"
if (*param).(*ConvertCliToOpenAIParams).FunctionCallIndex != -1 {
@@ -183,7 +223,46 @@ func ConvertCodexResponseToOpenAI(_ context.Context, modelName string, originalR
} else if dataType == "response.output_item.done" {
itemResult := rootResult.Get("item")
if !itemResult.Exists() || itemResult.Get("type").String() != "function_call" {
if !itemResult.Exists() {
return [][]byte{}
}
itemType := itemResult.Get("type").String()
if itemType == "image_generation_call" {
itemID := itemResult.Get("id").String()
b64 := itemResult.Get("result").String()
if b64 == "" {
return [][]byte{}
}
if itemID != "" {
p := (*param).(*ConvertCliToOpenAIParams)
if p.LastImageHashByItemID == nil {
p.LastImageHashByItemID = make(map[string][32]byte)
}
hash := sha256.Sum256([]byte(b64))
if last, ok := p.LastImageHashByItemID[itemID]; ok && last == hash {
return [][]byte{}
}
p.LastImageHashByItemID[itemID] = hash
}
outputFormat := itemResult.Get("output_format").String()
mimeType := mimeTypeFromCodexOutputFormat(outputFormat)
imageURL := "data:" + mimeType + ";base64," + b64
imagesResult := gjson.GetBytes(template, "choices.0.delta.images")
if !imagesResult.Exists() || !imagesResult.IsArray() {
template, _ = sjson.SetRawBytes(template, "choices.0.delta.images", []byte(`[]`))
}
imageIndex := len(gjson.GetBytes(template, "choices.0.delta.images").Array())
imagePayload := []byte(`{"type":"image_url","image_url":{"url":""}}`)
imagePayload, _ = sjson.SetBytes(imagePayload, "index", imageIndex)
imagePayload, _ = sjson.SetBytes(imagePayload, "image_url.url", imageURL)
template, _ = sjson.SetBytes(template, "choices.0.delta.role", "assistant")
template, _ = sjson.SetRawBytes(template, "choices.0.delta.images.-1", imagePayload)
return [][]byte{template}
}
if itemType != "function_call" {
return [][]byte{}
}
@@ -285,6 +364,7 @@ func ConvertCodexResponseToOpenAINonStream(_ context.Context, _ string, original
// Process the output array for content and function calls
var toolCalls [][]byte
var images [][]byte
outputResult := responseResult.Get("output")
if outputResult.IsArray() {
outputArray := outputResult.Array()
@@ -339,6 +419,19 @@ func ConvertCodexResponseToOpenAINonStream(_ context.Context, _ string, original
}
toolCalls = append(toolCalls, functionCallTemplate)
case "image_generation_call":
b64 := outputItem.Get("result").String()
if b64 == "" {
break
}
outputFormat := outputItem.Get("output_format").String()
mimeType := mimeTypeFromCodexOutputFormat(outputFormat)
imageURL := "data:" + mimeType + ";base64," + b64
imagePayload := []byte(`{"type":"image_url","image_url":{"url":""}}`)
imagePayload, _ = sjson.SetBytes(imagePayload, "index", len(images))
imagePayload, _ = sjson.SetBytes(imagePayload, "image_url.url", imageURL)
images = append(images, imagePayload)
}
}
@@ -361,6 +454,15 @@ func ConvertCodexResponseToOpenAINonStream(_ context.Context, _ string, original
}
template, _ = sjson.SetBytes(template, "choices.0.message.role", "assistant")
}
// Add images if any
if len(images) > 0 {
template, _ = sjson.SetRawBytes(template, "choices.0.message.images", []byte(`[]`))
for _, image := range images {
template, _ = sjson.SetRawBytes(template, "choices.0.message.images.-1", image)
}
template, _ = sjson.SetBytes(template, "choices.0.message.role", "assistant")
}
}
// Extract and set the finish reason based on status
@@ -409,3 +511,24 @@ func buildReverseMapFromOriginalOpenAI(original []byte) map[string]string {
}
return rev
}
// mimeTypeFromCodexOutputFormat converts the output_format value of an image
// event into a MIME type string.
//
// A value containing "/" is treated as an already-formed MIME type and is
// returned unchanged. Otherwise the value is matched case-insensitively against
// the short names png, jpg/jpeg, webp and gif. Anything else, including the
// empty string, resolves to "image/png".
func mimeTypeFromCodexOutputFormat(outputFormat string) string {
	if strings.Contains(outputFormat, "/") {
		return outputFormat
	}

	// PNG doubles as the fallback for empty and unrecognized formats, so only
	// the non-PNG names need an explicit case.
	mimeType := "image/png"
	switch strings.ToLower(outputFormat) {
	case "jpg", "jpeg":
		mimeType = "image/jpeg"
	case "webp":
		mimeType = "image/webp"
	case "gif":
		mimeType = "image/gif"
	}
	return mimeType
}
@@ -90,3 +90,62 @@ func TestConvertCodexResponseToOpenAI_ToolCallArgumentsDeltaOmitsNullContentFiel
t.Fatalf("expected tool call arguments delta to exist, got %s", string(out[0]))
}
}
// TestConvertCodexResponseToOpenAI_StreamPartialImageEmitsDeltaImages feeds a
// partial_image stream event through the converter and expects exactly one
// chunk whose first delta image is a PNG data URL. It then replays the same
// event against the same param state and expects no chunks.
func TestConvertCodexResponseToOpenAI_StreamPartialImageEmitsDeltaImages(t *testing.T) {
	const wantURL = "data:image/png;base64,aGVsbG8="

	var state any
	ctx := context.Background()
	event := []byte(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`)

	first := ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, event, &state)
	if n := len(first); n != 1 {
		t.Fatalf("expected 1 chunk, got %d", n)
	}
	if url := gjson.GetBytes(first[0], "choices.0.delta.images.0.image_url.url").String(); url != wantURL {
		t.Fatalf("expected image url %q, got %q; chunk=%s", wantURL, url, string(first[0]))
	}

	// Identical payload for the same item id, sharing the state from the first call.
	repeat := ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, event, &state)
	if n := len(repeat); n != 0 {
		t.Fatalf("expected duplicate image chunk to be suppressed, got %d", n)
	}
}
// TestConvertCodexResponseToOpenAI_StreamImageGenerationCallDoneEmitsDeltaImages
// sends three events for the same item id through one shared param state:
// a partial image (expects one chunk), an output_item.done whose result equals
// that partial image (expects no chunks), and an output_item.done with a
// different JPEG result (expects one chunk carrying the JPEG data URL).
func TestConvertCodexResponseToOpenAI_StreamImageGenerationCallDoneEmitsDeltaImages(t *testing.T) {
	const wantFinalURL = "data:image/jpeg;base64,Ymll"

	var state any
	ctx := context.Background()
	// convert pushes one raw SSE line through the converter, reusing state.
	convert := func(line string) [][]byte {
		return ConvertCodexResponseToOpenAI(ctx, "gpt-5.4", nil, nil, []byte(line), &state)
	}

	partial := convert(`data: {"type":"response.image_generation_call.partial_image","item_id":"ig_123","output_format":"png","partial_image_b64":"aGVsbG8=","partial_image_index":0}`)
	if n := len(partial); n != 1 {
		t.Fatalf("expected 1 chunk, got %d", n)
	}

	sameAsPartial := convert(`data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"png","result":"aGVsbG8="}}`)
	if n := len(sameAsPartial); n != 0 {
		t.Fatalf("expected output_item.done to be suppressed when identical to last partial image, got %d", n)
	}

	final := convert(`data: {"type":"response.output_item.done","item":{"id":"ig_123","type":"image_generation_call","output_format":"jpeg","result":"Ymll"}}`)
	if n := len(final); n != 1 {
		t.Fatalf("expected 1 chunk, got %d", n)
	}
	if url := gjson.GetBytes(final[0], "choices.0.delta.images.0.image_url.url").String(); url != wantFinalURL {
		t.Fatalf("expected image url %q, got %q; chunk=%s", wantFinalURL, url, string(final[0]))
	}
}
// TestConvertCodexResponseToOpenAI_NonStreamImageGenerationCallAddsMessageImages
// converts a completed non-streaming response whose output holds a text message
// followed by an image_generation_call, and expects the first entry of
// choices.0.message.images to be the PNG data URL built from the call's result.
func TestConvertCodexResponseToOpenAI_NonStreamImageGenerationCallAddsMessageImages(t *testing.T) {
	const (
		wantURL  = "data:image/png;base64,aGVsbG8="
		response = `{"type":"response.completed","response":{"id":"resp_123","created_at":1700000000,"model":"gpt-5.4","status":"completed","usage":{"input_tokens":1,"output_tokens":1,"total_tokens":2},"output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]},{"type":"image_generation_call","output_format":"png","result":"aGVsbG8="}]}}`
	)

	converted := ConvertCodexResponseToOpenAINonStream(context.Background(), "gpt-5.4", nil, nil, []byte(response), nil)

	url := gjson.GetBytes(converted, "choices.0.message.images.0.image_url.url").String()
	if url != wantURL {
		t.Fatalf("expected image url %q, got %q; chunk=%s", wantURL, url, string(converted))
	}
}