Files
open-design/daemon/media-models.js
pftom 976a6eadf2 feat(media): add image/video/audio project kinds via od media generate
Introduce non-web media surfaces (image, video, audio) as first-class
project kinds. The unifying contract is "skill workflow + project
metadata tell the agent WHAT to make; one shell command — od media
generate — is HOW bytes are produced", so any code-agent CLI with
shell access can drive it without bespoke tools.

- Frontend: New Project panel gains Image/Video/Audio tabs with model
  picker, aspect/length/duration controls, and audio kind/voice
  selection. Examples and Design Systems tabs gain layered sections.
  FileViewer renders the generated image/video/audio files.
- Shared registry: src/media/models.ts is the single source of truth
  for image/video/audio model IDs, aspects, and defaults — consumed
  by the picker AND the daemon dispatcher.
- Prompts: media-contract.ts is pinned LAST in the system prompt for
  media surfaces so its hard rules (call od media generate, don't
  emit binary in <artifact>, allowed model IDs) win over softer
  earlier wording.
- Daemon: new media.js dispatcher + media-models.js JSON view of the
  registry; cli.js gets the `od media generate` subcommand wired up
  via server.js / projects.js so the daemon writes files back into
  the project dir.
- Skills: audio-jingle, image-poster, video-shortform seed examples
  for the three surfaces.

Made-with: Cursor
2026-04-28 22:41:14 +08:00

63 lines
2.7 KiB
JavaScript

// Daemon-side mirror of src/media/models.ts. We keep this in plain JS so
// node imports are native and the daemon never needs a TS toolchain at
// runtime. The two files are kept in sync by review — any model added to
// src/media/models.ts must be added here too. Tests in verify ensure the
// arrays are non-empty and IDs are unique.
export const IMAGE_MODELS = [
{ id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
{ id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
{ id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
{ id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
];
export const VIDEO_MODELS = [
{ id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
{ id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
{ id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
{ id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
{ id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
];
export const AUDIO_MODELS_BY_KIND = {
music: [
{ id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
{ id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
{ id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
],
speech: [
{ id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
{ id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
{ id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
],
sfx: [
{ id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
{ id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
],
};
export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];
export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];
export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];
export function findMediaModel(id) {
const all = [
...IMAGE_MODELS,
...VIDEO_MODELS,
...AUDIO_MODELS_BY_KIND.music,
...AUDIO_MODELS_BY_KIND.speech,
...AUDIO_MODELS_BY_KIND.sfx,
];
return all.find((m) => m.id === id) || null;
}
export function modelsForSurface(surface, audioKind) {
if (surface === 'image') return IMAGE_MODELS;
if (surface === 'video') return VIDEO_MODELS;
if (surface === 'audio') {
const k = audioKind || 'music';
return AUDIO_MODELS_BY_KIND[k] || AUDIO_MODELS_BY_KIND.music;
}
return [];
}