976a6eadf2
Introduce non-web media surfaces (image, video, audio) as first-class project kinds. The unifying contract is "skill workflow + project metadata tell the agent WHAT to make; one shell command — od media generate — is HOW bytes are produced", so any code-agent CLI with shell access can drive it without bespoke tools. - Frontend: New Project panel gains Image/Video/Audio tabs with model picker, aspect/length/duration controls, and audio kind/voice selection. Examples and Design Systems tabs gain layered sections. FileViewer renders the generated image/video/audio files. - Shared registry: src/media/models.ts is the single source of truth for image/video/audio model IDs, aspects, and defaults — consumed by the picker AND the daemon dispatcher. - Prompts: media-contract.ts is pinned LAST in the system prompt for media surfaces so its hard rules (call od media generate, don't emit binary in <artifact>, allowed model IDs) win over softer earlier wording. - Daemon: new media.js dispatcher + media-models.js JSON view of the registry; cli.js gets the `od media generate` subcommand wired up via server.js / projects.js so the daemon writes files back into the project dir. - Skills: audio-jingle, image-poster, video-shortform seed examples for the three surfaces. Made-with: Cursor
63 lines
2.7 KiB
JavaScript
63 lines
2.7 KiB
JavaScript
// Daemon-side mirror of src/media/models.ts. We keep this in plain JS so
// node imports are native and the daemon never needs a TS toolchain at
// runtime. The two files are kept in sync by review — any model added to
// src/media/models.ts must be added here too. Tests in verify ensure the
// arrays are non-empty and IDs are unique.

/**
 * Image model entries, in declaration order.
 *
 * Each entry carries:
 *   - `id`    — the model identifier matched by lookups on this registry,
 *   - `label` — display text (currently identical to `id`),
 *   - `hint`  — short vendor note; the first entry's hint marks it "default",
 *   - `caps`  — capability tags (presumably t2i = text-to-image,
 *               i2i = image-to-image — confirm against src/media/models.ts).
 */
export const IMAGE_MODELS = [
  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
];

/**
 * Video model entries, in declaration order.
 *
 * Same entry shape as the other model lists in this file: `id`, `label`,
 * `hint`, and `caps`. The first entry's hint marks it "default".
 * Capability tags are presumably t2v = text-to-video and
 * i2v = image-to-video — confirm against src/media/models.ts.
 */
export const VIDEO_MODELS = [
  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
];

/**
 * Audio model entries grouped by audio kind.
 *
 * Keys are the audio kinds (`music`, `speech`, `sfx`); each value is a list
 * of entries with the same shape as the image/video lists (`id`, `label`,
 * `hint`, `caps`). Within `music` and `speech` the first entry's hint marks
 * it "default"; `sfx` has no entry marked that way.
 */
export const AUDIO_MODELS_BY_KIND = {
  music: [
    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
  ],
  speech: [
    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
  ],
  sfx: [
    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
  ],
};

/** Aspect-ratio options, written as `width:height` strings. */
export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];

/** Video length options, in seconds (per the `_SEC` suffix), ascending. */
export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];

/** Audio duration options, in seconds (per the `_SEC` suffix), ascending. */
export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];

/**
 * Look up a model entry by ID across the image, video, and audio lists.
 *
 * Search order is: IMAGE_MODELS, then VIDEO_MODELS, then every list in
 * AUDIO_MODELS_BY_KIND in that object's key order. The first entry whose
 * `id` strictly equals the argument wins.
 *
 * @param {string} id - Model ID to search for.
 * @returns {object|null} The matching entry, or `null` when none matches.
 */
export function findMediaModel(id) {
  // Collect the audio lists from the registry itself instead of naming each
  // kind, so a kind added to AUDIO_MODELS_BY_KIND is searchable without
  // editing this function.
  const audioModels = Object.values(AUDIO_MODELS_BY_KIND).flat();
  const all = [...IMAGE_MODELS, ...VIDEO_MODELS, ...audioModels];
  return all.find((m) => m.id === id) || null;
}

/**
 * Return the model list for a project surface.
 *
 * @param {string} surface - `'image'`, `'video'`, or `'audio'`; any other
 *   value yields an empty array.
 * @param {string} [audioKind] - Only read when `surface` is `'audio'`.
 *   A falsy or unrecognised kind falls back to the `music` list.
 * @returns {Array<object>} The matching model list (the registry array
 *   itself, not a copy), or a fresh empty array for an unknown surface.
 */
export function modelsForSurface(surface, audioKind) {
  if (surface === 'image') return IMAGE_MODELS;
  if (surface === 'video') return VIDEO_MODELS;
  if (surface !== 'audio') return [];

  const kind = audioKind || 'music';
  // Only accept keys declared on the registry object itself. A plain
  // `AUDIO_MODELS_BY_KIND[kind]` lookup also resolves inherited members
  // such as 'constructor' or '__proto__', which are truthy non-arrays and
  // would bypass the music fallback.
  const isDeclaredKind = Object.prototype.hasOwnProperty.call(AUDIO_MODELS_BY_KIND, kind);
  return (isDeclaredKind && AUDIO_MODELS_BY_KIND[kind]) || AUDIO_MODELS_BY_KIND.music;
}