Files
open-design/daemon/media-models.js
T
pftom ac70719d4d feat(media): add image / video / audio surfaces with unified od media generate dispatcher
Extends Open Design from web-only to a multi-modal creation tool. The
unifying contract is one code-agent loop driven by skills + project
metadata + prompt constraints; for non-web surfaces the agent shells
out to a single dispatcher (`od media generate`) that the daemon
routes per (surface, model).

- Types: new Surface union, MediaAspect / AudioKind, image/video/audio
  ProjectKind + ProjectMetadata fields, video/audio ProjectFileKind.
- NewProjectPanel: top-level surface picker + Image / Video / Audio
  forms with model, aspect, length, duration, voice, audio-kind pickers.
- ExamplesTab + DesignSystemsTab: surface filter row that scopes
  before mode / scenario / category filters.
- FileViewer / FileWorkspace: native <video> and <audio> previews and
  matching tab icons.
- Daemon: parses `od.surface` and `> Surface:` blockquotes; recognises
  mp4 / webm / mov / mp3 / wav / ogg / m4a / flac extensions; spawns
  agents with OD_BIN / OD_DAEMON_URL / OD_PROJECT_ID / OD_PROJECT_DIR
  env so any code-agent CLI with shell access can call the dispatcher.
- daemon/media.js + daemon/media-models.js: surface-agnostic dispatcher
  with stub providers that emit deterministic placeholder bytes
  (1x1 PNG, valid mp4 ftyp, mp3 frame / silent WAV) so the framework
  works without API keys; real provider integrations slot in later.
- daemon/cli.js: `od media generate --surface ... --model ...`
  subcommand routes to POST /api/projects/:id/media/generate and
  prints one JSON line for the agent to parse.
- prompts/media-contract.ts: hard contract pinned LAST in the system
  prompt for image/video/audio surfaces — env vars, exact invocation,
  registered model IDs per surface, six workflow rules. system.ts
  metadata block updated to point at the contract.
- Seed skills: image-poster, video-shortform, audio-jingle each ship a
  SKILL.md with `mode/surface: image|video|audio` and a stylized
  example.html preview, and instruct the agent to dispatch via the
  contract.

Made-with: Cursor
2026-04-28 22:40:58 +08:00

63 lines
2.7 KiB
JavaScript

// Daemon-side mirror of src/media/models.ts. We keep this in plain JS so
// node imports are native and the daemon never needs a TS toolchain at
// runtime. The two files are kept in sync by review — any model added to
// src/media/models.ts must be added here too. Tests in verify ensure the
// arrays are non-empty and IDs are unique.
export const IMAGE_MODELS = [
{ id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
{ id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
{ id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
{ id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
];
export const VIDEO_MODELS = [
{ id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
{ id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
{ id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
{ id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
{ id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
];
export const AUDIO_MODELS_BY_KIND = {
music: [
{ id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
{ id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
{ id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
],
speech: [
{ id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
{ id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
{ id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
],
sfx: [
{ id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
{ id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
],
};
export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];
export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];
export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];
export function findMediaModel(id) {
const all = [
...IMAGE_MODELS,
...VIDEO_MODELS,
...AUDIO_MODELS_BY_KIND.music,
...AUDIO_MODELS_BY_KIND.speech,
...AUDIO_MODELS_BY_KIND.sfx,
];
return all.find((m) => m.id === id) || null;
}
export function modelsForSurface(surface, audioKind) {
if (surface === 'image') return IMAGE_MODELS;
if (surface === 'video') return VIDEO_MODELS;
if (surface === 'audio') {
const k = audioKind || 'music';
return AUDIO_MODELS_BY_KIND[k] || AUDIO_MODELS_BY_KIND.music;
}
return [];
}