Files
open-design/daemon/media.js
T
pftom ac70719d4d feat(media): add image / video / audio surfaces with unified od media generate dispatcher
Extends Open Design from web-only to a multi-modal creation tool. The
unifying contract is one code-agent loop driven by skills + project
metadata + prompt constraints; for non-web surfaces the agent shells
out to a single dispatcher (`od media generate`) that the daemon
routes per (surface, model).

- Types: new Surface union, MediaAspect / AudioKind, image/video/audio
  ProjectKind + ProjectMetadata fields, video/audio ProjectFileKind.
- NewProjectPanel: top-level surface picker + Image / Video / Audio
  forms with model, aspect, length, duration, voice, audio-kind pickers.
- ExamplesTab + DesignSystemsTab: surface filter row that scopes
  before mode / scenario / category filters.
- FileViewer / FileWorkspace: native <video> and <audio> previews and
  matching tab icons.
- Daemon: parses `od.surface` and `> Surface:` blockquotes; recognises
  mp4 / webm / mov / mp3 / wav / ogg / m4a / flac extensions; spawns
  agents with OD_BIN / OD_DAEMON_URL / OD_PROJECT_ID / OD_PROJECT_DIR
  env so any code-agent CLI with shell access can call the dispatcher.
- daemon/media.js + daemon/media-models.js: surface-agnostic dispatcher
  with stub providers that emit deterministic placeholder bytes
  (1x1 PNG, valid mp4 ftyp, mp3 frame / silent WAV) so the framework
  works without API keys; real provider integrations slot in later.
- daemon/cli.js: `od media generate --surface ... --model ...`
  subcommand routes to POST /api/projects/:id/media/generate and
  prints one JSON line for the agent to parse.
- prompts/media-contract.ts: hard contract pinned LAST in the system
  prompt for image/video/audio surfaces — env vars, exact invocation,
  registered model IDs per surface, six workflow rules. system.ts
  metadata block updated to point at the contract.
- Seed skills: image-poster, video-shortform, audio-jingle each ship a
  SKILL.md with `mode/surface: image|video|audio` and a stylized
  example.html preview, and instruct the agent to dispatch via the
  contract.

Made-with: Cursor
2026-04-28 22:40:58 +08:00

264 lines
9.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Media-generation dispatcher. The unifying contract is:
//
// skills + metadata + system-prompt
// ↓ (the code agent decides what to make)
// `od media generate --surface … --model … --output … --prompt …`
// ↓ (this module routes to a provider)
// bytes written to <projectsRoot>/<projectId>/<output>
// ↓
// FileViewer renders it.
//
// Every surface (image / video / audio) flows through this single
// entrypoint. Providers are pluggable: each file under ./media-providers/
// (or inline below) registers handlers keyed by (surface, model). The
// fallback handlers emit a deterministic, lightweight placeholder
// (1×1 PNG or labelled SVG, silent WAV / stub MP3 frame, empty MP4) so the framework works
// without API keys — real provider integrations slot in later by
// replacing the handler.
import { mkdir, stat, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { findMediaModel } from './media-models.js';
import {
ensureProject,
kindFor,
mimeFor,
sanitizeName,
} from './projects.js';
// Fallback file name per surface; autoOutputName() splits these into a stem
// and an extension when the caller does not pass an explicit output name.
const DEFAULT_OUTPUT_BY_SURFACE = {
  image: 'image.png',
  video: 'video.mp4',
  audio: 'audio.mp3',
};

// Surfaces the dispatcher accepts — exactly the keys of the table above.
const SURFACES = new Set(Object.keys(DEFAULT_OUTPUT_BY_SURFACE));
/**
 * Dispatch one media-generation request and persist the result in the
 * project's files dir.
 *
 * The request is validated, an output file name is resolved (caller-supplied
 * or auto-generated, then passed through `sanitizeName`), the renderer for
 * the surface produces the bytes, and the bytes are written to disk. The
 * returned metadata is read back from the written file via `stat`.
 *
 * @param {Object} args
 * @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
 * @param {string} args.projectId
 * @param {'image'|'video'|'audio'} args.surface
 * @param {string} args.model - Must be a registered model id.
 * @param {string} [args.prompt]
 * @param {string} [args.output] - Optional filename; auto-named if missing.
 * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4
 * @param {number} [args.length] - Video length, seconds. Non-number values are dropped.
 * @param {number} [args.duration] - Audio duration, seconds. Non-number values are dropped.
 * @param {string} [args.voice]
 * @param {string} [args.audioKind] - music | speech | sfx
 * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
 * @throws {Error} If `projectsRoot`, `projectId` or `model` is missing, the
 *   surface is not supported, or `findMediaModel` does not know the model id.
 */
export async function generateMedia(args) {
  const {
    projectsRoot,
    projectId,
    surface,
    model,
    prompt,
    output,
    aspect,
    length,
    duration,
    voice,
    audioKind,
  } = args;

  // Validation runs in a fixed order so callers always see the first problem.
  if (!projectsRoot) {
    throw new Error('projectsRoot required');
  }
  if (typeof projectId !== 'string' || projectId.length === 0) {
    throw new Error('projectId required');
  }
  if (!SURFACES.has(surface)) {
    throw new Error(`unsupported surface: ${surface}`);
  }
  if (typeof model !== 'string' || model.length === 0) {
    throw new Error('model required');
  }
  const modelDef = findMediaModel(model);
  if (!modelDef) {
    throw new Error(
      `unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
    );
  }

  // Resolve where the artifact goes; the name may contain sub-directories,
  // so the parent directory is created before rendering.
  const projectDir = await ensureProject(projectsRoot, projectId);
  const requestedName = output || autoOutputName(surface, model, audioKind);
  const fileName = sanitizeName(requestedName);
  const filePath = path.join(projectDir, fileName);
  await mkdir(path.dirname(filePath), { recursive: true });

  // Normalised request handed to the renderer.
  const fallbackAudioKind = surface === 'audio' ? 'music' : undefined;
  const ctx = {
    surface,
    model,
    prompt: prompt || '',
    aspect: aspect || defaultAspectFor(surface),
    length: typeof length === 'number' ? length : undefined,
    duration: typeof duration === 'number' ? duration : undefined,
    voice: voice || '',
    audioKind: audioKind || fallbackAudioKind,
  };

  // Anything that is not image or video is rendered as audio.
  let render;
  switch (surface) {
    case 'image':
      render = renderImage;
      break;
    case 'video':
      render = renderVideo;
      break;
    default:
      render = renderAudio;
  }
  const { bytes, providerNote } = await render(ctx, fileName);

  await writeFile(filePath, bytes);
  const info = await stat(filePath);

  return {
    name: fileName,
    size: info.size,
    mtime: info.mtimeMs,
    kind: kindFor(fileName),
    mime: mimeFor(fileName),
    model,
    surface,
    providerNote,
  };
}
/**
 * Build a file name for a request that did not specify `output`.
 *
 * The result is `<stem>-<tag>-<stamp><ext>`: stem and extension come from the
 * surface's default file name (`artifact.bin` for an unknown surface), the
 * tag is the model id (prefixed with the audio kind for audio requests), and
 * the stamp is the current time in base 36.
 *
 * @param {string} surface
 * @param {string} model
 * @param {string} [audioKind] - Only used when `surface` is 'audio'.
 * @returns {string}
 */
function autoOutputName(surface, model, audioKind) {
  const fallback = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin';

  // A dot at index 0 is not treated as an extension separator.
  const splitAt = fallback.lastIndexOf('.');
  const hasExt = splitAt > 0;
  const stem = hasExt ? fallback.slice(0, splitAt) : fallback;
  const ext = hasExt ? fallback.slice(splitAt) : '';

  const isTaggedAudio = surface === 'audio' && audioKind;
  const tag = isTaggedAudio ? `${audioKind}-${model}` : model;
  const stamp = Date.now().toString(36);

  return `${stem}-${tag}-${stamp}${ext}`;
}
/**
 * Aspect ratio used when the request does not carry one.
 *
 * @param {string} surface
 * @returns {string|undefined} '1:1' for images, '16:9' for video, otherwise undefined.
 */
function defaultAspectFor(surface) {
  switch (surface) {
    case 'image':
      return '1:1';
    case 'video':
      return '16:9';
    default:
      return undefined;
  }
}
// ---------------------------------------------------------------------------
// Provider stubs.
//
// Each renderer returns Buffer bytes that the caller writes to disk. They
// produce small, deterministic placeholder media and describe the call
// (model, prompt, …) in `providerNote`, so the user can verify which call
// was dispatched while the real provider integrations are still pending.
// To replace a stub with a real provider, swap the body — keep the
// (ctx, fileName) → { bytes, providerNote } shape so server.js doesn't change.
/**
 * Stub image provider.
 *
 * For a `.svg` file name it returns a labelled SVG; for every other extension
 * it returns a fixed 1×1 transparent PNG (SVG XML inside a `.png` would not
 * decode, so raster names always get real PNG bytes).
 *
 * @param {Object} ctx - Normalised request; `model`, `aspect` and `prompt` are read.
 * @param {string} fileName - Output file name; only its extension is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderImage(ctx, fileName) {
  const ext = path.extname(fileName).toLowerCase();
  if (ext === '.svg') {
    return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
  }

  // Minimal 1×1 transparent PNG. Real provider would emit a full image.
  // Each chunk is <length:4><type:4><data:length><crc:4>; the declared length
  // must equal the number of data bytes or decoders reject the file.
  const png = Buffer.from([
    // Signature.
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
    // IHDR (13 data bytes): 1×1, 8-bit, colour type 6 (RGBA).
    0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
    0x08, 0x06, 0x00, 0x00, 0x00,
    0x1f, 0x15, 0xc4, 0x89,
    // IDAT (10 data bytes): zlib stream for one filter byte + one RGBA pixel, all zero.
    0x00, 0x00, 0x00, 0x0a, 0x49, 0x44, 0x41, 0x54,
    0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05, 0x00, 0x01,
    0x0d, 0x0a, 0x2d, 0xb4,
    // IEND (no data).
    0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44,
    0xae, 0x42, 0x60, 0x82,
  ]);

  return {
    bytes: png,
    providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
  };
}
/**
 * Stub video provider.
 *
 * Returns a fixed 32-byte ISO-BMFF fragment — an `ftyp` box followed by an
 * empty `mdat` box — regardless of the requested file name. There is no video
 * track, so a player will presumably show 0 seconds; the point is to prove
 * the dispatch round-trip until a real provider replaces this body.
 *
 * @param {Object} ctx - Normalised request; `model`, `aspect`, `length` and `prompt` are read.
 * @param {string} _fileName - Unused.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderVideo(ctx, _fileName) {
  // size=0x18, 'ftyp', major brand 'isom', minor version 0x200, compatible brands 'isom' + 'iso2'.
  const ftypBox = Buffer.from('000000186674797069736f6d0000020069736f6d69736f32', 'hex');
  // size=8, 'mdat' — header only, no payload.
  const mdatBox = Buffer.from('000000086d646174', 'hex');

  const lengthLabel = ctx.length ?? '?';
  const promptLabel = truncate(ctx.prompt, 60);
  const providerNote = `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${lengthLabel}s · prompt=${promptLabel}`;

  return { bytes: Buffer.concat([ftypBox, mdatBox]), providerNote };
}
/**
 * Stub audio provider.
 *
 * A `.wav` file name gets half a second of silence from `silentWav`; every
 * other extension gets a 12-byte buffer that starts with an MP3 frame header.
 * The requested duration is only echoed in `providerNote`, it does not change
 * the bytes.
 *
 * @param {Object} ctx - Normalised request; `model`, `audioKind`, `voice` and `duration` are read.
 * @param {string} fileName - Output file name; only its extension is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderAudio(ctx, fileName) {
  const { model, audioKind, voice, duration } = ctx;
  const durationLabel = duration ?? '?';
  const wantsWav = path.extname(fileName).toLowerCase() === '.wav';

  if (wantsWav) {
    const providerNote = `stub-wav · model=${model} · kind=${audioKind} · duration=${durationLabel}s`;
    return { bytes: silentWav(0.5), providerNote };
  }

  // Frame header bytes followed by zero padding; players may report 0:00.
  // Replace with real provider output.
  const mp3Frame = Buffer.from('fffb90440000000000000000', 'hex');
  const providerNote = `stub-mp3 · model=${model} · kind=${audioKind} · voice=${voice || '-'} · duration=${durationLabel}s`;
  return { bytes: mp3Frame, providerNote };
}
/**
 * Build the labelled SVG placeholder used for `.svg` image outputs.
 *
 * The canvas is sized from `ctx.aspect` (longest side 800) and shows the
 * model id followed by the first 60 characters of the prompt.
 *
 * @param {Object} ctx - Normalised request; `aspect`, `model` and `prompt` are read.
 * @returns {string} SVG markup.
 */
function svgPlaceholder(ctx) {
  const [w, h] = aspectToBox(ctx.aspect, 800);

  // Escape the characters that are significant inside XML text content.
  const escapeXml = (s) =>
    String(s || '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');

  // Truncate the raw prompt BEFORE escaping: cutting the escaped string can
  // split an entity (e.g. "&amp;" → "&a") and make the whole SVG unparseable.
  const promptPreview = String(ctx.prompt || '').slice(0, 60);

  return [
    `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
    `<rect width="${w}" height="${h}" fill="#0f1424"/>`,
    `<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${escapeXml(ctx.model)}${escapeXml(promptPreview)}</text>`,
    '</svg>',
  ].join('');
}
/**
 * Convert an aspect string such as "16:9" into pixel dimensions whose longer
 * side equals `base`.
 *
 * Falls back to a `base`×`base` square when the aspect is missing or when
 * either component is not a finite, positive number (e.g. "wide", "0:5",
 * "-16:9", "Infinity:1"), so callers never receive negative dimensions.
 *
 * @param {string} [aspect] - "<width>:<height>"; defaults to "1:1".
 * @param {number} base - Length of the longer side.
 * @returns {[number, number]} `[width, height]`, rounded to integers.
 */
function aspectToBox(aspect, base) {
  const [a, b] = String(aspect || '1:1').split(':').map(Number);
  const isUsable = (n) => Number.isFinite(n) && n > 0;
  if (!isUsable(a) || !isUsable(b)) return [base, base];
  if (a >= b) return [base, Math.round((base * b) / a)];
  return [Math.round((base * a) / b), base];
}
/**
 * Build an in-memory WAV file containing silence.
 *
 * The audio is 8 kHz, mono, 16-bit PCM with at least one sample. The buffer is
 * zero-filled on allocation, so only the 44-byte header needs to be written.
 *
 * @param {number} seconds - Requested length of the clip.
 * @returns {Buffer} Header plus zeroed sample data.
 */
function silentWav(seconds) {
  const rate = 8000;
  const frames = Math.max(1, Math.round(rate * seconds));
  const payloadBytes = frames * 2;
  const wav = Buffer.alloc(44 + payloadBytes);

  // Write the header front to back, advancing a cursor after each field.
  let at = 0;
  at += wav.write('RIFF', at, 'ascii');
  at = wav.writeUInt32LE(36 + payloadBytes, at); // RIFF chunk size
  at += wav.write('WAVE', at, 'ascii');
  at += wav.write('fmt ', at, 'ascii');
  at = wav.writeUInt32LE(16, at); // fmt chunk size
  at = wav.writeUInt16LE(1, at); // PCM
  at = wav.writeUInt16LE(1, at); // mono
  at = wav.writeUInt32LE(rate, at); // sample rate
  at = wav.writeUInt32LE(rate * 2, at); // byte rate
  at = wav.writeUInt16LE(2, at); // block align
  at = wav.writeUInt16LE(16, at); // bits per sample
  at += wav.write('data', at, 'ascii');
  wav.writeUInt32LE(payloadBytes, at); // data chunk size

  return wav;
}
/**
 * Shorten a value to at most `n` characters for display.
 *
 * Falsy input becomes the empty string. Text longer than `n` is cut to
 * `n - 1` characters and an ellipsis is appended.
 *
 * @param {*} s - Value to display; stringified with `String`.
 * @param {number} n - Maximum length of the result.
 * @returns {string}
 */
function truncate(s, n) {
  const text = String(s || '');
  return text.length <= n ? text : `${text.slice(0, n - 1)}…`;
}