Files
open-design/daemon/media.js
T
pftom 976a6eadf2 feat(media): add image/video/audio project kinds via od media generate
Introduce non-web media surfaces (image, video, audio) as first-class
project kinds. The unifying contract is "skill workflow + project
metadata tell the agent WHAT to make; one shell command — od media
generate — is HOW bytes are produced", so any code-agent CLI with
shell access can drive it without bespoke tools.

- Frontend: New Project panel gains Image/Video/Audio tabs with model
  picker, aspect/length/duration controls, and audio kind/voice
  selection. Examples and Design Systems tabs gain layered sections.
  FileViewer renders the generated image/video/audio files.
- Shared registry: src/media/models.ts is the single source of truth
  for image/video/audio model IDs, aspects, and defaults — consumed
  by the picker AND the daemon dispatcher.
- Prompts: media-contract.ts is pinned LAST in the system prompt for
  media surfaces so its hard rules (call od media generate, don't
  emit binary in <artifact>, allowed model IDs) win over softer
  earlier wording.
- Daemon: new media.js dispatcher + media-models.js JSON view of the
  registry; cli.js gets the `od media generate` subcommand wired up
  via server.js / projects.js so the daemon writes files back into
  the project dir.
- Skills: audio-jingle, image-poster, video-shortform seed examples
  for the three surfaces.

Made-with: Cursor
2026-04-28 22:41:14 +08:00

264 lines
9.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Media-generation dispatcher. The unifying contract is:
//
// skills + metadata + system-prompt
// ↓ (the code agent decides what to make)
// `od media generate --surface … --model … --output … --prompt …`
// ↓ (this module routes to a provider)
// bytes written to <projectsRoot>/<projectId>/<output>
// ↓
// FileViewer renders it.
//
// Every surface (image / video / audio) flows through this single
// entrypoint. Providers are pluggable: each file under ./media-providers/
// (or inline below) registers handlers keyed by (surface, model). The
// fallback handlers emit a deterministic, lightweight placeholder
// (labeled SVG-PNG, silent WAV/MP3, blank MP4) so the framework works
// without API keys — real provider integrations slot in later by
// replacing the handler.
import { mkdir, stat, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { findMediaModel } from './media-models.js';
import {
ensureProject,
kindFor,
mimeFor,
sanitizeName,
} from './projects.js';
const DEFAULT_OUTPUT_BY_SURFACE = {
image: 'image.png',
video: 'video.mp4',
audio: 'audio.mp3',
};
const SURFACES = new Set(['image', 'video', 'audio']);
/**
* Generate a media artifact and write it into the project's files dir.
*
* @param {Object} args
* @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
* @param {string} args.projectId
* @param {'image'|'video'|'audio'} args.surface
* @param {string} args.model - Must be a registered model id.
* @param {string} [args.prompt]
* @param {string} [args.output] - Optional filename; auto-named if missing.
* @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4
* @param {number} [args.length] - Video length, seconds.
* @param {number} [args.duration] - Audio duration, seconds.
* @param {string} [args.voice]
* @param {string} [args.audioKind] - music | speech | sfx
* @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
*/
export async function generateMedia(args) {
const {
projectsRoot,
projectId,
surface,
model,
prompt,
output,
aspect,
length,
duration,
voice,
audioKind,
} = args;
if (!projectsRoot) throw new Error('projectsRoot required');
if (typeof projectId !== 'string' || !projectId) {
throw new Error('projectId required');
}
if (!SURFACES.has(surface)) {
throw new Error(`unsupported surface: ${surface}`);
}
if (typeof model !== 'string' || !model) {
throw new Error('model required');
}
const def = findMediaModel(model);
if (!def) {
throw new Error(
`unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
);
}
const dir = await ensureProject(projectsRoot, projectId);
const safeOut = sanitizeName(
output || autoOutputName(surface, model, audioKind),
);
const target = path.join(dir, safeOut);
await mkdir(path.dirname(target), { recursive: true });
const ctx = {
surface,
model,
prompt: prompt || '',
aspect: aspect || defaultAspectFor(surface),
length: typeof length === 'number' ? length : undefined,
duration: typeof duration === 'number' ? duration : undefined,
voice: voice || '',
audioKind: audioKind || (surface === 'audio' ? 'music' : undefined),
};
let bytes;
let providerNote;
if (surface === 'image') {
({ bytes, providerNote } = await renderImage(ctx, safeOut));
} else if (surface === 'video') {
({ bytes, providerNote } = await renderVideo(ctx, safeOut));
} else {
({ bytes, providerNote } = await renderAudio(ctx, safeOut));
}
await writeFile(target, bytes);
const st = await stat(target);
return {
name: safeOut,
size: st.size,
mtime: st.mtimeMs,
kind: kindFor(safeOut),
mime: mimeFor(safeOut),
model,
surface,
providerNote,
};
}
function autoOutputName(surface, model, audioKind) {
const base = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin';
const stamp = Date.now().toString(36);
const tag = surface === 'audio' && audioKind ? `${audioKind}-${model}` : model;
const dot = base.lastIndexOf('.');
const stem = dot > 0 ? base.slice(0, dot) : base;
const ext = dot > 0 ? base.slice(dot) : '';
return `${stem}-${tag}-${stamp}${ext}`;
}
function defaultAspectFor(surface) {
if (surface === 'image') return '1:1';
if (surface === 'video') return '16:9';
return undefined;
}
// ---------------------------------------------------------------------------
// Provider stubs.
//
// Each renderer returns Buffer bytes that the caller writes to disk. They
// produce real, lightweight placeholder media labelled with the model +
// prompt so the user can verify which call was dispatched while the real
// provider integrations are still pending. To replace a stub with a real
// provider, swap the body — keep the (ctx, fileName) → { bytes, note }
// shape so server.js doesn't change.
async function renderImage(ctx, fileName) {
// SVG-as-image: write SVG bytes into a .png filename only when ext is
// svg; otherwise emit a tiny PNG that browsers can decode. We pick
// PNG-as-bytes by encoding the SVG inside a minimal PNG container —
// simpler: just write SVG XML into a .png, browsers can't render that.
// So instead: for png/jpg, emit a deterministic 1×1 PNG; for svg, emit
// a labelled SVG.
const ext = path.extname(fileName).toLowerCase();
if (ext === '.svg') {
return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
}
// Minimal 1×1 transparent PNG. Real provider would emit a full image.
const png = Buffer.from(
[
0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d,
0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
0x08, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x15, 0xc4, 0x89, 0x00, 0x00, 0x00,
0x0d, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00,
0x05, 0x00, 0x01, 0x0d, 0x0a, 0x2d, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x49,
0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82,
],
);
return {
bytes: png,
providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
};
}
async function renderVideo(ctx, _fileName) {
// Tiny but valid mp4 (ftyp + minimal moov). Browsers without a video
// track will show 0 seconds, which is fine — this proves the dispatch
// round-trip; real Seedance/Kling/Veo providers replace this body.
const ftyp = Buffer.from([
0x00, 0x00, 0x00, 0x18, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6f, 0x6d,
0x00, 0x00, 0x02, 0x00, 0x69, 0x73, 0x6f, 0x6d, 0x69, 0x73, 0x6f, 0x32,
]);
const mdat = Buffer.from([0x00, 0x00, 0x00, 0x08, 0x6d, 0x64, 0x61, 0x74]);
return {
bytes: Buffer.concat([ftyp, mdat]),
providerNote: `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${ctx.length ?? '?'}s · prompt=${truncate(ctx.prompt, 60)}`,
};
}
async function renderAudio(ctx, fileName) {
const ext = path.extname(fileName).toLowerCase();
if (ext === '.wav') {
return {
bytes: silentWav(0.5),
providerNote: `stub-wav · model=${ctx.model} · kind=${ctx.audioKind} · duration=${ctx.duration ?? '?'}s`,
};
}
// Default: emit a near-empty mp3 frame header so the file is valid but
// tiny. Browsers may report 0:00; replace with real provider output.
const mp3 = Buffer.from([
0xff, 0xfb, 0x90, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
]);
return {
bytes: mp3,
providerNote: `stub-mp3 · model=${ctx.model} · kind=${ctx.audioKind} · voice=${ctx.voice || '-'} · duration=${ctx.duration ?? '?'}s`,
};
}
function svgPlaceholder(ctx) {
const [w, h] = aspectToBox(ctx.aspect, 800);
const safe = (s) =>
String(s || '')
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;');
return [
`<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
`<rect width="${w}" height="${h}" fill="#0f1424"/>`,
`<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${safe(ctx.model)}${safe(ctx.prompt).slice(0, 60)}</text>`,
'</svg>',
].join('');
}
function aspectToBox(aspect, base) {
const [a, b] = String(aspect || '1:1').split(':').map(Number);
if (!a || !b) return [base, base];
if (a >= b) return [base, Math.round((base * b) / a)];
return [Math.round((base * a) / b), base];
}
function silentWav(seconds) {
const sampleRate = 8000;
const numSamples = Math.max(1, Math.round(sampleRate * seconds));
const dataSize = numSamples * 2;
const buf = Buffer.alloc(44 + dataSize);
buf.write('RIFF', 0, 'ascii');
buf.writeUInt32LE(36 + dataSize, 4);
buf.write('WAVE', 8, 'ascii');
buf.write('fmt ', 12, 'ascii');
buf.writeUInt32LE(16, 16);
buf.writeUInt16LE(1, 20); // PCM
buf.writeUInt16LE(1, 22); // mono
buf.writeUInt32LE(sampleRate, 24);
buf.writeUInt32LE(sampleRate * 2, 28);
buf.writeUInt16LE(2, 32);
buf.writeUInt16LE(16, 34);
buf.write('data', 36, 'ascii');
buf.writeUInt32LE(dataSize, 40);
return buf;
}
function truncate(s, n) {
const v = String(s || '');
if (v.length <= n) return v;
return v.slice(0, n - 1) + '…';
}