ac70719d4d
Extends Open Design from web-only to a multi-modal creation tool. The unifying contract is one code-agent loop driven by skills + project metadata + prompt constraints; for non-web surfaces the agent shells out to a single dispatcher (`od media generate`) that the daemon routes per (surface, model). - Types: new Surface union, MediaAspect / AudioKind, image/video/audio ProjectKind + ProjectMetadata fields, video/audio ProjectFileKind. - NewProjectPanel: top-level surface picker + Image / Video / Audio forms with model, aspect, length, duration, voice, audio-kind pickers. - ExamplesTab + DesignSystemsTab: surface filter row that scopes before mode / scenario / category filters. - FileViewer / FileWorkspace: native <video> and <audio> previews and matching tab icons. - Daemon: parses `od.surface` and `> Surface:` blockquotes; recognises mp4 / webm / mov / mp3 / wav / ogg / m4a / flac extensions; spawns agents with OD_BIN / OD_DAEMON_URL / OD_PROJECT_ID / OD_PROJECT_DIR env so any code-agent CLI with shell access can call the dispatcher. - daemon/media.js + daemon/media-models.js: surface-agnostic dispatcher with stub providers that emit deterministic placeholder bytes (1x1 PNG, valid mp4 ftyp, mp3 frame / silent WAV) so the framework works without API keys; real provider integrations slot in later. - daemon/cli.js: `od media generate --surface ... --model ...` subcommand routes to POST /api/projects/:id/media/generate and prints one JSON line for the agent to parse. - prompts/media-contract.ts: hard contract pinned LAST in the system prompt for image/video/audio surfaces — env vars, exact invocation, registered model IDs per surface, six workflow rules. system.ts metadata block updated to point at the contract. - Seed skills: image-poster, video-shortform, audio-jingle each ship a SKILL.md with `mode/surface: image|video|audio` and a stylized example.html preview, and instruct the agent to dispatch via the contract. Made-with: Cursor
264 lines
9.2 KiB
JavaScript
264 lines
9.2 KiB
JavaScript
// Media-generation dispatcher. The unifying contract is:
//
//   skills + metadata + system-prompt
//       ↓ (the code agent decides what to make)
//   `od media generate --surface … --model … --output … --prompt …`
//       ↓ (this module routes to a provider)
//   bytes written to <projectsRoot>/<projectId>/<output>
//       ↓
//   FileViewer renders it.
//
// Every surface (image / video / audio) flows through this single
// entrypoint. Providers are pluggable: each file under ./media-providers/
// (or inline below) registers handlers keyed by (surface, model). The
// fallback handlers emit a deterministic, lightweight placeholder
// (1×1 PNG or labelled SVG, silent WAV or MP3 frame, empty MP4) so the
// framework works without API keys — real provider integrations slot in
// later by replacing the handler.

import { mkdir, stat, writeFile } from 'node:fs/promises';
import path from 'node:path';

import { findMediaModel } from './media-models.js';
import {
  ensureProject,
  kindFor,
  mimeFor,
  sanitizeName,
} from './projects.js';

// Default output filename per surface, used as the stem/extension template
// when the caller does not pass an explicit output name. Frozen because it
// is shared, read-only configuration.
const DEFAULT_OUTPUT_BY_SURFACE = Object.freeze({
  image: 'image.png',
  video: 'video.mp4',
  audio: 'audio.mp3',
});

// Surfaces accepted by generateMedia(). Derived from the table above so the
// two can never drift apart.
const SURFACES = new Set(Object.keys(DEFAULT_OUTPUT_BY_SURFACE));

/**
 * Generate a media artifact and write it into the project's files dir.
 *
 * Validates the request, resolves the output path inside the project
 * directory, dispatches to the renderer for the requested surface and
 * writes the returned bytes to disk.
 *
 * @param {Object} args
 * @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
 * @param {string} args.projectId
 * @param {'image'|'video'|'audio'} args.surface
 * @param {string} args.model - Must be a registered model id.
 * @param {string} [args.prompt]
 * @param {string} [args.output] - Optional filename; auto-named if missing.
 * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4
 * @param {number} [args.length] - Video length, seconds.
 * @param {number} [args.duration] - Audio duration, seconds.
 * @param {string} [args.voice]
 * @param {string} [args.audioKind] - music | speech | sfx
 * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
 * @throws {Error} When a required argument is missing, the surface or model
 *   is not recognised, or the output name resolves outside the project dir.
 */
export async function generateMedia(args = {}) {
  const {
    projectsRoot,
    projectId,
    surface,
    model,
    prompt,
    output,
    aspect,
    length,
    duration,
    voice,
    audioKind,
  } = args;

  if (!projectsRoot) throw new Error('projectsRoot required');
  if (typeof projectId !== 'string' || !projectId) {
    throw new Error('projectId required');
  }
  if (!SURFACES.has(surface)) {
    throw new Error(`unsupported surface: ${surface}`);
  }
  if (typeof model !== 'string' || !model) {
    throw new Error('model required');
  }
  // NOTE(review): only the model's existence is checked; whether it belongs
  // to `surface` is not verified here — confirm against media-models.js.
  if (!findMediaModel(model)) {
    throw new Error(
      `unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
    );
  }

  const dir = await ensureProject(projectsRoot, projectId);
  const safeOut = sanitizeName(
    output || autoOutputName(surface, model, audioKind),
  );
  const target = path.join(dir, safeOut);

  // Defence in depth: whatever sanitizeName returned, never write outside
  // the project directory (or onto the directory itself).
  const rel = path.relative(dir, target);
  const escapesProject =
    !rel || rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
  if (escapesProject) {
    throw new Error(`invalid output path: ${output || safeOut}`);
  }

  await mkdir(path.dirname(target), { recursive: true });

  const ctx = {
    surface,
    model,
    prompt: prompt || '',
    aspect: aspect || defaultAspectFor(surface),
    // Number.isFinite rejects non-numbers as well as NaN / ±Infinity.
    length: Number.isFinite(length) ? length : undefined,
    duration: Number.isFinite(duration) ? duration : undefined,
    voice: voice || '',
    audioKind: audioKind || (surface === 'audio' ? 'music' : undefined),
  };

  // `surface` was validated against SURFACES above, so the lookup always hits.
  const rendererBySurface = {
    image: renderImage,
    video: renderVideo,
    audio: renderAudio,
  };
  const { bytes, providerNote } = await rendererBySurface[surface](ctx, safeOut);

  await writeFile(target, bytes);
  const st = await stat(target);
  return {
    name: safeOut,
    size: st.size,
    mtime: st.mtimeMs,
    kind: kindFor(safeOut),
    mime: mimeFor(safeOut),
    model,
    surface,
    providerNote,
  };
}

/**
 * Build a default output filename of the form `<stem>-<tag>-<stamp><ext>`.
 *
 * The stem and extension come from DEFAULT_OUTPUT_BY_SURFACE (falling back
 * to `artifact.bin` for an unknown surface), the tag is the model id
 * (prefixed with the audio kind for audio), and the stamp is the current
 * time in base 36.
 *
 * @param {string} surface
 * @param {string} model
 * @param {string} [audioKind]
 * @returns {string}
 */
function autoOutputName(surface, model, audioKind) {
  // Own-property lookup so inherited keys such as "constructor" fall back
  // to the generic name instead of resolving to a non-string.
  const isKnownSurface = Object.prototype.hasOwnProperty.call(
    DEFAULT_OUTPUT_BY_SURFACE,
    surface,
  );
  const base = (isKnownSurface && DEFAULT_OUTPUT_BY_SURFACE[surface]) || 'artifact.bin';
  const { name: stem, ext } = path.parse(base);
  const stamp = Date.now().toString(36);
  const tag = surface === 'audio' && audioKind ? `${audioKind}-${model}` : model;
  return `${stem}-${tag}-${stamp}${ext}`;
}

/**
 * Default aspect ratio for a surface when the caller did not supply one.
 *
 * @param {string} surface
 * @returns {string|undefined} '1:1' for image, '16:9' for video, otherwise
 *   undefined (audio has no aspect ratio).
 */
function defaultAspectFor(surface) {
  switch (surface) {
    case 'image':
      return '1:1';
    case 'video':
      return '16:9';
    default:
      return undefined;
  }
}

// ---------------------------------------------------------------------------
// Provider stubs.
//
// Each renderer returns Buffer bytes that the caller writes to disk. They
// produce real, lightweight placeholder media, and the returned
// providerNote is labelled with the model (plus the prompt for image and
// video) so the user can verify which call was dispatched while the real
// provider integrations are still pending. To replace a stub with a real
// provider, swap the body — keep the
// (ctx, fileName) → { bytes, providerNote } shape so server.js doesn't
// change.

/**
 * Stub image provider.
 *
 * For a `.svg` output name it returns a labelled SVG document; for every
 * other extension it returns a fixed 1×1 transparent PNG. Note that names
 * such as `.jpg` therefore still receive PNG bytes.
 *
 * @param {{ model: string, aspect?: string, prompt: string }} ctx
 * @param {string} fileName - Output name; only its extension is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderImage(ctx, fileName) {
  const ext = path.extname(fileName).toLowerCase();
  if (ext === '.svg') {
    return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
  }

  // Minimal 1×1 transparent PNG. Real provider would emit a full image.
  const png = Buffer.from([
    // PNG signature
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
    // IHDR: 13 data bytes — 1×1, 8-bit, colour type 6 (RGBA)
    0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
    0x08, 0x06, 0x00, 0x00, 0x00,
    0x1f, 0x15, 0xc4, 0x89,
    // IDAT: the length field must equal the 10 bytes of zlib data that
    // follow; a larger value makes decoders read past the chunk.
    0x00, 0x00, 0x00, 0x0a, 0x49, 0x44, 0x41, 0x54,
    0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05, 0x00, 0x01,
    0x0d, 0x0a, 0x2d, 0xb4,
    // IEND
    0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44,
    0xae, 0x42, 0x60, 0x82,
  ]);
  return {
    bytes: png,
    providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
  };
}

/**
 * Stub video provider.
 *
 * Emits a tiny MP4-shaped file made of an `ftyp` box followed by an empty
 * `mdat` box. There is no `moov` box and no video track, so players will
 * show nothing to play — this only proves the dispatch round-trip; real
 * Seedance/Kling/Veo providers replace this body.
 *
 * @param {{ model: string, aspect?: string, length?: number, prompt: string }} ctx
 * @param {string} _fileName - Unused; kept so all renderers share one shape.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderVideo(ctx, _fileName) {
  const ftypBox = Buffer.from([
    0x00, 0x00, 0x00, 0x18, // box size: 24 bytes
    0x66, 0x74, 0x79, 0x70, // 'ftyp'
    0x69, 0x73, 0x6f, 0x6d, // major brand 'isom'
    0x00, 0x00, 0x02, 0x00, // minor version
    0x69, 0x73, 0x6f, 0x6d, // compatible brand 'isom'
    0x69, 0x73, 0x6f, 0x32, // compatible brand 'iso2'
  ]);
  const mdatBox = Buffer.from([
    0x00, 0x00, 0x00, 0x08, // box size: header only, no payload
    0x6d, 0x64, 0x61, 0x74, // 'mdat'
  ]);
  return {
    bytes: Buffer.concat([ftypBox, mdatBox]),
    providerNote: `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${ctx.length ?? '?'}s · prompt=${truncate(ctx.prompt, 60)}`,
  };
}

/**
 * Stub audio provider.
 *
 * For a `.wav` output name it returns a silent WAV that is always 0.5 s
 * long (ctx.duration is only echoed in the note, not honoured). For every
 * other extension it returns a single silent MP3 frame, so names such as
 * `.ogg` or `.flac` still receive MP3 bytes.
 *
 * @param {{ model: string, audioKind?: string, voice?: string, duration?: number }} ctx
 * @param {string} fileName - Output name; only its extension is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderAudio(ctx, fileName) {
  const ext = path.extname(fileName).toLowerCase();
  if (ext === '.wav') {
    return {
      bytes: silentWav(0.5),
      providerNote: `stub-wav · model=${ctx.model} · kind=${ctx.audioKind} · duration=${ctx.duration ?? '?'}s`,
    };
  }

  // One complete MPEG-1 Layer III frame. The header 0xFFFB9044 declares
  // 128 kbps at 44.1 kHz with no padding, so the frame must be
  // floor(144 * 128000 / 44100) = 417 bytes long; anything shorter is a
  // truncated frame. The zero-filled remainder carries no audio data.
  // Browsers may report 0:00; replace with real provider output.
  const MP3_FRAME_BYTES = 417;
  const mp3 = Buffer.alloc(MP3_FRAME_BYTES);
  mp3.set([0xff, 0xfb, 0x90, 0x44]);
  return {
    bytes: mp3,
    providerNote: `stub-mp3 · model=${ctx.model} · kind=${ctx.audioKind} · voice=${ctx.voice || '-'} · duration=${ctx.duration ?? '?'}s`,
  };
}

/**
 * Build a labelled SVG placeholder sized from the requested aspect ratio.
 *
 * The label is `<model> — <first 60 characters of the prompt>`. Both parts
 * are caller-supplied text, so they are XML-escaped before being placed in
 * the <text> element.
 *
 * @param {{ aspect?: string, model: string, prompt?: string }} ctx
 * @returns {string} SVG document source.
 */
function svgPlaceholder(ctx) {
  const [w, h] = aspectToBox(ctx.aspect, 800);

  const XML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;' };
  const escapeXml = (s) =>
    String(s || '').replace(/[&<>]/g, (ch) => XML_ENTITIES[ch]);

  // Truncate the raw prompt (by code point) BEFORE escaping so a cut can
  // never land in the middle of an entity such as "&amp;".
  const promptHead = Array.from(String(ctx.prompt || '')).slice(0, 60).join('');
  const label = `${escapeXml(ctx.model)} — ${escapeXml(promptHead)}`;

  return [
    `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
    `<rect width="${w}" height="${h}" fill="#0f1424"/>`,
    `<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${label}</text>`,
    '</svg>',
  ].join('');
}

/**
 * Convert an aspect-ratio string such as "16:9" into pixel dimensions whose
 * longer side equals `base`.
 *
 * Falls back to a `base`×`base` square when the ratio is missing or either
 * part is not a positive finite number. The shorter side is clamped to at
 * least 1 so extreme ratios never yield a zero-sized box.
 *
 * @param {string} [aspect] - "W:H"; defaults to "1:1".
 * @param {number} base - Length of the longer side.
 * @returns {[number, number]} [width, height]
 */
function aspectToBox(aspect, base) {
  const [a, b] = String(aspect || '1:1').split(':').map(Number);
  const isUsable = (n) => Number.isFinite(n) && n > 0;
  if (!isUsable(a) || !isUsable(b)) return [base, base];
  if (a >= b) return [base, Math.max(1, Math.round((base * b) / a))];
  return [Math.max(1, Math.round((base * a) / b)), base];
}

/**
 * Build a silent mono 16-bit PCM WAV file.
 *
 * The buffer is zero-filled, so every sample is silence. At least one
 * sample is always written; a non-finite or non-positive duration is
 * treated as zero rather than propagating NaN into the allocation size.
 *
 * @param {number} seconds - Requested duration.
 * @param {number} [sampleRate=8000] - Samples per second.
 * @returns {Buffer} 44-byte RIFF/WAVE header followed by the sample data.
 */
function silentWav(seconds, sampleRate = 8000) {
  const HEADER_BYTES = 44;
  const CHANNELS = 1;
  const BITS_PER_SAMPLE = 16;
  const blockAlign = (CHANNELS * BITS_PER_SAMPLE) / 8;

  const safeSeconds = Number.isFinite(seconds) && seconds > 0 ? seconds : 0;
  const numSamples = Math.max(1, Math.round(sampleRate * safeSeconds));
  const dataSize = numSamples * blockAlign;

  const buf = Buffer.alloc(HEADER_BYTES + dataSize);
  buf.write('RIFF', 0, 'ascii');
  buf.writeUInt32LE(36 + dataSize, 4); // RIFF chunk size: file size minus 8
  buf.write('WAVE', 8, 'ascii');
  buf.write('fmt ', 12, 'ascii');
  buf.writeUInt32LE(16, 16); // fmt chunk size
  buf.writeUInt16LE(1, 20); // PCM
  buf.writeUInt16LE(CHANNELS, 22); // mono
  buf.writeUInt32LE(sampleRate, 24);
  buf.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
  buf.writeUInt16LE(blockAlign, 32);
  buf.writeUInt16LE(BITS_PER_SAMPLE, 34);
  buf.write('data', 36, 'ascii');
  buf.writeUInt32LE(dataSize, 40);
  return buf;
}

/**
 * Shorten a string to at most `n` characters, ending with an ellipsis when
 * anything was cut.
 *
 * Length is counted in code points so a surrogate pair (e.g. an emoji) is
 * never split in half. A limit of zero or less yields an empty string.
 *
 * @param {*} s - Value to shorten; falsy values are treated as ''.
 * @param {number} n - Maximum length, including the ellipsis.
 * @returns {string}
 */
function truncate(s, n) {
  const text = String(s || '');
  const chars = Array.from(text);
  if (chars.length <= n) return text;
  if (n <= 0) return '';
  return `${chars.slice(0, n - 1).join('')}…`;
}