feat(media): add image/video/audio project kinds via od media generate

Introduce non-web media surfaces (image, video, audio) as first-class
project kinds. The unifying contract is "skill workflow + project
metadata tell the agent WHAT to make; one shell command — od media
generate — is HOW bytes are produced", so any code-agent CLI with
shell access can drive it without bespoke tools.

- Frontend: New Project panel gains Image/Video/Audio tabs with model
  picker, aspect/length/duration controls, and audio kind/voice
  selection. Examples and Design Systems tabs gain layered sections.
  FileViewer renders the generated image/video/audio files.
- Shared registry: src/media/models.ts is the single source of truth
  for image/video/audio model IDs, aspects, and defaults — consumed
  by the picker AND the daemon dispatcher.
- Prompts: media-contract.ts is pinned LAST in the system prompt for
  media surfaces so its hard rules (call od media generate, don't
  emit binary in <artifact>, allowed model IDs) win over softer
  earlier wording.
- Daemon: new media.js dispatcher + media-models.js JSON view of the
  registry; cli.js gets the `od media generate` subcommand wired up
  via server.js / projects.js so the daemon writes files back into
  the project dir.
- Skills: audio-jingle, image-poster, video-shortform seed examples
  for the three surfaces.

Made-with: Cursor
This commit is contained in:
pftom
2026-04-28 22:41:14 +08:00
parent 0b61be5d96
commit 976a6eadf2
28 changed files with 2902 additions and 78 deletions
+263
View File
@@ -0,0 +1,263 @@
// Media-generation dispatcher. The unifying contract is:
//
// skills + metadata + system-prompt
// ↓ (the code agent decides what to make)
// `od media generate --surface … --model … --output … --prompt …`
// ↓ (this module routes to a provider)
// bytes written to <projectsRoot>/<projectId>/<output>
// ↓
// FileViewer renders it.
//
// Every surface (image / video / audio) flows through this single
// entrypoint. Providers are pluggable: each file under ./media-providers/
// (or inline below) registers handlers keyed by (surface, model). The
// fallback handlers emit a deterministic, lightweight placeholder
// (1×1 PNG or labelled SVG, silent WAV or stub MP3 frame, stub MP4) so
// the framework works without API keys — real provider integrations
// slot in later by replacing the handler.
import { mkdir, stat, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { findMediaModel } from './media-models.js';
import {
ensureProject,
kindFor,
mimeFor,
sanitizeName,
} from './projects.js';
// Base filename used for each surface when the caller does not pass an
// explicit output name; autoOutputName() decorates it with model + stamp.
const DEFAULT_OUTPUT_BY_SURFACE = {
  image: 'image.png',
  video: 'video.mp4',
  audio: 'audio.mp3',
};

// Surfaces accepted by generateMedia(). Derived from the table above so the
// two can never drift apart: image, video, audio.
const SURFACES = new Set(Object.keys(DEFAULT_OUTPUT_BY_SURFACE));
/**
 * Render one media artifact for a project, write it to disk, and describe
 * the resulting file.
 *
 * Arguments are validated in a fixed order (projectsRoot, projectId,
 * surface, model, registry lookup) before any filesystem work happens.
 * The output name — either the caller's or an auto-generated one — is
 * passed through sanitizeName() and joined onto the directory returned by
 * ensureProject(); missing parent directories are created.
 *
 * @param {Object} args
 * @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
 * @param {string} args.projectId
 * @param {'image'|'video'|'audio'} args.surface
 * @param {string} args.model - Must be a registered model id.
 * @param {string} [args.prompt] - Defaults to an empty string.
 * @param {string} [args.output] - Optional filename; auto-named if missing.
 * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4; falls back to the surface default.
 * @param {number} [args.length] - Video length, seconds. Ignored unless it is a number.
 * @param {number} [args.duration] - Audio duration, seconds. Ignored unless it is a number.
 * @param {string} [args.voice]
 * @param {string} [args.audioKind] - music | speech | sfx; audio defaults to "music".
 * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
 * @throws {Error} When a required argument is missing, the surface is not
 *   supported, or the model id is not in the registry.
 */
export async function generateMedia(args) {
  const {
    projectsRoot,
    projectId,
    surface,
    model,
    prompt,
    output,
    aspect,
    length,
    duration,
    voice,
    audioKind,
  } = args;

  // --- Argument validation (order matters: first failure wins). ---
  if (!projectsRoot) {
    throw new Error('projectsRoot required');
  }
  const hasProjectId = typeof projectId === 'string' && projectId !== '';
  if (!hasProjectId) {
    throw new Error('projectId required');
  }
  if (!SURFACES.has(surface)) {
    throw new Error(`unsupported surface: ${surface}`);
  }
  const hasModel = typeof model === 'string' && model !== '';
  if (!hasModel) {
    throw new Error('model required');
  }
  if (!findMediaModel(model)) {
    throw new Error(
      `unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
    );
  }

  // --- Resolve where the bytes will land. ---
  const projectDir = await ensureProject(projectsRoot, projectId);
  const requestedName = output || autoOutputName(surface, model, audioKind);
  const fileName = sanitizeName(requestedName);
  const filePath = path.join(projectDir, fileName);
  await mkdir(path.dirname(filePath), { recursive: true });

  // --- Normalised request handed to the renderer. ---
  const numberOrUndefined = (value) => (typeof value === 'number' ? value : undefined);
  const request = {
    surface,
    model,
    prompt: prompt || '',
    aspect: aspect || defaultAspectFor(surface),
    length: numberOrUndefined(length),
    duration: numberOrUndefined(duration),
    voice: voice || '',
    audioKind: audioKind || (surface === 'audio' ? 'music' : undefined),
  };

  // Surface was validated above, so anything that is not image/video is audio.
  let render = renderAudio;
  if (surface === 'image') {
    render = renderImage;
  } else if (surface === 'video') {
    render = renderVideo;
  }
  const { bytes, providerNote } = await render(request, fileName);

  await writeFile(filePath, bytes);
  const { size, mtimeMs } = await stat(filePath);

  return {
    name: fileName,
    size,
    mtime: mtimeMs,
    kind: kindFor(fileName),
    mime: mimeFor(fileName),
    model,
    surface,
    providerNote,
  };
}
/**
 * Build a default output filename of the form `<stem>-<tag>-<stamp><ext>`.
 *
 * The stem and extension come from the per-surface default name (or
 * "artifact.bin" for an unknown surface). The tag is the model id, prefixed
 * with the audio kind for audio surfaces. The stamp is the current time in
 * base 36.
 *
 * @param {string} surface
 * @param {string} model
 * @param {string} [audioKind]
 * @returns {string}
 */
function autoOutputName(surface, model, audioKind) {
  const template = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin';

  // A dot at index 0 (or no dot at all) means "no extension".
  const extAt = template.lastIndexOf('.');
  const hasExt = extAt > 0;
  const stem = hasExt ? template.slice(0, extAt) : template;
  const ext = hasExt ? template.slice(extAt) : '';

  let tag = model;
  if (surface === 'audio' && audioKind) {
    tag = `${audioKind}-${model}`;
  }

  const stamp = Date.now().toString(36);
  return `${stem}-${tag}-${stamp}${ext}`;
}
/**
 * Aspect ratio applied when the caller does not supply one.
 *
 * @param {string} surface
 * @returns {string|undefined} "1:1" for image, "16:9" for video, otherwise
 *   undefined.
 */
function defaultAspectFor(surface) {
  switch (surface) {
    case 'image':
      return '1:1';
    case 'video':
      return '16:9';
    default:
      return undefined;
  }
}
// ---------------------------------------------------------------------------
// Provider stubs.
//
// Each renderer returns lightweight placeholder bytes that the caller
// writes to disk, plus a providerNote string describing the request
// (model, prompt, …) so the user can verify which call was dispatched
// while the real provider integrations are still pending. To replace a
// stub with a real provider, swap the body — keep the
// (ctx, fileName) → { bytes, providerNote } shape so server.js doesn't change.
/**
 * Placeholder image renderer.
 *
 * - `.svg` output: a labelled SVG document (see svgPlaceholder).
 * - any other extension: a fixed 1×1 transparent RGBA PNG. An SVG document
 *   written under a raster filename would not decode in browsers, so raster
 *   names get real PNG bytes instead.
 *
 * A real provider replaces this body and keeps the result shape.
 *
 * @param {{ model: string, aspect?: string, prompt: string }} ctx
 * @param {string} fileName - Output filename; only its extension is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderImage(ctx, fileName) {
  const ext = path.extname(fileName).toLowerCase();
  if (ext === '.svg') {
    return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
  }

  // Minimal 1×1 transparent PNG, laid out chunk by chunk. Each chunk is
  // <length:4> <type:4> <data:length> <crc:4>; the length field must equal
  // the number of data bytes or decoders read past the chunk boundary.
  const png = Buffer.from([
    // Signature
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
    // IHDR: length 13, 1×1, 8-bit RGBA
    0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x06, 0x00, 0x00, 0x00,
    0x1f, 0x15, 0xc4, 0x89,
    // IDAT: length 10 (zlib stream inflating to 1 filter byte + 4 zero bytes)
    0x00, 0x00, 0x00, 0x0a, 0x49, 0x44, 0x41, 0x54,
    0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05, 0x00, 0x01,
    0x0d, 0x0a, 0x2d, 0xb4,
    // IEND: length 0
    0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44,
    0xae, 0x42, 0x60, 0x82,
  ]);

  return {
    bytes: png,
    providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
  };
}
/**
 * Placeholder video renderer.
 *
 * Emits a 32-byte ISO-BMFF stub: an `ftyp` box (major brand "isom", minor
 * version 0x200, compatible brands "isom" + "iso2") followed by an empty
 * `mdat` box. There is no track data, so players will report zero length —
 * the file exists only to prove the dispatch round-trip. Real
 * Seedance/Kling/Veo providers replace this body.
 *
 * @param {{ model: string, aspect?: string, length?: number, prompt: string }} ctx
 * @param {string} _fileName - Unused by the stub.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderVideo(ctx, _fileName) {
  // A box is <size:4 big-endian> <fourcc:4> <payload>, size counting the header.
  const makeBox = (fourcc, payload) => {
    const header = Buffer.alloc(8);
    header.writeUInt32BE(header.length + payload.length, 0);
    header.write(fourcc, 4, 'ascii');
    return Buffer.concat([header, payload]);
  };

  const ftypPayload = Buffer.concat([
    Buffer.from('isom', 'ascii'), // major brand
    Buffer.from([0x00, 0x00, 0x02, 0x00]), // minor version
    Buffer.from('isomiso2', 'ascii'), // compatible brands
  ]);
  const stub = Buffer.concat([
    makeBox('ftyp', ftypPayload),
    makeBox('mdat', Buffer.alloc(0)),
  ]);

  const lengthLabel = ctx.length ?? '?';
  const promptLabel = truncate(ctx.prompt, 60);
  return {
    bytes: stub,
    providerNote: `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${lengthLabel}s · prompt=${promptLabel}`,
  };
}
/**
 * Placeholder audio renderer.
 *
 * - `.wav` output: half a second of silence from silentWav().
 * - anything else: a 12-byte stub — an MPEG audio frame header
 *   (ff fb 90 44) followed by eight zero bytes. Players may report 0:00;
 *   a real provider replaces this with actual audio.
 *
 * The requested duration is only echoed in the note; the stubs do not
 * honour it.
 *
 * @param {{ model: string, audioKind?: string, voice?: string, duration?: number }} ctx
 * @param {string} fileName - Output filename; only its extension is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderAudio(ctx, fileName) {
  const { model, audioKind, voice, duration } = ctx;
  const durationLabel = `${duration ?? '?'}s`;
  const wantsWav = path.extname(fileName).toLowerCase() === '.wav';

  if (!wantsWav) {
    const frameHeader = Buffer.from([0xff, 0xfb, 0x90, 0x44]);
    const padding = Buffer.alloc(8);
    return {
      bytes: Buffer.concat([frameHeader, padding]),
      providerNote: `stub-mp3 · model=${model} · kind=${audioKind} · voice=${voice || '-'} · duration=${durationLabel}`,
    };
  }

  return {
    bytes: silentWav(0.5),
    providerNote: `stub-wav · model=${model} · kind=${audioKind} · duration=${durationLabel}`,
  };
}
/**
 * Build a labelled placeholder SVG sized from the requested aspect ratio
 * (longest side 800).
 *
 * The label is the model id followed by up to 60 characters of the prompt,
 * separated by " · ". The prompt is truncated BEFORE XML escaping so an
 * entity such as `&amp;` can never be cut in half (which would make the
 * document malformed), and by code point so a surrogate pair is never split.
 *
 * @param {{ model: string, prompt?: string, aspect?: string }} ctx
 * @returns {string} SVG markup.
 */
function svgPlaceholder(ctx) {
  const [w, h] = aspectToBox(ctx.aspect, 800);

  const escapeXml = (s) =>
    String(s || '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');

  const promptSnippet = Array.from(String(ctx.prompt || '')).slice(0, 60).join('');
  // Skip empty parts so a missing prompt does not leave a dangling separator.
  const label = [ctx.model, promptSnippet]
    .filter(Boolean)
    .map(escapeXml)
    .join(' · ');

  return [
    `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
    `<rect width="${w}" height="${h}" fill="#0f1424"/>`,
    `<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${label}</text>`,
    '</svg>',
  ].join('');
}
/**
 * Convert an aspect string such as "16:9" into a [width, height] box whose
 * longer side equals `base`.
 *
 * Falls back to a square [base, base] when the aspect is missing or either
 * part is not a finite positive number (this also rejects negative and
 * infinite parts, which would otherwise yield negative or zero sizes). The
 * scaled side is clamped to at least 1 so extreme ratios never collapse to a
 * zero-sized box.
 *
 * @param {string} [aspect] - "<w>:<h>"; defaults to "1:1".
 * @param {number} base - Length of the longer side.
 * @returns {[number, number]} [width, height]
 */
function aspectToBox(aspect, base) {
  const [a, b] = String(aspect || '1:1').split(':').map(Number);
  const isUsable = (part) => Number.isFinite(part) && part > 0;
  if (!isUsable(a) || !isUsable(b)) return [base, base];

  const scaled = (num, den) => Math.max(1, Math.round((base * num) / den));
  return a >= b ? [base, scaled(b, a)] : [scaled(a, b), base];
}
/**
 * Build a silent WAV file: mono, 16-bit PCM, 8 kHz.
 *
 * The buffer is a 44-byte RIFF/WAVE header followed by zero-filled sample
 * data. At least one sample is always emitted.
 *
 * @param {number} seconds - Requested length of silence.
 * @returns {Buffer}
 */
function silentWav(seconds) {
  const sampleRate = 8000;
  const bytesPerSample = 2;
  const headerSize = 44;

  const sampleCount = Math.max(1, Math.round(sampleRate * seconds));
  const payloadSize = sampleCount * bytesPerSample;
  // Buffer.alloc zero-fills, so the sample data is already silence.
  const wav = Buffer.alloc(headerSize + payloadSize);

  // Sequential little-endian writers that advance a shared cursor.
  let cursor = 0;
  const tag = (fourcc) => {
    wav.write(fourcc, cursor, 'ascii');
    cursor += 4;
  };
  const u32 = (value) => {
    wav.writeUInt32LE(value, cursor);
    cursor += 4;
  };
  const u16 = (value) => {
    wav.writeUInt16LE(value, cursor);
    cursor += 2;
  };

  // RIFF container
  tag('RIFF');
  u32(36 + payloadSize);
  tag('WAVE');

  // Format chunk
  tag('fmt ');
  u32(16); // chunk size
  u16(1); // PCM
  u16(1); // mono
  u32(sampleRate);
  u32(sampleRate * bytesPerSample); // byte rate
  u16(bytesPerSample); // block align
  u16(16); // bits per sample

  // Data chunk
  tag('data');
  u32(payloadSize);

  return wav;
}
/**
 * Shorten a value to at most `n` UTF-16 code units, ending with "…" when
 * anything was cut.
 *
 * Falsy input is treated as an empty string. A non-positive (or NaN) `n`
 * yields an empty string; without that guard the slice end would go
 * negative and return almost the whole input. If the cut would fall inside
 * a surrogate pair, the orphaned high surrogate is dropped as well.
 *
 * @param {*} s - Value to shorten; coerced with String().
 * @param {number} n - Maximum length including the ellipsis.
 * @returns {string}
 */
function truncate(s, n) {
  const v = String(s || '');
  if (v.length <= n) return v;
  if (!(n > 0)) return '';

  let head = v.slice(0, n - 1);
  const lastUnit = head.charCodeAt(head.length - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  if (endsOnHighSurrogate) head = head.slice(0, -1);

  return `${head}…`;
}