feat(media): add image / video / audio surfaces with unified od media generate dispatcher
Extends Open Design from web-only to a multi-modal creation tool. The unifying contract is one code-agent loop driven by skills + project metadata + prompt constraints; for non-web surfaces the agent shells out to a single dispatcher (`od media generate`) that the daemon routes per (surface, model). - Types: new Surface union, MediaAspect / AudioKind, image/video/audio ProjectKind + ProjectMetadata fields, video/audio ProjectFileKind. - NewProjectPanel: top-level surface picker + Image / Video / Audio forms with model, aspect, length, duration, voice, audio-kind pickers. - ExamplesTab + DesignSystemsTab: surface filter row that scopes before mode / scenario / category filters. - FileViewer / FileWorkspace: native <video> and <audio> previews and matching tab icons. - Daemon: parses `od.surface` and `> Surface:` blockquotes; recognises mp4 / webm / mov / mp3 / wav / ogg / m4a / flac extensions; spawns agents with OD_BIN / OD_DAEMON_URL / OD_PROJECT_ID / OD_PROJECT_DIR env so any code-agent CLI with shell access can call the dispatcher. - daemon/media.js + daemon/media-models.js: surface-agnostic dispatcher with stub providers that emit deterministic placeholder bytes (1x1 PNG, valid mp4 ftyp, mp3 frame / silent WAV) so the framework works without API keys; real provider integrations slot in later. - daemon/cli.js: `od media generate --surface ... --model ...` subcommand routes to POST /api/projects/:id/media/generate and prints one JSON line for the agent to parse. - prompts/media-contract.ts: hard contract pinned LAST in the system prompt for image/video/audio surfaces — env vars, exact invocation, registered model IDs per surface, six workflow rules. system.ts metadata block updated to point at the contract. - Seed skills: image-poster, video-shortform, audio-jingle each ship a SKILL.md with `mode/surface: image|video|audio` and a stylized example.html preview, and instruct the agent to dispatch via the contract. Made-with: Cursor
This commit is contained in:
+162
-11
@@ -1,24 +1,44 @@
|
||||
#!/usr/bin/env node
|
||||
import { startServer } from './server.js';
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
const argv = process.argv.slice(2);
|
||||
|
||||
// ---- Subcommand router ----------------------------------------------------
|
||||
//
|
||||
// `od` is two CLIs glued together:
|
||||
// - default mode: starts the daemon + opens the web UI.
|
||||
// - `od media …`: a thin client that POSTs to the running daemon. This
|
||||
// is what the code agent invokes from inside a chat to actually
|
||||
// produce image / video / audio bytes (the unifying contract).
|
||||
//
|
||||
// We dispatch on the first positional argument so flags like --port keep
|
||||
// working unchanged. Subcommand routing is keyword-based; flags are
|
||||
// parsed inside each handler.
|
||||
|
||||
// Registry of first-positional keywords that switch `od` out of daemon mode.
// Each handler receives argv with the keyword itself removed.
const SUBCOMMAND_MAP = {
  media: runMedia,
};

// The first argument that does not start with '-' is the subcommand candidate.
const first = argv.find((a) => !a.startsWith('-'));

// Own-property check on purpose: a bare `SUBCOMMAND_MAP[first]` lookup also
// matches members inherited from Object.prototype, so `od constructor` or
// `od toString` would be "dispatched" and exit 0 without doing anything.
if (first && Object.hasOwn(SUBCOMMAND_MAP, first)) {
  const idx = argv.indexOf(first);
  // Drop only the keyword; flags on either side of it are forwarded.
  const rest = [...argv.slice(0, idx), ...argv.slice(idx + 1)];
  await SUBCOMMAND_MAP[first](rest);
  // Subcommands are one-shot clients: never fall through to daemon start-up.
  process.exit(0);
}
|
||||
|
||||
// Default: daemon mode.
|
||||
let port = Number(process.env.OD_PORT) || 7456;
|
||||
let open = true;
|
||||
|
||||
for (let i = 0; i < args.length; i++) {
|
||||
const a = args[i];
|
||||
for (let i = 0; i < argv.length; i++) {
|
||||
const a = argv[i];
|
||||
if (a === '-p' || a === '--port') {
|
||||
port = Number(args[++i]);
|
||||
port = Number(argv[++i]);
|
||||
} else if (a === '--no-open') {
|
||||
open = false;
|
||||
} else if (a === '-h' || a === '--help') {
|
||||
console.log(`Usage: od [--port <n>] [--no-open]
|
||||
|
||||
Starts a local daemon that:
|
||||
* scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
|
||||
* serves a tiny web chat UI at http://localhost:<port>
|
||||
* proxies messages (text + images) to the selected agent via child-process spawn
|
||||
`);
|
||||
printRootHelp();
|
||||
process.exit(0);
|
||||
}
|
||||
}
|
||||
@@ -34,3 +54,134 @@ startServer({ port }).then(url => {
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Print the top-level `od` usage text to stdout via `console.log`.
 *
 * The text describes both invocation forms (daemon start-up and
 * `od media generate`) followed by a summary of what the daemon does.
 * It takes no arguments and returns nothing.
 */
function printRootHelp() {
  console.log(`Usage:
od [--port <n>] [--no-open]
Start the local daemon and open the web UI.

od media generate --surface <image|video|audio> --model <id> [opts]
Generate a media artifact and write it into the active project.
Designed to be invoked by a code agent — picks up OD_DAEMON_URL
and OD_PROJECT_ID from the env that the daemon injected on spawn.

What the daemon does:
* scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
* serves the chat UI at http://localhost:<port>
* proxies messages (text + images) to the selected agent via child-process spawn
* exposes /api/projects/:id/media/generate — the unified image/video/audio
dispatcher that the agent calls via \`od media generate\`.`);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Subcommand: od media …
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * `od media …` subcommand: a thin HTTP client for the daemon's media
 * dispatcher (POST /api/projects/:id/media/generate).
 *
 * On success the daemon's response body is written to stdout as a single
 * line. All diagnostics go to stderr.
 *
 * Exit codes (via `process.exit`):
 *   1 — unknown `od media` subcommand
 *   2 — missing or invalid flag (project, surface, model, length, duration)
 *   3 — daemon unreachable / response could not be read
 *   4 — daemon answered with a non-2xx status
 *
 * @param {string[]} args - argv with the leading `media` keyword removed.
 * @returns {Promise<void>}
 */
async function runMedia(args) {
  const sub = args.find((a) => !a.startsWith('-')) || '';
  // `sub` can never start with '-', so help flags must be detected on the
  // raw argument list; otherwise `od media generate --help` would try to
  // run a generation instead of printing usage.
  const wantsHelp = args.includes('-h') || args.includes('--help');
  if (sub === '' || sub === 'help' || wantsHelp) {
    printMediaHelp();
    return;
  }
  if (sub !== 'generate') {
    console.error(`unknown subcommand: od media ${sub}`);
    printMediaHelp();
    process.exit(1);
  }

  const idx = args.indexOf(sub);
  const flags = parseFlags([...args.slice(0, idx), ...args.slice(idx + 1)]);

  // A value flag passed without a value parses as boolean `true`. Treat any
  // non-string (and the empty string) as "not provided" so we never call
  // string methods on a boolean or send `true` as a model / prompt.
  const text = (v) => (typeof v === 'string' && v !== '' ? v : undefined);

  const daemonUrl =
    text(flags['daemon-url']) || process.env.OD_DAEMON_URL || 'http://127.0.0.1:7456';
  const projectId = text(flags.project) || process.env.OD_PROJECT_ID;
  if (!projectId) {
    console.error(
      'project id required. Pass --project <id> or set OD_PROJECT_ID. The daemon injects this when it spawns the code agent.',
    );
    process.exit(2);
  }

  const surface = text(flags.surface);
  if (!surface || !['image', 'video', 'audio'].includes(surface)) {
    console.error('--surface must be one of: image | video | audio');
    process.exit(2);
  }
  const model = text(flags.model);
  if (!model) {
    console.error('--model required (see http://<daemon>/api/media/models)');
    process.exit(2);
  }

  const body = {
    surface,
    model,
    prompt: text(flags.prompt),
    output: text(flags.output),
    aspect: text(flags.aspect),
    voice: text(flags.voice),
    audioKind: text(flags['audio-kind']),
  };

  // Reject unusable numbers here: NaN serialises to JSON `null`, which the
  // daemon would silently ignore, hiding the caller's mistake.
  for (const key of ['length', 'duration']) {
    if (flags[key] == null) continue;
    const seconds = Number(flags[key]);
    if (flags[key] === true || !Number.isFinite(seconds) || seconds <= 0) {
      console.error(`--${key} must be a positive number of seconds`);
      process.exit(2);
    }
    body[key] = seconds;
  }

  const url = `${daemonUrl.replace(/\/$/, '')}/api/projects/${encodeURIComponent(projectId)}/media/generate`;
  let resp;
  let payload;
  try {
    resp = await fetch(url, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(body),
    });
    // Reading the body can fail too (connection dropped mid-response).
    payload = await resp.text();
  } catch (err) {
    console.error(`failed to reach daemon at ${daemonUrl}: ${err.message}`);
    process.exit(3);
  }
  if (!resp.ok) {
    console.error(`daemon ${resp.status}: ${payload}`);
    process.exit(4);
  }
  // Print the JSON response as one line so the agent can parse it.
  process.stdout.write(payload.trim() + '\n');
}
|
||||
|
||||
/**
 * Minimal long-flag parser for subcommand handlers.
 *
 * Recognised forms:
 *   --key value   → { key: 'value' }
 *   --key=value   → { key: 'value' }  (only the first '=' splits)
 *   --key         → { key: true }     (no value, or next token is a flag)
 *
 * Tokens that do not start with `--` (positionals, short flags) are
 * ignored. A value that itself starts with `--` can only be passed with
 * the `--key=value` form.
 *
 * @param {string[]} argv - Raw argument tokens.
 * @returns {Object<string, string|true>} Map of flag name to its value.
 */
function parseFlags(argv) {
  const out = {};
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (!a || !a.startsWith('--')) continue;

    // Inline form. `eq > 2` guarantees a non-empty key before the '='.
    const eq = a.indexOf('=');
    if (eq > 2) {
      out[a.slice(2, eq)] = a.slice(eq + 1);
      continue;
    }

    const key = a.slice(2);
    const next = argv[i + 1];
    if (next != null && !next.startsWith('--')) {
      out[key] = next;
      i++; // the value token is consumed
    } else {
      out[key] = true;
    }
  }
  return out;
}
|
||||
|
||||
/**
 * Print the usage text for `od media generate` to stdout via `console.log`.
 *
 * The text lists the required flags, the common options, and the shape of
 * the single-line JSON the command emits. It takes no arguments and
 * returns nothing.
 */
function printMediaHelp() {
  console.log(`Usage: od media generate --surface <image|video|audio> --model <id> [opts]

Required:
--surface image | video | audio
--model Model id from /api/media/models (e.g. gpt-image-2, seedance-2, suno-v5).
--project Project id. Auto-resolved from OD_PROJECT_ID when invoked by the daemon.

Common options:
--prompt "<text>" Generation prompt.
--output <filename> File to write under the project. Auto-named if omitted.
--aspect 1:1|16:9|9:16|4:3|3:4
--length <seconds> Video length.
--duration <seconds> Audio duration.
--voice <voice-id> Speech / TTS voice.
--audio-kind music|speech|sfx
--daemon-url http://127.0.0.1:7456

Output: a single line of JSON: {"file": { name, size, kind, mime, ... }}.

Skills should call this and then reference the returned filename in their
artifact / message body. The daemon writes the bytes into the project's
files folder so the FileViewer can preview them immediately.`);
}
|
||||
|
||||
@@ -29,6 +29,11 @@ export async function listDesignSystems(root) {
|
||||
category: extractCategory(raw) ?? 'Uncategorized',
|
||||
summary: summarize(raw),
|
||||
swatches: extractSwatches(raw),
|
||||
// Optional `> Surface: image|video|audio` blockquote line. Most
|
||||
// existing systems target the web surface and don't declare it;
|
||||
// we default to 'web' so the right-side filter classifies them
|
||||
// correctly.
|
||||
surface: extractSurface(raw),
|
||||
body: raw,
|
||||
});
|
||||
} catch {
|
||||
@@ -67,6 +72,14 @@ function extractCategory(raw) {
|
||||
return m?.[1];
|
||||
}
|
||||
|
||||
// Surfaces a design system may declare; any other value is treated as 'web'.
const KNOWN_SURFACES = new Set(['web', 'image', 'video', 'audio']);

/**
 * Read the optional `> Surface: <name>` blockquote line from a
 * design-system document.
 *
 * Only the first such line is considered. The declared value is trimmed
 * and lower-cased before being checked against KNOWN_SURFACES.
 *
 * @param {string} raw - Full text of the design-system file.
 * @returns {string} The declared surface, or 'web' when the line is
 *   absent or names an unknown surface.
 */
function extractSurface(raw) {
  const declared = /^>\s*Surface:\s*(.+?)\s*$/im.exec(raw)?.[1].trim().toLowerCase();
  return KNOWN_SURFACES.has(declared) ? declared : 'web';
}
|
||||
|
||||
// Strip boilerplate like "Design System Inspired by Cohere" → "Cohere" so
|
||||
// the picker dropdown reads cleanly. Hand-authored titles that don't match
|
||||
// the pattern (e.g. "Neutral Modern") pass through unchanged.
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
// Daemon-side mirror of src/media/models.ts. We keep this in plain JS so
|
||||
// node imports are native and the daemon never needs a TS toolchain at
|
||||
// runtime. The two files are kept in sync by review — any model added to
|
||||
// src/media/models.ts must be added here too. Tests in verify ensure the
|
||||
// arrays are non-empty and IDs are unique.
|
||||
|
||||
// Models offered for the `image` surface. Every catalog entry has the same
// shape: `id` (the value passed as --model), `label` (display text), `hint`
// (short provenance note) and `caps` (capability tags).
// NOTE(review): cap tags such as 't2i' / 'i2i' presumably mean
// text-to-image / image-to-image — confirm against src/media/models.ts.
export const IMAGE_MODELS = [
  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
];

// Models offered for the `video` surface.
export const VIDEO_MODELS = [
  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
];

// Models offered for the `audio` surface, grouped by audio kind
// (music | speech | sfx).
export const AUDIO_MODELS_BY_KIND = {
  music: [
    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
  ],
  speech: [
    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
  ],
  sfx: [
    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
  ],
};

// Aspect-ratio choices, written as "width:height".
export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];
// Video length choices, in seconds.
export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];
// Audio duration choices, in seconds.
export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];
|
||||
|
||||
/**
 * Look up a model definition by id across every catalog.
 *
 * Catalogs are searched in a fixed order — image, video, then audio
 * music / speech / sfx — and the first entry with a matching id wins.
 *
 * @param {string} id - Model id to look for.
 * @returns {Object|null} The matching catalog entry, or null if none.
 */
export function findMediaModel(id) {
  const catalogs = [
    IMAGE_MODELS,
    VIDEO_MODELS,
    AUDIO_MODELS_BY_KIND.music,
    AUDIO_MODELS_BY_KIND.speech,
    AUDIO_MODELS_BY_KIND.sfx,
  ];
  for (const catalog of catalogs) {
    for (const entry of catalog) {
      if (entry.id === id) return entry;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Return the model catalog for a surface.
 *
 * @param {string} surface - 'image' | 'video' | 'audio'.
 * @param {string} [audioKind] - For the audio surface: 'music' | 'speech' |
 *   'sfx'. Missing or unrecognised kinds fall back to the music catalog.
 * @returns {Object[]} The catalog array; empty for an unknown surface.
 */
export function modelsForSurface(surface, audioKind) {
  if (surface === 'image') return IMAGE_MODELS;
  if (surface === 'video') return VIDEO_MODELS;
  if (surface === 'audio') {
    const k = audioKind || 'music';
    // Own-property check: a bare `AUDIO_MODELS_BY_KIND[k]` lookup would
    // return inherited members (e.g. k === 'constructor' or 'toString')
    // instead of falling back to the music catalog.
    return Object.hasOwn(AUDIO_MODELS_BY_KIND, k)
      ? AUDIO_MODELS_BY_KIND[k]
      : AUDIO_MODELS_BY_KIND.music;
  }
  return [];
}
|
||||
+263
@@ -0,0 +1,263 @@
|
||||
// Media-generation dispatcher. The unifying contract is:
|
||||
//
|
||||
// skills + metadata + system-prompt
|
||||
// ↓ (the code agent decides what to make)
|
||||
// `od media generate --surface … --model … --output … --prompt …`
|
||||
// ↓ (this module routes to a provider)
|
||||
// bytes written to <projectsRoot>/<projectId>/<output>
|
||||
// ↓
|
||||
// FileViewer renders it.
|
||||
//
|
||||
// Every surface (image / video / audio) flows through this single
|
||||
// entrypoint. Providers are pluggable: each file under ./media-providers/
|
||||
// (or inline below) registers handlers keyed by (surface, model). The
|
||||
// fallback handlers emit a deterministic, lightweight placeholder
|
||||
// (1×1 PNG or labelled SVG, silent WAV / stub MP3 frame, minimal MP4) so the framework works
|
||||
// without API keys — real provider integrations slot in later by
|
||||
// replacing the handler.
|
||||
|
||||
import { mkdir, stat, writeFile } from 'node:fs/promises';
|
||||
import path from 'node:path';
|
||||
import { findMediaModel } from './media-models.js';
|
||||
import {
|
||||
ensureProject,
|
||||
kindFor,
|
||||
mimeFor,
|
||||
sanitizeName,
|
||||
} from './projects.js';
|
||||
|
||||
// Base filename (stem + extension) used by autoOutputName() when the caller
// does not supply an output name; the extension also decides which stub
// format the renderers emit.
const DEFAULT_OUTPUT_BY_SURFACE = {
  image: 'image.png',
  video: 'video.mp4',
  audio: 'audio.mp3',
};

// Surfaces generateMedia() accepts; anything else is rejected up front.
const SURFACES = new Set(['image', 'video', 'audio']);
|
||||
|
||||
/**
 * Produce one media artifact for a project and persist it on disk.
 *
 * Arguments are validated in a fixed order (projectsRoot, projectId,
 * surface, model, model registration) and the first failure is thrown as
 * an Error. The output name is passed through sanitizeName() before being
 * joined onto the project directory returned by ensureProject().
 *
 * NOTE(review): the model lookup is not given the surface, so presumably a
 * model registered for a different surface passes validation — confirm
 * whether that is intended.
 *
 * @param {Object} args
 * @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
 * @param {string} args.projectId
 * @param {'image'|'video'|'audio'} args.surface
 * @param {string} args.model - Must be a registered model id.
 * @param {string} [args.prompt]
 * @param {string} [args.output] - Optional filename; auto-named if missing.
 * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4
 * @param {number} [args.length] - Video length, seconds.
 * @param {number} [args.duration] - Audio duration, seconds.
 * @param {string} [args.voice]
 * @param {string} [args.audioKind] - music | speech | sfx
 * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
 * @throws {Error} When a required argument is missing, the surface is
 *   unsupported, or the model id is not registered.
 */
export async function generateMedia(args) {
  const {
    projectsRoot,
    projectId,
    surface,
    model,
    prompt,
    output,
    aspect,
    length,
    duration,
    voice,
    audioKind,
  } = args;

  // ---- Validation (order determines which error the caller sees) ----
  if (!projectsRoot) throw new Error('projectsRoot required');

  const hasProjectId = typeof projectId === 'string' && projectId.length > 0;
  if (!hasProjectId) throw new Error('projectId required');

  if (!SURFACES.has(surface)) throw new Error(`unsupported surface: ${surface}`);

  const hasModel = typeof model === 'string' && model.length > 0;
  if (!hasModel) throw new Error('model required');

  if (!findMediaModel(model)) {
    throw new Error(
      `unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
    );
  }

  // ---- Resolve the destination inside the project directory ----
  const projectDir = await ensureProject(projectsRoot, projectId);
  const requestedName = output || autoOutputName(surface, model, audioKind);
  const safeOut = sanitizeName(requestedName);
  const target = path.join(projectDir, safeOut);
  await mkdir(path.dirname(target), { recursive: true });

  // ---- Normalised request handed to the renderer ----
  const ctx = {
    surface,
    model,
    prompt: prompt || '',
    aspect: aspect || defaultAspectFor(surface),
    length: typeof length === 'number' ? length : undefined,
    duration: typeof duration === 'number' ? duration : undefined,
    voice: voice || '',
    audioKind: audioKind || (surface === 'audio' ? 'music' : undefined),
  };

  // `surface` was validated above, so anything that is not image/video is audio.
  const render =
    surface === 'image' ? renderImage : surface === 'video' ? renderVideo : renderAudio;
  const { bytes, providerNote } = await render(ctx, safeOut);

  await writeFile(target, bytes);
  const info = await stat(target);

  return {
    name: safeOut,
    size: info.size,
    mtime: info.mtimeMs,
    kind: kindFor(safeOut),
    mime: mimeFor(safeOut),
    model,
    surface,
    providerNote,
  };
}
|
||||
|
||||
/**
 * Build a default output filename of the form `<stem>-<tag>-<stamp><ext>`.
 *
 * `stem` and `ext` come from the surface's entry in
 * DEFAULT_OUTPUT_BY_SURFACE ('artifact.bin' when there is none), `tag` is
 * the model id (prefixed with the audio kind for audio), and `stamp` is
 * the current time in base 36.
 *
 * @param {string} surface
 * @param {string} model
 * @param {string} [audioKind]
 * @returns {string}
 */
function autoOutputName(surface, model, audioKind) {
  const template = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin';
  const stamp = Date.now().toString(36);

  let tag = model;
  if (surface === 'audio' && audioKind) {
    tag = `${audioKind}-${model}`;
  }

  // Split on the last dot; a leading dot (index 0) does not count as an extension.
  const cut = template.lastIndexOf('.');
  const hasExt = cut > 0;
  const stem = hasExt ? template.slice(0, cut) : template;
  const ext = hasExt ? template.slice(cut) : '';

  return [stem, tag, stamp].join('-') + ext;
}
|
||||
|
||||
/**
 * Aspect ratio used when the caller does not specify one.
 *
 * @param {string} surface
 * @returns {string|undefined} '1:1' for image, '16:9' for video, and
 *   undefined for every other surface.
 */
function defaultAspectFor(surface) {
  switch (surface) {
    case 'image':
      return '1:1';
    case 'video':
      return '16:9';
    default:
      return undefined;
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Provider stubs.
|
||||
//
|
||||
// Each renderer returns Buffer bytes that the caller writes to disk. They
|
||||
// produce real, lightweight placeholder media labelled with the model +
|
||||
// prompt so the user can verify which call was dispatched while the real
|
||||
// provider integrations are still pending. To replace a stub with a real
|
||||
// provider, swap the body — keep the (ctx, fileName) → { bytes, providerNote }
|
||||
// shape so server.js doesn't change.
|
||||
|
||||
/**
 * Image stub provider.
 *
 * For a `.svg` output name it emits the labelled SVG from svgPlaceholder().
 * For every other extension it emits a fixed 67-byte 1×1 transparent PNG —
 * note that this is PNG data even when the name ends in e.g. `.jpg`.
 *
 * @param {Object} ctx - Normalised request (model, aspect, prompt, …).
 * @param {string} fileName - Sanitised output filename; only its extension
 *   is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderImage(ctx, fileName) {
  const ext = path.extname(fileName).toLowerCase();
  if (ext === '.svg') {
    return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
  }

  // Minimal 1×1 transparent PNG. Each chunk is
  // [4-byte length][4-byte type][data][4-byte CRC]; the length field counts
  // the data bytes only, so it must match exactly or decoders lose framing.
  const png = Buffer.from([
    // Signature
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
    // IHDR — 13 data bytes: width 1, height 1, 8-bit, colour type 6 (RGBA)
    0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
    0x08, 0x06, 0x00, 0x00, 0x00,
    0x1f, 0x15, 0xc4, 0x89,
    // IDAT — 10 data bytes (zlib stream for one transparent pixel).
    // The length was previously declared as 0x0d, which made the chunk
    // overrun into the CRC/IEND bytes and produced an undecodable file.
    0x00, 0x00, 0x00, 0x0a, 0x49, 0x44, 0x41, 0x54,
    0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05, 0x00, 0x01,
    0x0d, 0x0a, 0x2d, 0xb4,
    // IEND — empty
    0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44,
    0xae, 0x42, 0x60, 0x82,
  ]);
  return {
    bytes: png,
    providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
  };
}
|
||||
|
||||
/**
 * Video stub provider.
 *
 * Emits a fixed 32-byte file made of two ISO-BMFF boxes: a 24-byte `ftyp`
 * box (major brand `isom`, minor version 0x200, compatible brands `isom`
 * and `iso2`) followed by an empty 8-byte `mdat` box. There is no `moov`
 * box and no video track; the payload only demonstrates the dispatch
 * round-trip until a real provider replaces this body.
 *
 * @param {Object} ctx - Normalised request (model, aspect, length, prompt).
 * @param {string} _fileName - Unused.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderVideo(ctx, _fileName) {
  // Each box is [4-byte big-endian size][4-byte type][payload].
  const ftyp = Buffer.alloc(24);
  ftyp.writeUInt32BE(24, 0);
  ftyp.write('ftypisom', 4, 'ascii');
  ftyp.writeUInt32BE(0x200, 12);
  ftyp.write('isomiso2', 16, 'ascii');

  const mdat = Buffer.alloc(8);
  mdat.writeUInt32BE(8, 0);
  mdat.write('mdat', 4, 'ascii');

  const lengthLabel = ctx.length ?? '?';
  const providerNote = `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${lengthLabel}s · prompt=${truncate(ctx.prompt, 60)}`;
  return { bytes: Buffer.concat([ftyp, mdat]), providerNote };
}
|
||||
|
||||
/**
 * Audio stub provider.
 *
 * A `.wav` output name gets half a second of silence from silentWav();
 * every other extension gets a fixed 12-byte buffer that starts with an
 * MP3 frame-header pattern (ff fb 90 44) followed by zeros. The requested
 * duration is only echoed in the note, not reflected in the bytes.
 *
 * NOTE(review): 12 bytes is far shorter than a full MP3 frame, so players
 * presumably treat it as empty or reject it — confirm before relying on it.
 *
 * @param {Object} ctx - Normalised request (model, audioKind, voice, duration).
 * @param {string} fileName - Sanitised output filename; only its extension
 *   is inspected.
 * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
 */
async function renderAudio(ctx, fileName) {
  const durationLabel = `${ctx.duration ?? '?'}s`;
  const wantsWav = path.extname(fileName).toLowerCase() === '.wav';

  if (wantsWav) {
    const providerNote = `stub-wav · model=${ctx.model} · kind=${ctx.audioKind} · duration=${durationLabel}`;
    return { bytes: silentWav(0.5), providerNote };
  }

  // Frame-header bytes followed by eight zero bytes of padding.
  const frameHeader = Buffer.from([0xff, 0xfb, 0x90, 0x44]);
  const mp3 = Buffer.concat([frameHeader, Buffer.alloc(8)]);
  const voiceLabel = ctx.voice || '-';
  return {
    bytes: mp3,
    providerNote: `stub-mp3 · model=${ctx.model} · kind=${ctx.audioKind} · voice=${voiceLabel} · duration=${durationLabel}`,
  };
}
|
||||
|
||||
/**
 * Build a labelled SVG placeholder: a dark rectangle sized from the
 * requested aspect ratio with "<model> — <prompt>" centred on it.
 *
 * The model and prompt are caller-supplied text, so both are XML-escaped
 * before being embedded. The prompt is clipped to 60 characters BEFORE
 * escaping so an entity such as `&amp;` can never be cut in half.
 *
 * @param {Object} ctx - Normalised request; reads `aspect`, `model`, `prompt`.
 * @returns {string} SVG document text.
 */
function svgPlaceholder(ctx) {
  const [w, h] = aspectToBox(ctx.aspect, 800);
  // `&` must be replaced first, otherwise the entities produced for `<`
  // and `>` would themselves be re-escaped.
  const escapeXml = (s) =>
    String(s || '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  // Clip by code point so a surrogate pair is never split.
  const clip = (s, max) => [...String(s || '')].slice(0, max).join('');
  return [
    `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
    `<rect width="${w}" height="${h}" fill="#0f1424"/>`,
    `<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${escapeXml(ctx.model)} — ${escapeXml(clip(ctx.prompt, 60))}</text>`,
    '</svg>',
  ].join('');
}

/**
 * Convert a "W:H" aspect string into pixel dimensions whose longer side
 * equals `base`.
 *
 * @param {string} aspect - e.g. '16:9'. Missing, malformed, non-positive
 *   or non-finite ratios yield a square.
 * @param {number} base - Length of the longer side, in pixels.
 * @returns {[number, number]} `[width, height]`.
 */
function aspectToBox(aspect, base) {
  const [a, b] = String(aspect || '1:1').split(':').map(Number);
  const usable = Number.isFinite(a) && Number.isFinite(b) && a > 0 && b > 0;
  if (!usable) return [base, base];
  if (a >= b) return [base, Math.round((base * b) / a)];
  return [Math.round((base * a) / b), base];
}
|
||||
|
||||
/**
 * Build a silent WAV file: 16-bit mono PCM at 8 kHz.
 *
 * The buffer is zero-filled, so only the 44-byte RIFF/WAVE header has to
 * be written; the sample data that follows it is already silence. At
 * least one sample is always emitted.
 *
 * @param {number} seconds - Requested length of silence.
 * @returns {Buffer} Complete WAV file bytes.
 */
function silentWav(seconds) {
  const SAMPLE_RATE = 8000;
  const BYTES_PER_SAMPLE = 2;
  const HEADER_BYTES = 44;

  const sampleCount = Math.max(1, Math.round(SAMPLE_RATE * seconds));
  const payloadBytes = sampleCount * BYTES_PER_SAMPLE;
  const wav = Buffer.alloc(HEADER_BYTES + payloadBytes);

  // Four-character chunk tags, by byte offset.
  const tags = [
    [0, 'RIFF'],
    [8, 'WAVE'],
    [12, 'fmt '],
    [36, 'data'],
  ];
  for (const [offset, tag] of tags) wav.write(tag, offset, 'ascii');

  // 32-bit little-endian fields, by byte offset.
  const dwords = [
    [4, HEADER_BYTES - 8 + payloadBytes], // RIFF chunk size
    [16, 16], // fmt chunk size
    [24, SAMPLE_RATE], // sample rate
    [28, SAMPLE_RATE * BYTES_PER_SAMPLE], // byte rate
    [40, payloadBytes], // data chunk size
  ];
  for (const [offset, value] of dwords) wav.writeUInt32LE(value, offset);

  // 16-bit little-endian fields, by byte offset.
  const words = [
    [20, 1], // PCM
    [22, 1], // mono
    [32, BYTES_PER_SAMPLE], // block align
    [34, 16], // bits per sample
  ];
  for (const [offset, value] of words) wav.writeUInt16LE(value, offset);

  return wav;
}
|
||||
|
||||
/**
 * Shorten text to at most `n` characters, marking a cut with a trailing
 * ellipsis character.
 *
 * @param {*} s - Value to shorten; falsy values become the empty string.
 * @param {number} n - Maximum length of the result, ellipsis included.
 * @returns {string} The text unchanged when it already fits, otherwise its
 *   first `n - 1` characters followed by '…'.
 */
function truncate(s, n) {
  const text = String(s || '');
  return text.length > n ? `${text.slice(0, n - 1)}…` : text;
}
|
||||
@@ -156,6 +156,21 @@ const EXT_MIME = {
|
||||
'.gif': 'image/gif',
|
||||
'.webp': 'image/webp',
|
||||
'.avif': 'image/avif',
|
||||
// Video — covered MIMEs are the formats most generators emit. Browsers
|
||||
// play them via <video> / <audio> in the FileViewer with no transcode.
|
||||
'.mp4': 'video/mp4',
|
||||
'.m4v': 'video/mp4',
|
||||
'.webm': 'video/webm',
|
||||
'.mov': 'video/quicktime',
|
||||
// Audio — music / TTS generators commonly produce mp3 / wav / ogg /
|
||||
// m4a; flac is rarer but cheap to support.
|
||||
'.mp3': 'audio/mpeg',
|
||||
'.wav': 'audio/wav',
|
||||
'.ogg': 'audio/ogg',
|
||||
'.oga': 'audio/ogg',
|
||||
'.m4a': 'audio/mp4',
|
||||
'.flac': 'audio/flac',
|
||||
'.aac': 'audio/aac',
|
||||
};
|
||||
|
||||
export function mimeFor(name) {
|
||||
@@ -175,6 +190,10 @@ export function kindFor(name) {
|
||||
if (name.startsWith('sketch-')) return 'sketch';
|
||||
return 'image';
|
||||
}
|
||||
if (['.mp4', '.m4v', '.webm', '.mov'].includes(ext)) return 'video';
|
||||
if (['.mp3', '.wav', '.ogg', '.oga', '.m4a', '.flac', '.aac'].includes(ext)) {
|
||||
return 'audio';
|
||||
}
|
||||
if (['.md', '.txt'].includes(ext)) return 'text';
|
||||
if (['.js', '.mjs', '.cjs', '.ts', '.tsx', '.json', '.css'].includes(ext)) {
|
||||
return 'code';
|
||||
|
||||
+74
-1
@@ -22,6 +22,15 @@ import {
|
||||
sanitizeName,
|
||||
writeProjectFile,
|
||||
} from './projects.js';
|
||||
import { generateMedia } from './media.js';
|
||||
import {
|
||||
AUDIO_MODELS_BY_KIND,
|
||||
IMAGE_MODELS,
|
||||
VIDEO_MODELS,
|
||||
MEDIA_ASPECTS,
|
||||
VIDEO_LENGTHS_SEC,
|
||||
AUDIO_DURATIONS_SEC,
|
||||
} from './media-models.js';
|
||||
import {
|
||||
deleteConversation,
|
||||
deleteProject as dbDeleteProject,
|
||||
@@ -50,6 +59,10 @@ const PROJECT_ROOT = path.resolve(__dirname, '..');
|
||||
const STATIC_DIR = path.join(PROJECT_ROOT, 'dist');
|
||||
const SKILLS_DIR = path.join(PROJECT_ROOT, 'skills');
|
||||
const DESIGN_SYSTEMS_DIR = path.join(PROJECT_ROOT, 'design-systems');
|
||||
// Absolute path to the daemon CLI entry. We inject this into the spawned
|
||||
// agent's env as OD_BIN so the agent can run `node "$OD_BIN" media generate …`
|
||||
// regardless of whether the user has `od` on PATH.
|
||||
const OD_BIN_PATH = path.join(__dirname, 'cli.js');
|
||||
const ARTIFACTS_DIR = path.join(PROJECT_ROOT, '.od', 'artifacts');
|
||||
const PROJECTS_DIR = path.join(PROJECT_ROOT, '.od', 'projects');
|
||||
fs.mkdirSync(PROJECTS_DIR, { recursive: true });
|
||||
@@ -650,6 +663,56 @@ export async function startServer({ port = 7456 } = {}) {
|
||||
}
|
||||
});
|
||||
|
||||
// ---- Media generation -----------------------------------------------------
|
||||
//
|
||||
// Surface-agnostic media dispatcher. The code agent reaches this via
|
||||
// `od media generate` (see daemon/cli.js media subcommand), which is
|
||||
// the unified contract: skills + metadata + system-prompt instruct the
|
||||
// agent on WHAT to produce, the agent invokes ONE entrypoint that
|
||||
// dispatches per (surface, model) and writes the bytes into the project.
|
||||
// The shape of the response matches POST /api/projects/:id/files so the
|
||||
// frontend can refresh the file list with the same code path.
|
||||
|
||||
// GET /api/media/models — returns the registered model catalogs together
// with the aspect / length / duration option lists as one JSON object.
app.get('/api/media/models', (_req, res) => {
  const catalog = {
    image: IMAGE_MODELS,
    video: VIDEO_MODELS,
    audio: AUDIO_MODELS_BY_KIND,
    aspects: MEDIA_ASPECTS,
    videoLengthsSec: VIDEO_LENGTHS_SEC,
    audioDurationsSec: AUDIO_DURATIONS_SEC,
  };
  res.json(catalog);
});
|
||||
|
||||
// POST /api/projects/:id/media/generate — run the media dispatcher for one
// project and answer `{ file: <metadata> }`.
//
// Responses: 404 when the project id is not in the DB, 400 with
// `{ error }` for anything generateMedia() throws, 200 otherwise.
app.post('/api/projects/:id/media/generate', async (req, res) => {
  // Numeric options are forwarded only when the JSON body carried a real
  // number; any other type is dropped.
  const numberOrUndefined = (value) => (typeof value === 'number' ? value : undefined);

  try {
    const projectId = req.params.id;

    // Check the DB first so a bad id gets a friendly 404 instead of files
    // being written for a project that does not exist.
    if (!getProject(db, projectId)) {
      res.status(404).json({ error: 'project not found' });
      return;
    }

    const body = req.body;
    const file = await generateMedia({
      projectsRoot: PROJECTS_DIR,
      projectId,
      surface: body?.surface,
      model: body?.model,
      prompt: body?.prompt,
      output: body?.output,
      aspect: body?.aspect,
      length: numberOrUndefined(body?.length),
      duration: numberOrUndefined(body?.duration),
      voice: body?.voice,
      audioKind: body?.audioKind,
    });
    res.json({ file });
  } catch (err) {
    const message = err && err.message ? err.message : err;
    res.status(400).json({ error: String(message) });
  }
});
|
||||
|
||||
// Multi-file upload that the chat composer uses for paste/drop/picker.
|
||||
// Files land flat in the project folder; the response carries the same
|
||||
// metadata as listFiles so the client can stage them as ChatAttachments
|
||||
@@ -800,10 +863,20 @@ export async function startServer({ port = 7456 } = {}) {
|
||||
cwd,
|
||||
});
|
||||
|
||||
// Inject the OD context. Skills + the media-contract prompt tell the
|
||||
// agent how to spend this — call `node "$OD_BIN" media generate
|
||||
// --project "$OD_PROJECT_ID" …` and the daemon dispatches.
|
||||
const odEnv = {
|
||||
OD_BIN: OD_BIN_PATH,
|
||||
OD_DAEMON_URL: `http://127.0.0.1:${port}`,
|
||||
OD_PROJECT_ID: typeof projectId === 'string' ? projectId : '',
|
||||
OD_PROJECT_DIR: cwd || '',
|
||||
};
|
||||
|
||||
let child;
|
||||
try {
|
||||
child = spawn(def.bin, args, {
|
||||
env: { ...process.env },
|
||||
env: { ...process.env, ...odEnv },
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
cwd: cwd || undefined,
|
||||
});
|
||||
|
||||
@@ -25,12 +25,16 @@ export async function listSkills(skillsRoot) {
|
||||
const { data, body } = parseFrontmatter(raw);
|
||||
const hasAttachments = await dirHasAttachments(dir);
|
||||
const mode = data.od?.mode || inferMode(body, data.description);
|
||||
const surface = normalizeSurface(data.od?.surface, mode);
|
||||
out.push({
|
||||
id: data.name || entry.name,
|
||||
name: data.name || entry.name,
|
||||
description: data.description || "",
|
||||
triggers: Array.isArray(data.triggers) ? data.triggers : [],
|
||||
mode,
|
||||
// Surface defaults to inferring from `mode` so legacy SKILL.md
|
||||
// files (no `od.surface` declared) keep classifying correctly.
|
||||
surface,
|
||||
platform: normalizePlatform(
|
||||
data.od?.platform,
|
||||
mode,
|
||||
@@ -159,6 +163,20 @@ function inferMode(body, description) {
|
||||
return "prototype";
|
||||
}
|
||||
|
||||
// Surface is the high-level output bucket — web, image, video or audio.
|
||||
// Authors can pin it via `od.surface`; otherwise we derive from `mode`,
|
||||
// then fall back to the safe default ('web') so existing skills classify
|
||||
// unchanged.
|
||||
// Surfaces a skill may be classified under.
const KNOWN_SURFACES = new Set(["web", "image", "video", "audio"]);

/**
 * Resolve a skill's surface.
 *
 * Precedence: an explicit, recognised `od.surface` value (trimmed,
 * case-insensitive) wins; otherwise a `mode` of image / video / audio is
 * used as the surface; otherwise the result is "web".
 *
 * @param {*} value - Raw `od.surface` frontmatter value, if any.
 * @param {string} mode - The skill's resolved mode.
 * @returns {string} One of "web", "image", "video", "audio".
 */
function normalizeSurface(value, mode) {
  const declared = typeof value === "string" ? value.trim().toLowerCase() : null;
  if (declared !== null && KNOWN_SURFACES.has(declared)) return declared;

  switch (mode) {
    case "image":
    case "video":
    case "audio":
      return mode;
    default:
      return "web";
  }
}
|
||||
|
||||
// Validate platform tag — only desktop / mobile are meaningful for the
|
||||
// Examples gallery. Falls back to autodetecting "mobile" from descriptions
|
||||
// so legacy skills sort under the right pill without authoring changes.
|
||||
|
||||
Reference in New Issue
Block a user