#!/usr/bin/env node
// ===========================================================================
// daemon/cli.js — state after diff 2cb1deb..29fcef1.
//
// Reconstructed from a whitespace-collapsed patch. Only the lines carried by
// the two hunks are available; the unchanged region between them is marked
// below and is NOT reproduced.
// ===========================================================================
import { startServer } from './server.js';

const argv = process.argv.slice(2);

// ---- Subcommand router ----------------------------------------------------
//
// `od` is two CLIs glued together:
//   - default mode: starts the daemon + opens the web UI.
//   - `od media …`: a thin client that POSTs to the running daemon. This
//     is what the code agent invokes from inside a chat to actually
//     produce image / video / audio bytes (the unifying contract).
//
// We dispatch on the first positional argument so flags like --port keep
// working unchanged. Subcommand routing is keyword-based; flags are
// parsed inside each handler.

const SUBCOMMAND_MAP = {
  media: runMedia,
};

// The value that follows -p/--port is not a positional: skip it so that
// `od --port 7456 media …` still reaches the media client.
const firstIdx = findPositionalIndex(argv, (flag) => flag === '-p' || flag === '--port');
const first = firstIdx === -1 ? undefined : argv[firstIdx];
// Own-property lookup only: inherited keys such as "toString" must not be
// mistaken for subcommands.
if (first !== undefined && Object.hasOwn(SUBCOMMAND_MAP, first)) {
  const rest = [...argv.slice(0, firstIdx), ...argv.slice(firstIdx + 1)];
  await SUBCOMMAND_MAP[first](rest);
  process.exit(0);
}

// Default: daemon mode.
let port = Number(process.env.OD_PORT) || 7456;
let open = true;

for (let i = 0; i < argv.length; i++) {
  const a = argv[i];
  if (a === '-p' || a === '--port') {
    port = Number(argv[++i]);
  } else if (a === '--no-open') {
    open = false;
  } else if (a === '-h' || a === '--help') {
    printRootHelp();
    process.exit(0);
  }
}

// --- unchanged lines of daemon/cli.js that the diff does not include --------
// The `startServer({ port }).then(url => { … });` call that boots the daemon
// lives here. The diff shows only its hunk-header context and closing lines,
// so it is intentionally not reconstructed.
// ----------------------------------------------------------------------------

/**
 * Return the index of the first positional token, or -1 when there is none.
 *
 * A token starting with "-" is a flag. When `takesValue(flag)` is true and the
 * flag is not written as `--key=value`, the following token is treated as the
 * flag's value and skipped — unless that token is itself a `--flag`, which
 * mirrors how parseFlags decides between a value and a boolean flag.
 *
 * @param {string[]} tokens
 * @param {(flag: string) => boolean} takesValue
 * @returns {number}
 */
function findPositionalIndex(tokens, takesValue) {
  for (let i = 0; i < tokens.length; i++) {
    const tok = tokens[i];
    if (!tok.startsWith('-')) return i;
    if (tok.includes('=') || !takesValue(tok)) continue;
    const next = tokens[i + 1];
    if (next != null && !next.startsWith('--')) i++;
  }
  return -1;
}

/** Print usage for the root `od` command. */
function printRootHelp() {
  console.log(`Usage:
  od [--port <n>] [--no-open]
      Start the local daemon and open the web UI.

  od media generate --surface <image|video|audio> --model <id> [opts]
      Generate a media artifact and write it into the active project.
      Designed to be invoked by a code agent — picks up OD_DAEMON_URL
      and OD_PROJECT_ID from the env that the daemon injected on spawn.

What the daemon does:
  * scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
  * serves the chat UI at http://localhost:<port>
  * proxies messages (text + images) to the selected agent via child-process spawn
  * exposes /api/projects/:id/media/generate — the unified image/video/audio
    dispatcher that the agent calls via \`od media generate\`.`);
}

// ---------------------------------------------------------------------------
// Subcommand: od media …
// ---------------------------------------------------------------------------

/**
 * `od media …` client: validates flags, POSTs the request to the running
 * daemon and prints the daemon's JSON response as a single line on stdout.
 *
 * Exit codes: 1 unknown subcommand · 2 invalid/missing flag · 3 daemon
 * unreachable · 4 daemon answered with a non-2xx status.
 *
 * @param {string[]} args - argv with the leading `media` token removed.
 * @returns {Promise<void>}
 */
async function runMedia(args) {
  if (args.includes('-h') || args.includes('--help')) {
    printMediaHelp();
    return;
  }
  // Every remaining flag takes a value, so skip flag values when looking for
  // the subcommand (`od media --project p1 generate` must find "generate").
  const subIdx = findPositionalIndex(args, () => true);
  const sub = subIdx === -1 ? '' : args[subIdx];
  if (sub === 'help' || sub === '') {
    printMediaHelp();
    return;
  }
  if (sub !== 'generate') {
    console.error(`unknown subcommand: od media ${sub}`);
    printMediaHelp();
    process.exit(1);
  }

  const flags = parseFlags([...args.slice(0, subIdx), ...args.slice(subIdx + 1)]);

  // Declared locally: runMedia runs from the router at the top of the module,
  // before any module-level `const` further down would be initialised.
  const valueFlags = [
    'daemon-url', 'project', 'surface', 'model', 'prompt', 'output',
    'aspect', 'voice', 'audio-kind', 'length', 'duration',
  ];
  for (const key of valueFlags) {
    // parseFlags stores `true` for a flag that was given without a value.
    if (flags[key] === true) {
      console.error(`--${key} requires a value`);
      process.exit(2);
    }
  }

  const daemonUrl = flags['daemon-url'] || process.env.OD_DAEMON_URL || 'http://127.0.0.1:7456';
  const baseUrl = daemonUrl.replace(/\/+$/, '');
  const projectId = flags.project || process.env.OD_PROJECT_ID;
  if (!projectId) {
    console.error(
      'project id required. Pass --project <id> or set OD_PROJECT_ID. The daemon injects this when it spawns the code agent.',
    );
    process.exit(2);
  }

  const surface = flags.surface;
  if (!surface || !['image', 'video', 'audio'].includes(surface)) {
    console.error('--surface must be one of: image | video | audio');
    process.exit(2);
  }
  if (!flags.model) {
    console.error(`--model required (see ${baseUrl}/api/media/models)`);
    process.exit(2);
  }

  const body = {
    surface,
    model: flags.model,
    prompt: flags.prompt,
    output: flags.output,
    aspect: flags.aspect,
    voice: flags.voice,
    audioKind: flags['audio-kind'],
  };
  for (const key of ['length', 'duration']) {
    if (flags[key] == null) continue;
    const seconds = Number(flags[key]);
    // Reject NaN/Infinity here: JSON.stringify would silently send `null`.
    if (flags[key].trim() === '' || !Number.isFinite(seconds)) {
      console.error(`--${key} must be a number of seconds`);
      process.exit(2);
    }
    body[key] = seconds;
  }

  const url = `${baseUrl}/api/projects/${encodeURIComponent(projectId)}/media/generate`;
  let resp;
  try {
    resp = await fetch(url, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(body),
    });
  } catch (err) {
    console.error(`failed to reach daemon at ${daemonUrl}: ${err.message}`);
    process.exit(3);
  }
  const text = await resp.text();
  if (!resp.ok) {
    console.error(`daemon ${resp.status}: ${text}`);
    process.exit(4);
  }
  // Print the JSON response as one line so the agent can parse it.
  process.stdout.write(text.trim() + '\n');
}

/**
 * Parse `--key value`, `--key=value` and bare `--key` (stored as `true`).
 * Tokens that do not start with "--" and are not consumed as a value are
 * ignored.
 *
 * @param {string[]} argv
 * @returns {Record<string, string | true>} Prototype-less map of flag → value.
 */
function parseFlags(argv) {
  // No prototype: a flag named like an Object.prototype member stays plain data.
  const out = Object.create(null);
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (!a || !a.startsWith('--') || a === '--') continue;
    const eq = a.indexOf('=');
    if (eq !== -1) {
      out[a.slice(2, eq)] = a.slice(eq + 1);
      continue;
    }
    const key = a.slice(2);
    const next = argv[i + 1];
    if (next != null && !next.startsWith('--')) {
      out[key] = next;
      i++;
    } else {
      out[key] = true;
    }
  }
  return out;
}

/** Print usage for `od media generate`. */
function printMediaHelp() {
  console.log(`Usage: od media generate --surface <image|video|audio> --model <id> [opts]

Required:
  --surface <surface>    image | video | audio
  --model <id>           Model id from /api/media/models (e.g. gpt-image-2, seedance-2, suno-v5).
  --project <id>         Project id. Auto-resolved from OD_PROJECT_ID when invoked by the daemon.

Common options:
  --prompt "<text>"      Generation prompt.
  --output <filename>    File to write under the project. Auto-named if omitted.
  --aspect 1:1|16:9|9:16|4:3|3:4
  --length <seconds>     Video length.
  --duration <seconds>   Audio duration.
  --voice <voice>        Speech / TTS voice.
  --audio-kind music|speech|sfx
  --daemon-url <url>     Defaults to OD_DAEMON_URL, then http://127.0.0.1:7456

Flags accept both "--key value" and "--key=value".

Output: a single line of JSON: {"file": { name, size, kind, mime, ... }}.

Skills should call this and then reference the returned filename in their
artifact / message body. The daemon writes the bytes into the project's
files folder so the FileViewer can preview them immediately.`);
}

// ===========================================================================
// daemon/design-systems.js — diff 544c4bf..9a9622b (two hunks; the rest of
// the file is not part of this view).
//
// Hunk 1, inside listDesignSystems, adds one property to the per-system
// object, between `swatches: extractSwatches(raw),` and `body: raw,`:
//
//     // Optional `> Surface: image|video|audio` blockquote line. Most
//     // existing systems target the web surface and don't declare it;
//     // we default to 'web' so the right-side filter classifies them
//     // correctly.
//     surface: extractSurface(raw),
//
// Hunk 2 adds the constant and helper below, right after extractCategory.
// ===========================================================================

const KNOWN_SURFACES = new Set(['web', 'image', 'video', 'audio']);

/**
 * Read the optional `> Surface: <name>` blockquote line of a design system.
 *
 * @param {string} raw - Raw text of the design-system file.
 * @returns {string} One of KNOWN_SURFACES; 'web' when the line is missing or
 *   names a surface that is not in the set.
 */
function extractSurface(raw) {
  const m = /^>\s*Surface:\s*(.+?)\s*$/im.exec(raw);
  if (!m) return 'web';
  const v = m[1].trim().toLowerCase();
  return KNOWN_SURFACES.has(v) ? v : 'web';
}

// (context from design-systems.js — comment of the next, unshown function)
// Strip boilerplate like "Design System Inspired by Cohere" → "Cohere" so
// the picker dropdown reads cleanly. Hand-authored titles that don't match
// the pattern (e.g. "Neutral Modern") pass through unchanged.

// ===========================================================================
// daemon/media-models.js — new file (index 0000000..11f537f)
// ===========================================================================
// Daemon-side mirror of src/media/models.ts.
// We keep this in plain JS so node imports are native and the daemon never
// needs a TS toolchain at runtime. The two files are kept in sync by review —
// any model added to src/media/models.ts must be added here too. Tests in
// verify ensure the arrays are non-empty and IDs are unique.

export const IMAGE_MODELS = [
  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
];

export const VIDEO_MODELS = [
  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
];

export const AUDIO_MODELS_BY_KIND = {
  music: [
    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
  ],
  speech: [
    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
  ],
  sfx: [
    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
  ],
};

export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];
export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];
export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];

/**
 * Look a model up by id across every surface.
 *
 * Search order is image, video, then the audio kinds in declaration order.
 * The list is rebuilt on each call so it always reflects the current tables.
 *
 * @param {string} id
 * @returns {{ id: string, label: string, hint: string, caps: string[] } | null}
 *   The registry entry itself (not a copy), or null when the id is unknown.
 */
export function findMediaModel(id) {
  const all = [
    ...IMAGE_MODELS,
    ...VIDEO_MODELS,
    // Every audio kind, so a kind added to the table is picked up automatically.
    ...Object.values(AUDIO_MODELS_BY_KIND).flat(),
  ];
  return all.find((m) => m.id === id) || null;
}

/**
 * List the models registered for a surface.
 *
 * @param {string} surface - 'image' | 'video' | 'audio'; anything else yields [].
 * @param {string} [audioKind] - 'music' | 'speech' | 'sfx'; only read when
 *   surface is 'audio'. Missing or unknown kinds fall back to the music list.
 * @returns {Array<{ id: string, label: string, hint: string, caps: string[] }>}
 */
export function modelsForSurface(surface, audioKind) {
  if (surface === 'image') return IMAGE_MODELS;
  if (surface === 'video') return VIDEO_MODELS;
  if (surface === 'audio') {
    const k = audioKind || 'music';
    // Own keys only: a plain `AUDIO_MODELS_BY_KIND[k]` would hand back inherited
    // members (e.g. the `toString` function) for kinds named like them.
    return Object.hasOwn(AUDIO_MODELS_BY_KIND, k)
      ? AUDIO_MODELS_BY_KIND[k]
      : AUDIO_MODELS_BY_KIND.music;
  }
  return [];
}

// ===========================================================================
// daemon/media.js — new file (index 0000000..84b921d)
// ===========================================================================
// Media-generation dispatcher. The unifying contract is:
//
//   skills + metadata + system-prompt
//        ↓   (the code agent decides what to make)
//   `od media generate --surface … --model … --output … --prompt …`
//        ↓   (this module routes to a provider)
//   bytes written to <project dir>/<output>
//        ↓
//   FileViewer renders it.
//
// Every surface (image / video / audio) flows through this single
// entrypoint. Providers are pluggable: each file under ./media-providers/
// (or inline below) registers handlers keyed by (surface, model). The
// fallback handlers emit a deterministic, lightweight placeholder
// (1×1 PNG or labelled SVG, silent WAV / near-empty MP3, empty MP4) so the
// framework works without API keys — real provider integrations slot in
// later by replacing the handler.
+ +import { mkdir, stat, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { findMediaModel } from './media-models.js'; +import { + ensureProject, + kindFor, + mimeFor, + sanitizeName, +} from './projects.js'; + +const DEFAULT_OUTPUT_BY_SURFACE = { + image: 'image.png', + video: 'video.mp4', + audio: 'audio.mp3', +}; + +const SURFACES = new Set(['image', 'video', 'audio']); + +/** + * Generate a media artifact and write it into the project's files dir. + * + * @param {Object} args + * @param {string} args.projectsRoot - Absolute path to /.od/projects. + * @param {string} args.projectId + * @param {'image'|'video'|'audio'} args.surface + * @param {string} args.model - Must be a registered model id. + * @param {string} [args.prompt] + * @param {string} [args.output] - Optional filename; auto-named if missing. + * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4 + * @param {number} [args.length] - Video length, seconds. + * @param {number} [args.duration] - Audio duration, seconds. + * @param {string} [args.voice] + * @param {string} [args.audioKind] - music | speech | sfx + * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>} + */ +export async function generateMedia(args) { + const { + projectsRoot, + projectId, + surface, + model, + prompt, + output, + aspect, + length, + duration, + voice, + audioKind, + } = args; + + if (!projectsRoot) throw new Error('projectsRoot required'); + if (typeof projectId !== 'string' || !projectId) { + throw new Error('projectId required'); + } + if (!SURFACES.has(surface)) { + throw new Error(`unsupported surface: ${surface}`); + } + if (typeof model !== 'string' || !model) { + throw new Error('model required'); + } + const def = findMediaModel(model); + if (!def) { + throw new Error( + `unknown model: ${model}. 
Pass --model from the registered list (see /api/media/models).`, + ); + } + + const dir = await ensureProject(projectsRoot, projectId); + const safeOut = sanitizeName( + output || autoOutputName(surface, model, audioKind), + ); + const target = path.join(dir, safeOut); + await mkdir(path.dirname(target), { recursive: true }); + + const ctx = { + surface, + model, + prompt: prompt || '', + aspect: aspect || defaultAspectFor(surface), + length: typeof length === 'number' ? length : undefined, + duration: typeof duration === 'number' ? duration : undefined, + voice: voice || '', + audioKind: audioKind || (surface === 'audio' ? 'music' : undefined), + }; + + let bytes; + let providerNote; + if (surface === 'image') { + ({ bytes, providerNote } = await renderImage(ctx, safeOut)); + } else if (surface === 'video') { + ({ bytes, providerNote } = await renderVideo(ctx, safeOut)); + } else { + ({ bytes, providerNote } = await renderAudio(ctx, safeOut)); + } + + await writeFile(target, bytes); + const st = await stat(target); + return { + name: safeOut, + size: st.size, + mtime: st.mtimeMs, + kind: kindFor(safeOut), + mime: mimeFor(safeOut), + model, + surface, + providerNote, + }; +} + +function autoOutputName(surface, model, audioKind) { + const base = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin'; + const stamp = Date.now().toString(36); + const tag = surface === 'audio' && audioKind ? `${audioKind}-${model}` : model; + const dot = base.lastIndexOf('.'); + const stem = dot > 0 ? base.slice(0, dot) : base; + const ext = dot > 0 ? base.slice(dot) : ''; + return `${stem}-${tag}-${stamp}${ext}`; +} + +function defaultAspectFor(surface) { + if (surface === 'image') return '1:1'; + if (surface === 'video') return '16:9'; + return undefined; +} + +// --------------------------------------------------------------------------- +// Provider stubs. +// +// Each renderer returns Buffer bytes that the caller writes to disk. 
They +// produce real, lightweight placeholder media labelled with the model + +// prompt so the user can verify which call was dispatched while the real +// provider integrations are still pending. To replace a stub with a real +// provider, swap the body — keep the (ctx, fileName) → { bytes, note } +// shape so server.js doesn't change. + +async function renderImage(ctx, fileName) { + // SVG-as-image: write SVG bytes into a .png filename only when ext is + // svg; otherwise emit a tiny PNG that browsers can decode. We pick + // PNG-as-bytes by encoding the SVG inside a minimal PNG container — + // simpler: just write SVG XML into a .png, browsers can't render that. + // So instead: for png/jpg, emit a deterministic 1×1 PNG; for svg, emit + // a labelled SVG. + const ext = path.extname(fileName).toLowerCase(); + if (ext === '.svg') { + return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' }; + } + // Minimal 1×1 transparent PNG. Real provider would emit a full image. + const png = Buffer.from( + [ + 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d, + 0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, + 0x08, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x15, 0xc4, 0x89, 0x00, 0x00, 0x00, + 0x0d, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x01, 0x0d, 0x0a, 0x2d, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x49, + 0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82, + ], + ); + return { + bytes: png, + providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`, + }; +} + +async function renderVideo(ctx, _fileName) { + // Tiny but valid mp4 (ftyp + minimal moov). Browsers without a video + // track will show 0 seconds, which is fine — this proves the dispatch + // round-trip; real Seedance/Kling/Veo providers replace this body. 
+ const ftyp = Buffer.from([ + 0x00, 0x00, 0x00, 0x18, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6f, 0x6d, + 0x00, 0x00, 0x02, 0x00, 0x69, 0x73, 0x6f, 0x6d, 0x69, 0x73, 0x6f, 0x32, + ]); + const mdat = Buffer.from([0x00, 0x00, 0x00, 0x08, 0x6d, 0x64, 0x61, 0x74]); + return { + bytes: Buffer.concat([ftyp, mdat]), + providerNote: `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${ctx.length ?? '?'}s · prompt=${truncate(ctx.prompt, 60)}`, + }; +} + +async function renderAudio(ctx, fileName) { + const ext = path.extname(fileName).toLowerCase(); + if (ext === '.wav') { + return { + bytes: silentWav(0.5), + providerNote: `stub-wav · model=${ctx.model} · kind=${ctx.audioKind} · duration=${ctx.duration ?? '?'}s`, + }; + } + // Default: emit a near-empty mp3 frame header so the file is valid but + // tiny. Browsers may report 0:00; replace with real provider output. + const mp3 = Buffer.from([ + 0xff, 0xfb, 0x90, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]); + return { + bytes: mp3, + providerNote: `stub-mp3 · model=${ctx.model} · kind=${ctx.audioKind} · voice=${ctx.voice || '-'} · duration=${ctx.duration ?? 
'?'}s`, + }; +} + +function svgPlaceholder(ctx) { + const [w, h] = aspectToBox(ctx.aspect, 800); + const safe = (s) => + String(s || '') + .replace(/&/g, '&') + .replace(//g, '>'); + return [ + ``, + ``, + `${safe(ctx.model)} — ${safe(ctx.prompt).slice(0, 60)}`, + '', + ].join(''); +} + +function aspectToBox(aspect, base) { + const [a, b] = String(aspect || '1:1').split(':').map(Number); + if (!a || !b) return [base, base]; + if (a >= b) return [base, Math.round((base * b) / a)]; + return [Math.round((base * a) / b), base]; +} + +function silentWav(seconds) { + const sampleRate = 8000; + const numSamples = Math.max(1, Math.round(sampleRate * seconds)); + const dataSize = numSamples * 2; + const buf = Buffer.alloc(44 + dataSize); + buf.write('RIFF', 0, 'ascii'); + buf.writeUInt32LE(36 + dataSize, 4); + buf.write('WAVE', 8, 'ascii'); + buf.write('fmt ', 12, 'ascii'); + buf.writeUInt32LE(16, 16); + buf.writeUInt16LE(1, 20); // PCM + buf.writeUInt16LE(1, 22); // mono + buf.writeUInt32LE(sampleRate, 24); + buf.writeUInt32LE(sampleRate * 2, 28); + buf.writeUInt16LE(2, 32); + buf.writeUInt16LE(16, 34); + buf.write('data', 36, 'ascii'); + buf.writeUInt32LE(dataSize, 40); + return buf; +} + +function truncate(s, n) { + const v = String(s || ''); + if (v.length <= n) return v; + return v.slice(0, n - 1) + '…'; +} diff --git a/daemon/projects.js b/daemon/projects.js index c1a94e5..1b093a1 100644 --- a/daemon/projects.js +++ b/daemon/projects.js @@ -156,6 +156,21 @@ const EXT_MIME = { '.gif': 'image/gif', '.webp': 'image/webp', '.avif': 'image/avif', + // Video — covered MIMEs are the formats most generators emit. Browsers + // play them via