Merge PR #12 (cursor/47ca13ab) into cursor/289994c1

Bring in the parallel media-surfaces branch from PR #12. Tree is already identical to HEAD (same od media generate work landed independently), so this is a history-only merge to consolidate the two branches.
feat(media): add image/video/audio project kinds via od media generate
2026-04-28 22:46:20 +08:00 · 2026-04-28 22:41:14 +08:00 · 2026-04-28 22:40:58 +08:00 · 2026-04-28 22:31:29 +08:00 · 2026-04-28 22:31:20 +08:00 · 2026-04-28 20:32:39 +08:00
28 changed files with 2902 additions and 78 deletions
@@ -1,24 +1,44 @@
 #!/usr/bin/env node
 import { startServer } from './server.js';
-const args = process.argv.slice(2);
+const argv = process.argv.slice(2);
 // ---- Subcommand router ----------------------------------------------------
 //
 // `od` is two CLIs glued together:
 //   - default mode: starts the daemon + opens the web UI.
 //   - `od media …`: a thin client that POSTs to the running daemon. This
 //     is what the code agent invokes from inside a chat to actually
 //     produce image / video / audio bytes (the unifying contract).
 //
 // We dispatch on the first positional argument so flags like --port keep
 // working unchanged. Subcommand routing is keyword-based; flags are
 // parsed inside each handler.
 // Subcommand name → handler. Each handler receives argv with the
 // subcommand token itself removed.
 const SUBCOMMAND_MAP = {
  media: runMedia,
 };
 const first = argv.find((a) => !a.startsWith('-'));
 // Own-property check: a bare `SUBCOMMAND_MAP[first]` lookup also resolves
 // inherited Object.prototype members, so `od constructor` or `od toString`
 // would be "dispatched" to a built-in and exit 0 instead of falling
 // through to daemon mode.
 if (first && Object.hasOwn(SUBCOMMAND_MAP, first)) {
  const idx = argv.indexOf(first);
  // Drop only the subcommand token; flags on either side of it are passed
  // through to the handler.
  const rest = [...argv.slice(0, idx), ...argv.slice(idx + 1)];
  await SUBCOMMAND_MAP[first](rest);
  process.exit(0);
 }
 // Default: daemon mode.
 let port = Number(process.env.OD_PORT) || 7456;
 let open = true;
-for (let i = 0; i < args.length; i++) {
+for (let i = 0; i < argv.length; i++) {
-  const a = args[i];
+  const a = argv[i];
  if (a === '-p' || a === '--port') {
-    port = Number(args[++i]);
+    port = Number(argv[++i]);
  } else if (a === '--no-open') {
    open = false;
  } else if (a === '-h' || a === '--help') {
-    console.log(`Usage: od [--port <n>] [--no-open]
+    printRootHelp();
 Starts a local daemon that:
  * scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
  * serves a tiny web chat UI at http://localhost:<port>
  * proxies messages (text + images) to the selected agent via child-process spawn
 `);
    process.exit(0);
  }
 }
@@ -34,3 +54,134 @@ startServer({ port }).then(url => {
    });
  }
 });
 /** Print the top-level `od` usage text (daemon mode + media subcommand) to stdout. */
 function printRootHelp() {
  const usage = `Usage:
  od [--port <n>] [--no-open]
      Start the local daemon and open the web UI.
  od media generate --surface <image|video|audio> --model <id> [opts]
      Generate a media artifact and write it into the active project.
      Designed to be invoked by a code agent — picks up OD_DAEMON_URL
      and OD_PROJECT_ID from the env that the daemon injected on spawn.
 What the daemon does:
  * scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
  * serves the chat UI at http://localhost:<port>
  * proxies messages (text + images) to the selected agent via child-process spawn
  * exposes /api/projects/:id/media/generate — the unified image/video/audio
    dispatcher that the agent calls via \`od media generate\`.`;
  console.log(usage);
 }
 // ---------------------------------------------------------------------------
 // Subcommand: od media …
 // ---------------------------------------------------------------------------
 /**
  * Handle `od media …`: validate the flags, POST the request to the running
  * daemon, and print the daemon's reply on stdout as a single line.
  *
  * Exit codes: 1 unknown subcommand, 2 missing or invalid flag, 3 daemon
  * unreachable or its reply could not be read, 4 daemon answered non-2xx.
  * Returns normally after printing help and after a successful request.
  *
  * @param {string[]} args - argv with the `media` token already removed.
  * @returns {Promise<void>}
  */
 async function runMedia(args) {
  const sub = args.find((a) => !a.startsWith('-')) || '';
  // `sub` is by construction never a dash-prefixed token, so help flags are
  // detected on the raw args. Without this, `od media generate --help`
  // fell through to the "project id required" error.
  const wantsHelp = args.includes('-h') || args.includes('--help');
  if (wantsHelp || sub === 'help' || sub === '') {
    printMediaHelp();
    return;
  }
  if (sub !== 'generate') {
    console.error(`unknown subcommand: od media ${sub}`);
    printMediaHelp();
    process.exit(1);
  }
  const idx = args.indexOf(sub);
  const flags = parseFlags([...args.slice(0, idx), ...args.slice(idx + 1)]);
  // parseFlags stores `true` for a flag that has no value after it. Reject
  // that here rather than crash on `true.replace(...)` below or ship a
  // boolean to the daemon in place of a string.
  const valueFlags = [
    'daemon-url',
    'project',
    'model',
    'prompt',
    'output',
    'aspect',
    'voice',
    'audio-kind',
    'length',
    'duration',
  ];
  const valueless = valueFlags.find((name) => flags[name] === true);
  if (valueless) {
    console.error(`--${valueless} requires a value`);
    process.exit(2);
  }
  const daemonUrl = flags['daemon-url'] || process.env.OD_DAEMON_URL || 'http://127.0.0.1:7456';
  const projectId = flags.project || process.env.OD_PROJECT_ID;
  if (!projectId) {
    console.error(
      'project id required. Pass --project <id> or set OD_PROJECT_ID. The daemon injects this when it spawns the code agent.',
    );
    process.exit(2);
  }
  const surface = flags.surface;
  if (!surface || !['image', 'video', 'audio'].includes(surface)) {
    console.error('--surface must be one of: image | video | audio');
    process.exit(2);
  }
  if (!flags.model) {
    console.error('--model required (see http://<daemon>/api/media/models)');
    process.exit(2);
  }
  const body = {
    surface,
    model: flags.model,
    prompt: flags.prompt,
    output: flags.output,
    aspect: flags.aspect,
    voice: flags.voice,
    audioKind: flags['audio-kind'],
  };
  // A non-numeric value would serialise as JSON `null` and be dropped
  // without any feedback, so fail loudly instead.
  for (const name of ['length', 'duration']) {
    if (flags[name] == null) continue;
    const seconds = Number(flags[name]);
    if (!Number.isFinite(seconds)) {
      console.error(`--${name} must be a number of seconds`);
      process.exit(2);
    }
    body[name] = seconds;
  }
  const url = `${daemonUrl.replace(/\/$/, '')}/api/projects/${encodeURIComponent(projectId)}/media/generate`;
  let resp;
  let text;
  try {
    resp = await fetch(url, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(body),
    });
    // Reading the body can also fail (connection reset mid-response), so it
    // shares the "daemon unreachable" handling.
    text = await resp.text();
  } catch (err) {
    console.error(`failed to reach daemon at ${daemonUrl}: ${err.message}`);
    process.exit(3);
  }
  if (!resp.ok) {
    console.error(`daemon ${resp.status}: ${text}`);
    process.exit(4);
  }
  // Print the JSON response as one line so the agent can parse it.
  process.stdout.write(text.trim() + '\n');
 }
 /**
  * Parse long-form CLI flags into a plain key → value map.
  *
  * Accepted forms:
  *   --key value   → { key: 'value' }  (value must not itself start with `--`)
  *   --key=value   → { key: 'value' }  (value may contain anything, including
  *                                      a leading `--` or further `=` signs)
  *   --key         → { key: true }
  * Tokens that do not start with `--` and are not consumed as a value are
  * ignored. A later occurrence of a key overwrites an earlier one.
  *
  * @param {string[]} argv
  * @returns {Object<string, string|true>}
  */
 function parseFlags(argv) {
  const out = {};
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (!a || !a.startsWith('--')) continue;
    // `--key=value`: the only way to pass a value that begins with `--`.
    // Requires a non-empty key (eq > 2); `--=x` keeps the legacy handling.
    const eq = a.indexOf('=');
    if (eq > 2) {
      out[a.slice(2, eq)] = a.slice(eq + 1);
      continue;
    }
    const key = a.slice(2);
    const next = argv[i + 1];
    if (next != null && !next.startsWith('--')) {
      out[key] = next;
      i++;
    } else {
      out[key] = true;
    }
  }
  return out;
 }
 /** Print the usage text for `od media generate` to stdout. */
 function printMediaHelp() {
  const usage = `Usage: od media generate --surface <image|video|audio> --model <id> [opts]
 Required:
  --surface  image | video | audio
  --model    Model id from /api/media/models (e.g. gpt-image-2, seedance-2, suno-v5).
  --project  Project id. Auto-resolved from OD_PROJECT_ID when invoked by the daemon.
 Common options:
  --prompt "<text>"         Generation prompt.
  --output <filename>       File to write under the project. Auto-named if omitted.
  --aspect 1:1|16:9|9:16|4:3|3:4
  --length <seconds>        Video length.
  --duration <seconds>      Audio duration.
  --voice <voice-id>        Speech / TTS voice.
  --audio-kind music|speech|sfx
  --daemon-url http://127.0.0.1:7456
 Output: a single line of JSON: {"file": { name, size, kind, mime, ... }}.
 Skills should call this and then reference the returned filename in their
 artifact / message body. The daemon writes the bytes into the project's
 files folder so the FileViewer can preview them immediately.`;
  console.log(usage);
 }
@@ -29,6 +29,11 @@ export async function listDesignSystems(root) {
        category: extractCategory(raw) ?? 'Uncategorized',
        summary: summarize(raw),
        swatches: extractSwatches(raw),
        // Optional `> Surface: image|video|audio` blockquote line. Most
        // existing systems target the web surface and don't declare it;
        // we default to 'web' so the right-side filter classifies them
        // correctly.
        surface: extractSurface(raw),
        body: raw,
      });
    } catch {
@@ -67,6 +72,14 @@ function extractCategory(raw) {
  return m?.[1];
 }
 // Surfaces a design system may declare; any other value collapses to 'web'.
 const KNOWN_SURFACES = new Set(['web', 'image', 'video', 'audio']);
 /**
  * Read the optional `> Surface: <name>` blockquote line from a design
  * system's raw markdown.
  *
  * @param {string} raw - Raw markdown text.
  * @returns {string} The lower-cased declared surface when it is one of
  *   KNOWN_SURFACES; 'web' when the line is absent or names anything else.
  */
 function extractSurface(raw) {
  const surfaceLine = /^>\s*Surface:\s*(.+?)\s*$/im;
  const found = surfaceLine.exec(raw);
  const declared = found ? found[1].trim().toLowerCase() : 'web';
  if (KNOWN_SURFACES.has(declared)) {
    return declared;
  }
  return 'web';
 }
 // Strip boilerplate like "Design System Inspired by Cohere" → "Cohere" so
 // the picker dropdown reads cleanly. Hand-authored titles that don't match
 // the pattern (e.g. "Neutral Modern") pass through unchanged.
@@ -0,0 +1,62 @@
 // Daemon-side mirror of src/media/models.ts. We keep this in plain JS so
 // node imports are native and the daemon never needs a TS toolchain at
 // runtime. The two files are kept in sync by review — any model added to
 // src/media/models.ts must be added here too. Tests in verify ensure the
 // arrays are non-empty and IDs are unique.
 export const IMAGE_MODELS = [
  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
 ];
 export const VIDEO_MODELS = [
  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
 ];
 export const AUDIO_MODELS_BY_KIND = {
  music: [
    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
  ],
  speech: [
    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
  ],
  sfx: [
    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
  ],
 };
 export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];
 export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];
 export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];
 /**
  * Look a model up by id across every registry, searched in order: image,
  * video, then audio music / speech / sfx.
  *
  * @param {string} id
  * @returns {Object|null} The first registry entry whose `id` matches, or
  *   null when no registry contains it.
  */
 export function findMediaModel(id) {
  const registries = [
    IMAGE_MODELS,
    VIDEO_MODELS,
    AUDIO_MODELS_BY_KIND.music,
    AUDIO_MODELS_BY_KIND.speech,
    AUDIO_MODELS_BY_KIND.sfx,
  ];
  for (const registry of registries) {
    const hit = registry.find((entry) => entry.id === id);
    if (hit) return hit;
  }
  return null;
 }
 /**
  * List the registered models for a surface.
  *
  * @param {string} surface - 'image', 'video' or 'audio'.
  * @param {string} [audioKind] - Audio sub-kind; only consulted when
  *   `surface` is 'audio'. Missing or unrecognised kinds fall back to music.
  * @returns {Array<Object>} The matching registry array, or an empty array
  *   for any other surface.
  */
 export function modelsForSurface(surface, audioKind) {
  if (surface === 'image') return IMAGE_MODELS;
  if (surface === 'video') return VIDEO_MODELS;
  if (surface !== 'audio') return [];
  const kind = audioKind || 'music';
  // Own-property check: a plain `AUDIO_MODELS_BY_KIND[kind] || …` lookup
  // returns inherited members for kinds such as 'constructor' or
  // '__proto__', handing callers a function/object instead of a model list.
  return Object.hasOwn(AUDIO_MODELS_BY_KIND, kind)
    ? AUDIO_MODELS_BY_KIND[kind]
    : AUDIO_MODELS_BY_KIND.music;
 }
@@ -0,0 +1,263 @@
 // Media-generation dispatcher. The unifying contract is:
 //
 //   skills + metadata + system-prompt
 //        ↓ (the code agent decides what to make)
 //   `od media generate --surface … --model … --output … --prompt …`
 //        ↓ (this module routes to a provider)
 //   bytes written to <projectsRoot>/<projectId>/<output>
 //        ↓
 //   FileViewer renders it.
 //
 // Every surface (image / video / audio) flows through this single
 // entrypoint. Providers are pluggable: each file under ./media-providers/
 // (or inline below) registers handlers keyed by (surface, model). The
 // fallback handlers emit a deterministic, lightweight placeholder
 // (labeled SVG-PNG, silent WAV/MP3, blank MP4) so the framework works
 // without API keys — real provider integrations slot in later by
 // replacing the handler.
 import { mkdir, stat, writeFile } from 'node:fs/promises';
 import path from 'node:path';
 import { findMediaModel } from './media-models.js';
 import {
  ensureProject,
  kindFor,
  mimeFor,
  sanitizeName,
 } from './projects.js';
 const DEFAULT_OUTPUT_BY_SURFACE = {
  image: 'image.png',
  video: 'video.mp4',
  audio: 'audio.mp3',
 };
 const SURFACES = new Set(['image', 'video', 'audio']);
 /**
  * Validate a media request, render it with the surface's provider, write
  * the bytes into the project's files directory and describe the result.
  *
  * Validation runs in this order and throws an Error on the first failure:
  * projectsRoot, projectId, surface, model presence, model registration.
  *
  * @param {Object} args
  * @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
  * @param {string} args.projectId
  * @param {'image'|'video'|'audio'} args.surface
  * @param {string} args.model - Must be a registered model id.
  * @param {string} [args.prompt]
  * @param {string} [args.output] - Optional filename; auto-named if missing.
  * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4
  * @param {number} [args.length] - Video length, seconds.
  * @param {number} [args.duration] - Audio duration, seconds.
  * @param {string} [args.voice]
  * @param {string} [args.audioKind] - music | speech | sfx
  * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
  */
 export async function generateMedia(args) {
  const { projectsRoot, projectId, surface, model } = args;
  const { prompt, output, aspect, length, duration, voice, audioKind } = args;

  // ---- Validation ---------------------------------------------------------
  if (!projectsRoot) throw new Error('projectsRoot required');
  const hasProjectId = typeof projectId === 'string' && projectId;
  if (!hasProjectId) {
    throw new Error('projectId required');
  }
  if (!SURFACES.has(surface)) {
    throw new Error(`unsupported surface: ${surface}`);
  }
  const hasModel = typeof model === 'string' && model;
  if (!hasModel) {
    throw new Error('model required');
  }
  if (!findMediaModel(model)) {
    throw new Error(
      `unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
    );
  }

  // ---- Resolve the destination --------------------------------------------
  const projectDir = await ensureProject(projectsRoot, projectId);
  const requestedName = output || autoOutputName(surface, model, audioKind);
  const safeOut = sanitizeName(requestedName);
  const target = path.join(projectDir, safeOut);
  await mkdir(path.dirname(target), { recursive: true });

  // ---- Render --------------------------------------------------------------
  const isAudio = surface === 'audio';
  const ctx = {
    surface,
    model,
    prompt: prompt || '',
    aspect: aspect || defaultAspectFor(surface),
    length: typeof length === 'number' ? length : undefined,
    duration: typeof duration === 'number' ? duration : undefined,
    voice: voice || '',
    audioKind: audioKind || (isAudio ? 'music' : undefined),
  };
  // `surface` was checked against SURFACES above, so anything that is not
  // image or video is audio.
  let render = renderAudio;
  if (surface === 'image') {
    render = renderImage;
  } else if (surface === 'video') {
    render = renderVideo;
  }
  const { bytes, providerNote } = await render(ctx, safeOut);

  // ---- Persist and describe -----------------------------------------------
  await writeFile(target, bytes);
  const written = await stat(target);
  return {
    name: safeOut,
    size: written.size,
    mtime: written.mtimeMs,
    kind: kindFor(safeOut),
    mime: mimeFor(safeOut),
    model,
    surface,
    providerNote,
  };
 }
 /**
  * Build a default output filename of the form `<stem>-<tag>-<stamp><ext>`.
  *
  * The stem and extension come from the surface's entry in
  * DEFAULT_OUTPUT_BY_SURFACE ('artifact.bin' when the surface has none), the
  * tag is the model id (prefixed with the audio kind for audio requests that
  * carry one), and the stamp is the current epoch milliseconds in base 36.
  *
  * @param {string} surface
  * @param {string} model
  * @param {string} [audioKind]
  * @returns {string}
  */
 function autoOutputName(surface, model, audioKind) {
  const template = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin';
  const stamp = Date.now().toString(36);

  let tag = model;
  if (surface === 'audio' && audioKind) {
    tag = `${audioKind}-${model}`;
  }

  // A dot at index 0 is not treated as an extension separator.
  const cut = template.lastIndexOf('.');
  let stem = template;
  let ext = '';
  if (cut > 0) {
    stem = template.slice(0, cut);
    ext = template.slice(cut);
  }
  return `${stem}-${tag}-${stamp}${ext}`;
 }
 /**
  * Default aspect ratio for a surface when the request does not set one.
  *
  * @param {string} surface
  * @returns {string|undefined} '1:1' for image, '16:9' for video, and
  *   undefined for every other surface.
  */
 function defaultAspectFor(surface) {
  switch (surface) {
    case 'image':
      return '1:1';
    case 'video':
      return '16:9';
    default:
      return undefined;
  }
 }
 // ---------------------------------------------------------------------------
 // Provider stubs.
 //
 // Each renderer returns Buffer bytes that the caller writes to disk, plus
 // a providerNote naming the stub that ran. The bytes are lightweight
 // placeholders (only the SVG variant embeds the model + prompt in the
 // media itself), so the note is how the user can verify which call was
 // dispatched while the real provider integrations are still pending. To
 // replace a stub with a real provider, swap the body — keep the
 // (ctx, fileName) → { bytes, providerNote } shape so generateMedia
 // doesn't change.
 /**
  * Placeholder image provider.
  *
  * For a `.svg` output name it emits the labelled SVG from svgPlaceholder;
  * for every other extension it emits a fixed 1×1 transparent PNG.
  *
  * @param {Object} ctx - Render context built by generateMedia; this stub
  *   reads `model`, `aspect` and `prompt`.
  * @param {string} fileName - Sanitised output filename; only its extension
  *   is inspected.
  * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
  */
 async function renderImage(ctx, fileName) {
  const ext = path.extname(fileName).toLowerCase();
  if (ext === '.svg') {
    return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
  }
  // Minimal 1×1 transparent RGBA PNG (67 bytes): signature, IHDR, IDAT,
  // IEND. The IDAT length field must be 0x0a: the chunk carries exactly 10
  // bytes of zlib data before its 4-byte CRC. Declaring 0x0d there made a
  // chunk walker run past the CRC and into the IEND chunk, so the file was
  // not a well-formed PNG. Real provider would emit a full image.
  const png = Buffer.from(
    [
      // signature + IHDR length (13)
      0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d,
      // 'IHDR', width 1, height 1
      0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
      // 8-bit RGBA, IHDR CRC, first three bytes of the IDAT length
      0x08, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x15, 0xc4, 0x89, 0x00, 0x00, 0x00,
      // IDAT length low byte (10), 'IDAT', zlib stream…
      0x0a, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00,
      // …zlib stream tail, IDAT CRC, IEND length (0), 'I'
      0x05, 0x00, 0x01, 0x0d, 0x0a, 0x2d, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x49,
      // 'END', IEND CRC
      0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82,
    ],
  );
  return {
    bytes: png,
    providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
  };
 }
 /**
  * Placeholder video provider.
  *
  * Emits a 32-byte MP4 skeleton: a 24-byte `ftyp` box followed by an empty
  * 8-byte `mdat` box. There is no `moov` box and no media track; the file
  * only proves the dispatch round-trip until a real provider replaces this
  * body.
  *
  * @param {Object} ctx - Render context; this stub reads `model`, `aspect`,
  *   `length` and `prompt` for the note only.
  * @param {string} _fileName - Unused.
  * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
  */
 async function renderVideo(ctx, _fileName) {
  // size=0x18 'ftyp' major='isom' minor=0x200 compatible='isom','iso2'
  const ftypBox = Buffer.from('000000186674797069736f6d0000020069736f6d69736f32', 'hex');
  // size=0x08 'mdat' (no payload)
  const mdatBox = Buffer.from('000000086d646174', 'hex');
  const bytes = Buffer.concat([ftypBox, mdatBox]);
  const providerNote = `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${ctx.length ?? '?'}s · prompt=${truncate(ctx.prompt, 60)}`;
  return { bytes, providerNote };
 }
 /**
  * Placeholder audio provider.
  *
  * A `.wav` output name gets half a second of silence from silentWav; every
  * other extension gets a fixed 12-byte buffer that starts with an MP3 frame
  * header. The requested duration appears only in the note, not in the
  * audio itself.
  *
  * @param {Object} ctx - Render context; this stub reads `model`,
  *   `audioKind`, `voice` and `duration` for the note only.
  * @param {string} fileName - Sanitised output filename; only its extension
  *   is inspected.
  * @returns {Promise<{ bytes: Buffer, providerNote: string }>}
  */
 async function renderAudio(ctx, fileName) {
  const wantsWav = path.extname(fileName).toLowerCase() === '.wav';
  if (!wantsWav) {
    // Default: a near-empty mp3 frame header so the file stays tiny.
    // Browsers may report 0:00; replace with real provider output.
    const frameHeader = [0xff, 0xfb, 0x90, 0x44];
    const padding = new Array(8).fill(0x00);
    return {
      bytes: Buffer.from([...frameHeader, ...padding]),
      providerNote: `stub-mp3 · model=${ctx.model} · kind=${ctx.audioKind} · voice=${ctx.voice || '-'} · duration=${ctx.duration ?? '?'}s`,
    };
  }
  return {
    bytes: silentWav(0.5),
    providerNote: `stub-wav · model=${ctx.model} · kind=${ctx.audioKind} · duration=${ctx.duration ?? '?'}s`,
  };
 }
 /**
  * Build a labelled placeholder SVG sized from the requested aspect ratio.
  *
  * The label is `<model> — <first 60 characters of the prompt>`, with `&`,
  * `<` and `>` escaped for use as XML text content.
  *
  * @param {Object} ctx - Render context; reads `aspect`, `model`, `prompt`.
  * @returns {string} SVG markup.
  */
 function svgPlaceholder(ctx) {
  const [w, h] = aspectToBox(ctx.aspect, 800);
  const escapeXml = (s) =>
    String(s || '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  // Truncate the raw prompt BEFORE escaping. Slicing the escaped string
  // could cut an entity in half (e.g. "&amp;" → "&a"), which is invalid XML
  // and stops the SVG from rendering.
  const promptLabel = escapeXml(String(ctx.prompt || '').slice(0, 60));
  return [
    `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
    `<rect width="${w}" height="${h}" fill="#0f1424"/>`,
    `<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${escapeXml(ctx.model)} — ${promptLabel}</text>`,
    '</svg>',
  ].join('');
 }
 /**
  * Convert a "W:H" aspect string into pixel dimensions whose longer side
  * equals `base`.
  *
  * @param {string} [aspect] - Ratio such as '16:9'; defaults to '1:1'.
  * @param {number} base - Length of the longer side.
  * @returns {[number, number]} `[width, height]`. Falls back to a
  *   `base`×`base` square when either part is missing, zero, negative or
  *   not a finite number.
  */
 function aspectToBox(aspect, base) {
  const [a, b] = String(aspect || '1:1').split(':').map(Number);
  // Negative or infinite parts pass a plain truthiness check and would
  // produce negative or zero dimensions, so require finite positives.
  const usable = Number.isFinite(a) && Number.isFinite(b) && a > 0 && b > 0;
  if (!usable) return [base, base];
  if (a >= b) return [base, Math.round((base * b) / a)];
  return [Math.round((base * a) / b), base];
 }
 /**
  * Build a silent 8 kHz, mono, 16-bit PCM WAV file in memory.
  *
  * @param {number} seconds - Requested length; at least one sample is
  *   always written.
  * @returns {Buffer} 44-byte RIFF/WAVE header followed by zeroed samples.
  */
 function silentWav(seconds) {
  const sampleRate = 8000;
  const bytesPerSample = 2;
  const sampleCount = Math.max(1, Math.round(sampleRate * seconds));
  const dataSize = sampleCount * bytesPerSample;
  const headerSize = 44;
  // Buffer.alloc zero-fills, which is what makes the samples silent.
  const wav = Buffer.alloc(headerSize + dataSize);

  // Chunk tags.
  const tags = [
    ['RIFF', 0],
    ['WAVE', 8],
    ['fmt ', 12],
    ['data', 36],
  ];
  for (const [tag, at] of tags) {
    wav.write(tag, at, 'ascii');
  }

  // 32-bit little-endian fields.
  wav.writeUInt32LE(36 + dataSize, 4); // RIFF chunk size
  wav.writeUInt32LE(16, 16); // fmt chunk size
  wav.writeUInt32LE(sampleRate, 24);
  wav.writeUInt32LE(sampleRate * bytesPerSample, 28); // byte rate
  wav.writeUInt32LE(dataSize, 40);

  // 16-bit little-endian fields.
  wav.writeUInt16LE(1, 20); // PCM
  wav.writeUInt16LE(1, 22); // mono
  wav.writeUInt16LE(bytesPerSample, 32); // block align
  wav.writeUInt16LE(16, 34); // bits per sample
  return wav;
 }
 /**
  * Shorten a value's string form to at most `n` characters, ending with an
  * ellipsis when anything was cut. Falsy inputs become the empty string.
  *
  * @param {*} s
  * @param {number} n - Maximum length, ellipsis included.
  * @returns {string}
  */
 function truncate(s, n) {
  const text = String(s || '');
  return text.length > n ? `${text.slice(0, n - 1)}…` : text;
 }
@@ -156,6 +156,21 @@ const EXT_MIME = {
  '.gif': 'image/gif',
  '.webp': 'image/webp',
  '.avif': 'image/avif',
  // Video — covered MIMEs are the formats most generators emit. Browsers
  // play them via <video> / <audio> in the FileViewer with no transcode.
  '.mp4': 'video/mp4',
  '.m4v': 'video/mp4',
  '.webm': 'video/webm',
  '.mov': 'video/quicktime',
  // Audio — music / TTS generators commonly produce mp3 / wav / ogg /
  // m4a; flac is rarer but cheap to support.
  '.mp3': 'audio/mpeg',
  '.wav': 'audio/wav',
  '.ogg': 'audio/ogg',
  '.oga': 'audio/ogg',
  '.m4a': 'audio/mp4',
  '.flac': 'audio/flac',
  '.aac': 'audio/aac',
 };
 export function mimeFor(name) {
@@ -175,6 +190,10 @@ export function kindFor(name) {
    if (name.startsWith('sketch-')) return 'sketch';
    return 'image';
  }
  if (['.mp4', '.m4v', '.webm', '.mov'].includes(ext)) return 'video';
  if (['.mp3', '.wav', '.ogg', '.oga', '.m4a', '.flac', '.aac'].includes(ext)) {
    return 'audio';
  }
  if (['.md', '.txt'].includes(ext)) return 'text';
  if (['.js', '.mjs', '.cjs', '.ts', '.tsx', '.json', '.css'].includes(ext)) {
    return 'code';
@@ -22,6 +22,15 @@ import {
  sanitizeName,
  writeProjectFile,
 } from './projects.js';
 import { generateMedia } from './media.js';
 import {
  AUDIO_MODELS_BY_KIND,
  IMAGE_MODELS,
  VIDEO_MODELS,
  MEDIA_ASPECTS,
  VIDEO_LENGTHS_SEC,
  AUDIO_DURATIONS_SEC,
 } from './media-models.js';
 import {
  deleteConversation,
  deleteProject as dbDeleteProject,
@@ -50,6 +59,10 @@ const PROJECT_ROOT = path.resolve(__dirname, '..');
 const STATIC_DIR = path.join(PROJECT_ROOT, 'dist');
 const SKILLS_DIR = path.join(PROJECT_ROOT, 'skills');
 const DESIGN_SYSTEMS_DIR = path.join(PROJECT_ROOT, 'design-systems');
 // Absolute path to the daemon CLI entry. We inject this into the spawned
 // agent's env as OD_BIN so the agent can run `node "$OD_BIN" media generate …`
 // regardless of whether the user has `od` on PATH.
 const OD_BIN_PATH = path.join(__dirname, 'cli.js');
 const ARTIFACTS_DIR = path.join(PROJECT_ROOT, '.od', 'artifacts');
 const PROJECTS_DIR = path.join(PROJECT_ROOT, '.od', 'projects');
 fs.mkdirSync(PROJECTS_DIR, { recursive: true });
@@ -650,6 +663,56 @@ export async function startServer({ port = 7456 } = {}) {
    }
  });
  // ---- Media generation -----------------------------------------------------
  //
  // Surface-agnostic media dispatcher. The code agent reaches this via
  // `od media generate` (see daemon/cli.js media subcommand), which is
  // the unified contract: skills + metadata + system-prompt instruct the
  // agent on WHAT to produce, the agent invokes ONE entrypoint that
  // dispatches per (surface, model) and writes the bytes into the project.
  // The shape of the response matches POST /api/projects/:id/files so the
  // frontend can refresh the file list with the same code path.
  app.get('/api/media/models', (_req, res) => {
    res.json({
      image: IMAGE_MODELS,
      video: VIDEO_MODELS,
      audio: AUDIO_MODELS_BY_KIND,
      aspects: MEDIA_ASPECTS,
      videoLengthsSec: VIDEO_LENGTHS_SEC,
      audioDurationsSec: AUDIO_DURATIONS_SEC,
    });
  });
  // POST /api/projects/:id/media/generate
  //
  // Forwards the JSON body's fields to generateMedia for the project named
  // in the URL. Responses:
  //   200 { file: <metadata returned by generateMedia> }
  //   404 { error: 'project not found' } when getProject returns nothing
  //   400 { error: <message> } for anything thrown inside the try block
  // NOTE(review): the 400 covers every thrown error, so a failure while
  // writing the file is reported the same way as a bad request — confirm
  // whether that is intended.
  app.post('/api/projects/:id/media/generate', async (req, res) => {
    try {
      const projectId = req.params.id;
      // Ensure the project exists in DB before writing files; this gives
      // a friendly 404 when the agent calls with a bad id. The agent
      // normally inherits OD_PROJECT_ID from spawn env so this should
      // always resolve.
      const project = getProject(db, projectId);
      if (!project) return res.status(404).json({ error: 'project not found' });
      const meta = await generateMedia({
        projectsRoot: PROJECTS_DIR,
        projectId,
        surface: req.body?.surface,
        model: req.body?.model,
        prompt: req.body?.prompt,
        output: req.body?.output,
        aspect: req.body?.aspect,
        // length / duration are forwarded only when they arrive as JSON
        // numbers; any other type is passed on as undefined.
        length: typeof req.body?.length === 'number' ? req.body.length : undefined,
        duration:
          typeof req.body?.duration === 'number' ? req.body.duration : undefined,
        voice: req.body?.voice,
        audioKind: req.body?.audioKind,
      });
      res.json({ file: meta });
    } catch (err) {
      // Use the Error's message when there is one, else the thrown value's
      // string form.
      res.status(400).json({ error: String(err && err.message ? err.message : err) });
    }
  });
  // Multi-file upload that the chat composer uses for paste/drop/picker.
  // Files land flat in the project folder; the response carries the same
  // metadata as listFiles so the client can stage them as ChatAttachments
@@ -800,10 +863,20 @@ export async function startServer({ port = 7456 } = {}) {
      cwd,
    });
    // Inject the OD context. Skills + the media-contract prompt tell the
    // agent how to spend this — call `node "$OD_BIN" media generate
    // --project "$OD_PROJECT_ID" …` and the daemon dispatches.
    const odEnv = {
      OD_BIN: OD_BIN_PATH,
      OD_DAEMON_URL: `http://127.0.0.1:${port}`,
      OD_PROJECT_ID: typeof projectId === 'string' ? projectId : '',
      OD_PROJECT_DIR: cwd || '',
    };
    let child;
    try {
      child = spawn(def.bin, args, {
-        env: { ...process.env },
+        env: { ...process.env, ...odEnv },
        stdio: ['ignore', 'pipe', 'pipe'],
        cwd: cwd || undefined,
      });
@@ -25,12 +25,16 @@ export async function listSkills(skillsRoot) {
      const { data, body } = parseFrontmatter(raw);
      const hasAttachments = await dirHasAttachments(dir);
      const mode = data.od?.mode || inferMode(body, data.description);
      const surface = normalizeSurface(data.od?.surface, mode);
      out.push({
        id: data.name || entry.name,
        name: data.name || entry.name,
        description: data.description || "",
        triggers: Array.isArray(data.triggers) ? data.triggers : [],
        mode,
        // Surface defaults to inferring from `mode` so legacy SKILL.md
        // files (no `od.surface` declared) keep classifying correctly.
        surface,
        platform: normalizePlatform(
          data.od?.platform,
          mode,
@@ -159,6 +163,20 @@ function inferMode(body, description) {
  return "prototype";
 }
 // Surface is the high-level output bucket — web, image, video or audio.
 // Authors can pin it via `od.surface`; otherwise we derive from `mode`,
 // then fall back to the safe default ('web') so existing skills classify
 // unchanged.
 // Surfaces a skill may pin through `od.surface`.
 const KNOWN_SURFACES = new Set(["web", "image", "video", "audio"]);
 /**
  * Resolve a skill's surface.
  *
  * Precedence: an explicit string value that names a known surface
  * (trimmed, case-insensitive); otherwise the mode when it is itself
  * "image", "video" or "audio"; otherwise "web".
  *
  * @param {*} value - Raw `od.surface` value from the frontmatter.
  * @param {string} mode - The skill's resolved mode.
  * @returns {string}
  */
 function normalizeSurface(value, mode) {
  const pinned = typeof value === "string" ? value.trim().toLowerCase() : null;
  if (pinned !== null && KNOWN_SURFACES.has(pinned)) {
    return pinned;
  }
  switch (mode) {
    case "image":
    case "video":
    case "audio":
      return mode;
    default:
      return "web";
  }
 }
 // Validate platform tag — only desktop / mobile are meaningful for the
 // Examples gallery. Falls back to autodetecting "mobile" from descriptions
 // so legacy skills sort under the right pill without authoring changes.
@@ -0,0 +1,121 @@
 ---
 name: audio-jingle
 description: |
  Audio generation skill — jingles, beds, voiceover, and sound effects.
  Routes music requests to Suno V5 / Udio / Lyria, speech to MiniMax
  TTS / FishAudio / ElevenLabs V3, and SFX to ElevenLabs SFX or
  AudioCraft. Output is one MP3/WAV file saved to the project folder.
 triggers:
  - "music"
  - "jingle"
  - "bed"
  - "voiceover"
  - "tts"
  - "sound effect"
  - "音乐"
  - "配音"
  - "音效"
 od:
  mode: audio
  surface: audio
  scenario: marketing
  preview:
    type: html
    entry: example.html
  design_system:
    requires: false
  example_prompt: |
    A 30-second upbeat indie-pop jingle for a coffee shop launch — warm
    electric piano lead, brushed drums, gentle bass, a single sun-soaked
    "ahhh" choir on the chorus. No vocals. Loop-friendly tail.
 ---
 # Audio Jingle Skill
 Three sub-modes. The active project's `audioKind` decides which one
 runs:
 | `audioKind` | Models we route to | Plan focus |
 |---|---|---|
 | `music` | Suno V5 (default), Udio, Lyria 2 | genre + tempo + instrumentation |
 | `speech` | MiniMax TTS (default), Fish, ElevenLabs V3 | script + voice + pacing |
 | `sfx` | ElevenLabs SFX (default), AudioCraft | texture + impact + duration |
 ## Resource map
 ```
 audio-jingle/
 ├── SKILL.md
 └── example.html
 ```
 ## Workflow
 ### Step 0 — Read the project metadata
 Read `audioKind`, `audioModel`, `audioDuration` (seconds), and (for
 speech) `voice` from the project metadata. Branch on `audioKind` and use
 the values verbatim — do not show a clarifying form unless a value is
 marked `(unknown — ask)`.
 ### Step 1 — Plan
 **Music**
 - Genre + reference artists (1-2)
 - Tempo (BPM) + key
 - Instrumentation (3-5 instruments max)
 - Vocals: yes / no / hummed / choir
 - Mood arc (intro → chorus → outro)
 **Speech**
 - Script (final, not draft — TTS runs verbatim)
 - Voice description (warmth, age, accent, pacing)
 - Pronunciation hints for proper nouns / acronyms
 **SFX**
 - Texture (impact / whoosh / ambience / foley)
 - Duration + envelope (sharp attack vs. gentle swell)
 - Layering note (single hit vs. stacked)
 State the plan in 2-3 sentences before dispatching.
 ### Step 2 — Compose the prompt
 Use the format the upstream model prefers. Bind `audioDuration` to the
 API parameter directly; never put "make it 30 seconds" in prose.
 ### Step 3 — Dispatch via the media contract
 Use the unified dispatcher — do **not** call provider APIs by hand:
 ```bash
 node "$OD_BIN" media generate \
  --project "$OD_PROJECT_ID" \
  --surface audio \
  --audio-kind "<music|speech|sfx>" \
  --model "<audioModel from metadata>" \
  --duration <audioDuration seconds> \
  --voice "<voice (speech only)>" \
  --output "<short-slug>-<duration>s.mp3" \
  --prompt "<assembled prompt from Step 2 — for speech, the literal script>"
 ```
 The command prints one line of JSON: `{"file": {"name": "...", ...}}`.
 The bytes land in the project; the FileViewer renders the audio
 transport controls automatically.
 ### Step 4 — Hand off
 Reply with: plan summary, the filename returned by the dispatcher, and
 one sentence on what to try if the user wants a variation (e.g. "swap
 tempo from 92 to 108 BPM" rather than "make it different").
 ## Hard rules
 - TTS runs your script **literally**. Proof it before dispatching —
  even one stray comma changes the cadence.
 - Music: under 30s = single section; 30–90s = intro + body; 90s+ =
  full arc. Don't try to fit a 3-act song into 15 seconds.
 - SFX: prefer one well-described layer over a paragraph of "make it
  cool" — generators reward specific texture words.
 - Save the file every turn. The audio viewer shows transport controls
  the moment the file lands.
@@ -0,0 +1,128 @@
 <!doctype html>
 <html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Audio jingle — example</title>
    <style>
      :root {
        --bg: #f5efe5;
        --panel: #ffffff;
        --ink: #1c1b1a;
        --muted: #8b8579;
        --accent: #c96442;
        --grid: #e6dfd1;
      }
      * { box-sizing: border-box; }
      html, body { margin: 0; padding: 0; background: var(--bg); color: var(--ink);
        font-family: 'Iowan Old Style', 'Charter', Georgia, serif; }
      body { min-height: 100dvh; display: grid; place-items: center; padding: 32px; }
      .card {
        width: min(640px, 92vw);
        background: var(--panel);
        border-radius: 8px;
        padding: 26px 28px 22px;
        box-shadow: 0 16px 40px rgba(28,27,26,0.10), 0 1px 2px rgba(28,27,26,0.05);
        border: 1px solid rgba(28,27,26,0.06);
      }
      .row1 { display: flex; align-items: center; gap: 14px; margin-bottom: 18px; }
      .icon {
        width: 44px; height: 44px; border-radius: 50%;
        background: var(--accent); color: #fff;
        display: grid; place-items: center;
        box-shadow: 0 6px 18px rgba(201, 100, 66, 0.35);
      }
      .icon svg { width: 22px; height: 22px; }
      .title { margin: 0; font-size: 20px; line-height: 1.2; }
      .sub { font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 11px; color: var(--muted); letter-spacing: 0.14em; text-transform: uppercase; margin-top: 2px; }
      .wave {
        display: flex; align-items: end; gap: 3px;
        height: 96px; padding: 0 4px;
        border-top: 1px dashed var(--grid);
        border-bottom: 1px dashed var(--grid);
      }
      .wave span {
        flex: 1; background: linear-gradient(180deg, var(--accent), #a4502f);
        border-radius: 2px;
        animation: bob 2s ease-in-out infinite;
        animation-delay: var(--d, 0s);
      }
      @keyframes bob {
        0%, 100% { height: var(--h, 30%); }
        50% { height: calc(var(--h, 30%) * 1.6); }
      }
      .transport {
        margin-top: 14px;
        display: grid; grid-template-columns: auto 1fr auto auto; gap: 12px;
        align-items: center;
      }
      .play {
        width: 36px; height: 36px; border-radius: 50%;
        background: var(--ink); color: #fff;
        display: grid; place-items: center;
      }
      .timeline {
        height: 4px; border-radius: 2px;
        background: linear-gradient(90deg, var(--accent) 0 32%, var(--grid) 32% 100%);
      }
      .time {
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 11px; color: var(--muted);
        letter-spacing: 0.08em;
      }
      .badge {
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 10px; color: var(--accent);
        letter-spacing: 0.18em; text-transform: uppercase;
        padding: 4px 8px; border-radius: 999px;
        background: rgba(201, 100, 66, 0.1);
      }
    </style>
  </head>
  <body>
    <div class="card">
      <div class="row1">
        <div class="icon" aria-hidden="true">
          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M9 18V5l12-2v13"/><circle cx="6" cy="18" r="3"/><circle cx="18" cy="16" r="3"/></svg>
        </div>
        <div>
          <h1 class="title">A 30s coffee-shop launch jingle.</h1>
          <div class="sub">suno-v5 · 92 BPM · loop-friendly tail</div>
        </div>
      </div>
      <div class="wave" aria-hidden="true">
        <span style="--h:24%;--d:0s"></span>
        <span style="--h:38%;--d:.05s"></span>
        <span style="--h:52%;--d:.1s"></span>
        <span style="--h:64%;--d:.15s"></span>
        <span style="--h:48%;--d:.2s"></span>
        <span style="--h:70%;--d:.25s"></span>
        <span style="--h:42%;--d:.3s"></span>
        <span style="--h:58%;--d:.35s"></span>
        <span style="--h:36%;--d:.4s"></span>
        <span style="--h:62%;--d:.45s"></span>
        <span style="--h:26%;--d:.5s"></span>
        <span style="--h:50%;--d:.55s"></span>
        <span style="--h:34%;--d:.6s"></span>
        <span style="--h:46%;--d:.65s"></span>
        <span style="--h:58%;--d:.7s"></span>
        <span style="--h:30%;--d:.75s"></span>
        <span style="--h:44%;--d:.8s"></span>
        <span style="--h:54%;--d:.85s"></span>
        <span style="--h:28%;--d:.9s"></span>
        <span style="--h:48%;--d:.95s"></span>
      </div>
      <div class="transport">
        <div class="play" aria-hidden="true">
          <svg viewBox="0 0 24 24" width="14" height="14" fill="currentColor"><path d="M6 4v16l14-8z"/></svg>
        </div>
        <div class="timeline" aria-hidden="true"></div>
        <span class="time">00:09 / 00:30</span>
        <span class="badge">MP3</span>
      </div>
    </div>
  </body>
 </html>
@@ -0,0 +1,104 @@
 ---
 name: image-poster
 description: |
  Single-image generation skill for posters, key art, and editorial
  illustrations. Defaults to gpt-image-2 but is provider-agnostic — the
  same workflow drives Flux, Imagen, or Midjourney via the active
  upstream tooling. Output is one or more PNG/JPEG files saved to the
  project folder.
 triggers:
  - "poster"
  - "key art"
  - "illustration"
  - "image"
  - "cover art"
  - "海报"
  - "插画"
 od:
  mode: image
  surface: image
  scenario: design
  preview:
    type: html
    entry: example.html
  design_system:
    requires: false
  example_prompt: |
    Editorial poster for an indie film festival — one bold abstract
    silhouette over a warm, slightly grainy paper background; hand-set
    sans serif title at the top, festival dates and venue at the bottom
    in monospace. Muted ochre + ink palette.
 ---
 # Image Poster Skill
 Produce **one** finished image asset per turn unless the user asks for
 variations. Image generation rewards a tight, structured prompt — your
 job is to assemble that prompt from the user's brief, then dispatch.
 ## Resource map
 ```
 image-poster/
 ├── SKILL.md         ← you're reading this
 └── example.html     ← what the resulting card looks like in Examples
 ```
 ## Workflow
 ### Step 0 — Read the project metadata
 The active project carries `imageModel`, `imageAspect`, and (optional)
 `imageStyle` notes. Use them as the upstream model + canvas + style
 anchor; only ask the user to fill them in if they're marked `(unknown
 — ask)`.
 ### Step 1 — Compose the prompt
 Plan in this exact order before calling any tool:
 1. **Subject + composition** — what is in the frame, where, at what
   scale; eye-line and crop.
 2. **Lighting + mood** — natural / studio / moody; warm / cool; key
   plus rim plus fill; time of day if outdoor.
 3. **Palette + textures** — hex anchors when the user gave a brand
   palette; otherwise a 3-word mood tag (e.g. "muted ochre + ink").
 4. **Camera / lens** — only if the user wants photographic realism
   ("85mm portrait, shallow DOF") or a specific film stock.
 5. **What to avoid** — common AI-slop patterns ("no extra fingers, no
   warped text, no logo placeholders").
 ### Step 2 — Dispatch via the media contract
 Use the unified dispatcher — do **not** call upstream provider APIs by
 hand. Run from your shell tool:
 ```bash
 node "$OD_BIN" media generate \
  --project "$OD_PROJECT_ID" \
  --surface image \
  --model "<imageModel from metadata>" \
  --aspect "<imageAspect from metadata>" \
  --output "<short-descriptive-name>.png" \
  --prompt "<the full assembled prompt from Step 1>"
 ```
 The command prints one line of JSON: `{"file": {"name": "...", ...}}`.
 The daemon writes the bytes into the project folder; the FileViewer
 picks it up automatically.
 ### Step 3 — Hand off
 Reply with a one-paragraph summary of the prompt you used and the
 filename returned by the dispatcher (e.g. *I generated `hero-poster.png`
 with `gpt-image-2` at 1:1.*). Do **not** emit an `<artifact>` tag.
 ## Hard rules
 - One image per turn unless asked for variations.
 - Honor `imageAspect` exactly — the upstream cost is the same; matching
  the aspect avoids a re-render.
 - No filler typography in the image itself unless the user asked for
  in-frame text. Real copy beats lorem.
 - Save every render — never describe an image without producing the
  file. The user expects something to open in the file viewer.
@@ -0,0 +1,113 @@
 <!doctype html>
 <html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Image poster — example</title>
    <style>
      :root {
        --bg: #f5efe5;
        --ink: #1c1b1a;
        --accent: #c96442;
        --muted: #8b8579;
        --paper: #efe7d7;
      }
      * { box-sizing: border-box; }
      html, body { margin: 0; padding: 0; background: var(--bg); color: var(--ink);
        font-family: 'Iowan Old Style', 'Charter', Georgia, serif; }
      body { min-height: 100dvh; display: grid; place-items: center; padding: 32px; }
      .poster {
        width: min(640px, 92vw);
        aspect-ratio: 3 / 4;
        background: var(--paper);
        border: 1px solid rgba(28, 27, 26, 0.08);
        border-radius: 6px;
        box-shadow: 0 16px 48px rgba(28, 27, 26, 0.12), 0 1px 2px rgba(28, 27, 26, 0.06);
        display: grid;
        grid-template-rows: auto 1fr auto;
        padding: 38px 32px;
        position: relative;
        overflow: hidden;
      }
      .poster::after {
        content: '';
        position: absolute; inset: 0;
        pointer-events: none;
        background:
          radial-gradient(circle at 30% 18%, rgba(255,255,255,0.7), transparent 60%),
          repeating-linear-gradient(0deg, rgba(28,27,26,0.025) 0 1px, transparent 1px 2px);
      }
      .eyebrow {
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 11px;
        letter-spacing: 0.18em;
        text-transform: uppercase;
        color: var(--muted);
        display: flex;
        justify-content: space-between;
        align-items: center;
      }
      .accent-dot {
        width: 8px; height: 8px; border-radius: 50%;
        background: var(--accent);
      }
      .silhouette {
        align-self: center;
        justify-self: center;
        width: 70%;
        aspect-ratio: 1 / 1;
        position: relative;
      }
      .silhouette svg { width: 100%; height: 100%; display: block; }
      .meta {
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 10.5px;
        letter-spacing: 0.14em;
        text-transform: uppercase;
        color: var(--muted);
        display: grid;
        grid-template-columns: 1fr auto 1fr;
        gap: 12px;
        align-items: end;
      }
      .meta strong { color: var(--ink); font-weight: 600; }
      .title {
        font-size: 44px;
        line-height: 0.95;
        margin: 18px 0 0;
        letter-spacing: -0.01em;
      }
      .title em { font-style: italic; color: var(--accent); }
      .footer {
        margin-top: 12px;
        font-size: 13px;
        color: var(--muted);
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
      }
    </style>
  </head>
  <body>
    <div class="poster">
      <div class="eyebrow">
        <span>Open Design · Image</span>
        <span class="accent-dot" aria-hidden="true"></span>
      </div>
      <div class="silhouette" aria-hidden="true">
        <svg viewBox="0 0 100 100">
          <circle cx="50" cy="38" r="18" fill="#1c1b1a" />
          <path d="M22 100 C 22 70, 78 70, 78 100 Z" fill="#1c1b1a" />
          <circle cx="68" cy="22" r="6" fill="#c96442" />
        </svg>
      </div>
      <div>
        <h1 class="title">An <em>image</em> project<br />produced by the agent.</h1>
        <div class="meta">
          <span><strong>gpt-image-2</strong></span>
          <span>·</span>
          <span style="text-align:right">3:4 · poster</span>
        </div>
        <p class="footer">Saved as PNG into the project folder.</p>
      </div>
    </div>
  </body>
 </html>
@@ -0,0 +1,108 @@
 ---
 name: video-shortform
 description: |
  Short-form video generation skill — 3-10 second clips for product
  reveals, motion teasers, ambient loops. Defaults to Seedance 2 but
  works the same with Kling 3 / 4, Veo 3 or Sora 2. Output is one MP4
  saved to the project folder. When the workspace also ships an
  interactive-video / hyperframes skill, prefer composing several short
  shots into a single timeline rather than one long monolithic clip.
 triggers:
  - "video"
  - "clip"
  - "shortform"
  - "reel"
  - "短视频"
  - "动效"
 od:
  mode: video
  surface: video
  scenario: marketing
  preview:
    type: html
    entry: example.html
  design_system:
    requires: false
  example_prompt: |
    5-second product reveal — ceramic coffee mug rotating on a soft
    paper backdrop, warm side-light from camera-left, micro dust motes
    drifting through the beam. Cinematic, 16:9, slow drift on the camera.
 ---
 # Video Shortform Skill
 Short-form (≤ 10s) is the sweet spot for current text-to-video models —
 they're great at one **shot** with one **idea**, weaker at multi-cut
 narratives. Plan one shot per call.
 ## Resource map
 ```
 video-shortform/
 ├── SKILL.md
 └── example.html
 ```
 ## Workflow
 ### Step 0 — Read the project metadata
 `videoModel`, `videoLength` (seconds), `videoAspect`. These are
 hard-locks — clamp the prompt to whatever the chosen model supports
 (Seedance 2 caps at 10s; Kling 4 supports up to 10s + image-to-video;
 Veo 3 supports 8s with audio).
 ### Step 1 — Plan the shot
 Write the shotlist BEFORE calling the model:
 | Slot | Content |
 |---|---|
 | Subject | What's in frame? |
 | Camera | Static / pan / push-in / orbit? |
 | Lighting | Key direction + temperature |
 | Motion | What moves, at what pace? Subject motion vs camera motion. |
 | Sound | Ambient bed? (only if the model supports audio) |
 Show this to the user as a one-sentence plan before dispatching — they
 can redirect cheaply.
 ### Step 2 — Compose the prompt
 Use the format the upstream model prefers (Seedance: motion + camera +
 mood; Kling: subject + camera + style; Veo: subject + cinematography +
 sound). Bind the project's `videoAspect` and `videoLength` directly to
 the API parameters; never put them in prose.
 ### Step 3 — Dispatch via the media contract
 Use the unified dispatcher — do **not** call provider APIs by hand:
 ```bash
 node "$OD_BIN" media generate \
  --project "$OD_PROJECT_ID" \
  --surface video \
  --model "<videoModel from metadata>" \
  --aspect "<videoAspect from metadata>" \
  --length <videoLength seconds> \
  --output "<short-slug>-<seconds>s.mp4" \
  --prompt "<assembled shot prompt from Step 2>"
 ```
 The command prints one line of JSON: `{"file": {"name": "...", ...}}`.
 The bytes land in the project; the FileViewer plays it automatically.
 ### Step 4 — Hand off
 Reply with: shot summary, the filename returned by the dispatcher, and
 one sentence on what to try if the user wants a variation.
 ## Hard rules
 - One shot per turn. Multi-shot timelines belong in a hyperframes /
  interactive-video skill, not here.
 - Match `videoAspect` exactly — re-renders are slow.
 - Never ship a video without saving the file — the user expects
  something to play in the file viewer.
 - When the underlying model fails (NSFW filter, content policy,
  timeout), report the error verbatim. Don't silently retry.
@@ -0,0 +1,90 @@
 <!doctype html>
 <html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Short-form video — example</title>
    <style>
      :root {
        --bg: #0e0d0c;
        --panel: #1a1816;
        --ink: #f5efe5;
        --muted: #8b8579;
        --accent: #c96442;
      }
      * { box-sizing: border-box; }
      html, body { margin: 0; padding: 0; background: var(--bg); color: var(--ink);
        font-family: 'Iowan Old Style', 'Charter', Georgia, serif; }
      body { min-height: 100dvh; display: grid; place-items: center; padding: 32px; }
      .stage {
        width: min(720px, 92vw);
        background: var(--panel);
        border-radius: 8px;
        padding: 22px;
        box-shadow: 0 24px 60px rgba(0,0,0,0.45);
      }
      .frame {
        position: relative;
        aspect-ratio: 16 / 9;
        border-radius: 6px;
        overflow: hidden;
        background:
          radial-gradient(circle at 30% 35%, #d8b08b 0%, #6f4a35 40%, #1a120c 80%);
      }
      .frame::after {
        content: ''; position: absolute; inset: 0;
        background: repeating-linear-gradient(0deg, rgba(0,0,0,0.18) 0 1px, transparent 1px 4px);
        pointer-events: none;
        animation: scan 12s linear infinite;
      }
      @keyframes scan { from { background-position-y: 0; } to { background-position-y: 200px; } }
      .frame .mug {
        position: absolute; left: 50%; top: 56%; transform: translate(-50%, -50%);
        width: 28%; aspect-ratio: 1 / 1;
        background: radial-gradient(ellipse at 35% 35%, #f5efe5 0%, #c2b8a7 50%, #6f6757 100%);
        border-radius: 18% 18% 22% 22% / 28% 28% 18% 18%;
        box-shadow: 18px 6px 30px rgba(0,0,0,0.45);
        animation: turn 6s ease-in-out infinite alternate;
      }
      .frame .mug::after {
        content: ''; position: absolute; right: -14%; top: 28%;
        width: 18%; height: 44%;
        border: 6px solid #c2b8a7; border-left: none; border-radius: 0 100% 100% 0 / 0 50% 50% 0;
      }
      @keyframes turn { from { transform: translate(-50%, -50%) rotate(-6deg); } to { transform: translate(-50%, -50%) rotate(6deg); } }
      .frame .timecode {
        position: absolute; left: 14px; bottom: 12px;
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 11px; letter-spacing: 0.16em;
        color: var(--muted);
        background: rgba(0,0,0,0.4);
        padding: 4px 8px; border-radius: 999px;
      }
      .frame .badge {
        position: absolute; left: 14px; top: 12px;
        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
        font-size: 10.5px; letter-spacing: 0.2em; text-transform: uppercase;
        color: var(--accent);
      }
      .meta {
        display: grid; grid-template-columns: 1fr auto; gap: 10px;
        align-items: end; margin-top: 18px;
      }
      .title { font-size: 22px; line-height: 1.1; margin: 0; }
      .sub { font-family: ui-monospace, 'SF Mono', Menlo, monospace; font-size: 11px; color: var(--muted); letter-spacing: 0.14em; text-transform: uppercase; }
    </style>
  </head>
  <body>
    <div class="stage">
      <div class="frame">
        <span class="badge">● REC</span>
        <div class="mug" aria-hidden="true"></div>
        <span class="timecode">00:05 · 16:9 · seedance-2</span>
      </div>
      <div class="meta">
        <h1 class="title">A 5-second product reveal — saved as MP4.</h1>
        <span class="sub">Open Design · Video</span>
      </div>
    </div>
  </body>
 </html>
@@ -1,6 +1,8 @@
 import { useMemo, useState } from 'react';
 import { useT } from '../i18n';
-import type { DesignSystemSummary } from '../types';
+import type { Dict } from '../i18n/types';
 import type { DesignSystemSummary, Surface } from '../types';
 import { Icon } from './Icon';
 interface Props {
  systems: DesignSystemSummary[];
@@ -9,6 +11,20 @@ interface Props {
  onPreview: (id: string) => void;
 }
 // Surface filter value: the "all" catch-all or one concrete Surface.
 type SurfaceFilter = 'all' | Surface;
 // Pills for the surface filter row, in display order. `icon` is null for
 // the "all" pill, which is rendered without an icon.
 const SURFACE_PILLS: {
  value: SurfaceFilter;
  labelKey: keyof Dict;
  icon: 'grid' | 'image' | 'video' | 'music' | null;
 }[] = [
  { value: 'all', labelKey: 'common.all', icon: null },
  { value: 'web', labelKey: 'ds.surfaceWeb', icon: 'grid' },
  { value: 'image', labelKey: 'ds.surfaceImage', icon: 'image' },
  { value: 'video', labelKey: 'ds.surfaceVideo', icon: 'video' },
  { value: 'audio', labelKey: 'ds.surfaceAudio', icon: 'music' },
 ];
 // Resolve the surface a design system belongs to; a system without an
 // explicit `surface` (null / undefined) is treated as 'web'.
 function surfaceOf(system: DesignSystemSummary): Surface {
  const { surface } = system;
  return surface ?? 'web';
 }
 const CATEGORY_ORDER = [
  'Starter',
  'AI & LLM',
@@ -26,19 +42,43 @@ export function DesignSystemsTab({ systems, selectedId, onSelect, onPreview }: P
  const t = useT();
  const [filter, setFilter] = useState('');
  const [category, setCategory] = useState<string>('All');
  const [surfaceFilter, setSurfaceFilter] = useState<SurfaceFilter>('all');
  // Pre-scope by surface so the category dropdown only lists categories
  // that exist within the active surface — avoids ghost options that
  // would yield zero rows.
  const surfaceScoped = useMemo(
    () =>
      surfaceFilter === 'all'
        ? systems
        : systems.filter((s) => surfaceOf(s) === surfaceFilter),
    [systems, surfaceFilter],
  );
  const surfaceCounts = useMemo(() => {
    const counts: Record<SurfaceFilter, number> = {
      all: systems.length,
      web: 0,
      image: 0,
      video: 0,
      audio: 0,
    };
    for (const s of systems) counts[surfaceOf(s)]++;
    return counts;
  }, [systems]);
  const categories = useMemo(() => {
    const cats = new Set<string>();
-    for (const s of systems) cats.add(s.category || 'Uncategorized');
+    for (const s of surfaceScoped) cats.add(s.category || 'Uncategorized');
    const ordered: string[] = [];
    for (const c of CATEGORY_ORDER) if (cats.has(c)) ordered.push(c);
    for (const c of [...cats].sort()) if (!ordered.includes(c)) ordered.push(c);
    return ['All', ...ordered];
-  }, [systems]);
+  }, [surfaceScoped]);
  const filtered = useMemo(() => {
    const q = filter.trim().toLowerCase();
-    return systems.filter((s) => {
+    return surfaceScoped.filter((s) => {
      if (category !== 'All' && (s.category || 'Uncategorized') !== category) return false;
      if (!q) return true;
      return (
@@ -46,7 +86,7 @@ export function DesignSystemsTab({ systems, selectedId, onSelect, onPreview }: P
        s.summary.toLowerCase().includes(q)
      );
    });
-  }, [systems, filter, category]);
+  }, [surfaceScoped, filter, category]);
  // The category metadata coming from each design system is authored in
  // English. We translate the well-known buckets (All / Uncategorized) but
@@ -60,6 +100,30 @@ export function DesignSystemsTab({ systems, selectedId, onSelect, onPreview }: P
  return (
    <div className="tab-panel">
      <div
        className="examples-filter-row"
        role="tablist"
        aria-label={t('ds.surfaceLabel')}
      >
        <span className="examples-filter-label">{t('ds.surfaceLabel')}</span>
        {SURFACE_PILLS.map((p) => (
          <button
            key={p.value}
            type="button"
            role="tab"
            aria-selected={surfaceFilter === p.value}
            className={`filter-pill ${surfaceFilter === p.value ? 'active' : ''}`}
            onClick={() => {
              setSurfaceFilter(p.value);
              setCategory('All');
            }}
          >
            {p.icon ? <Icon name={p.icon} size={12} /> : null}
            {t(p.labelKey)}
            <span className="filter-pill-count">{surfaceCounts[p.value]}</span>
          </button>
        ))}
      </div>
      <div className="tab-panel-toolbar">
        <input
          placeholder={t('ds.searchPlaceholder')}
@@ -330,6 +330,30 @@ function metadataForSkill(skill: SkillSummary): ProjectMetadata {
        typeof skill.animations === 'boolean' ? skill.animations : false,
    };
  }
  // Media surfaces — defaults match the new-project form so the
  // 'Use this prompt' fast-create produces sensible metadata even
  // when the SKILL.md doesn't pin a specific model. Skills can pin
  // a model later via `od.image_model` etc.; for now we fall back to
  // the surface's first default.
  if (kind === 'image') {
    return { kind, imageModel: 'gpt-image-2', imageAspect: '1:1' };
  }
  if (kind === 'video') {
    return {
      kind,
      videoModel: 'seedance-2',
      videoLength: 5,
      videoAspect: '16:9',
    };
  }
  if (kind === 'audio') {
    return {
      kind,
      audioKind: 'music',
      audioModel: 'suno-v5',
      audioDuration: 30,
    };
  }
  return { kind: 'other' };
 }
@@ -337,5 +361,8 @@ function kindForSkill(skill: SkillSummary): ProjectKind {
  if (skill.mode === 'deck') return 'deck';
  if (skill.mode === 'prototype') return 'prototype';
  if (skill.mode === 'template') return 'template';
  if (skill.mode === 'image') return 'image';
  if (skill.mode === 'video') return 'video';
  if (skill.mode === 'audio') return 'audio';
  return 'other';
 }
@@ -4,7 +4,8 @@ import type { Dict } from '../i18n/types';
 import { fetchSkillExample } from '../providers/registry';
 import { exportAsHtml, exportAsPdf, exportAsZip } from '../runtime/exports';
 import { buildSrcdoc } from '../runtime/srcdoc';
-import type { SkillSummary } from '../types';
+import type { SkillSummary, Surface } from '../types';
 import { Icon } from './Icon';
 import { PreviewModal } from './PreviewModal';
 type TranslateFn = (key: keyof Dict, vars?: Record<string, string | number>) => string;
@@ -14,16 +15,73 @@ interface Props {
  onUsePrompt: (skill: SkillSummary) => void;
 }
-type ModeFilter = 'all' | 'prototype-desktop' | 'prototype-mobile' | 'deck' | 'document';
+type SurfaceFilter = 'all' | Surface;
 type ModeFilter =
  | 'all'
  | 'prototype-desktop'
  | 'prototype-mobile'
  | 'deck'
  | 'document'
  | 'image'
  | 'video'
  | 'audio';
 type ScenarioFilter = string;
-const MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
+// Each surface gets its own type pills. We branch on the active surface
 // filter (see `pillsForSurface`) so the mode row reflects what makes sense
 // within that surface (web has the most granularity; image / video / audio
 // collapse to a single mode pill so the pill count stays reasonable).
 // Pills for the surface filter row, in display order. `icon` is null for
 // the "all" pill, which is rendered without an icon.
 const SURFACE_PILLS: {
  value: SurfaceFilter;
  labelKey: keyof Dict;
  icon: 'grid' | 'image' | 'video' | 'music' | null;
 }[] = [
  { value: 'all', labelKey: 'examples.modeAll', icon: null },
  { value: 'web', labelKey: 'examples.surfaceWeb', icon: 'grid' },
  { value: 'image', labelKey: 'examples.surfaceImage', icon: 'image' },
  { value: 'video', labelKey: 'examples.surfaceVideo', icon: 'video' },
  { value: 'audio', labelKey: 'examples.surfaceAudio', icon: 'music' },
 ];
 // Mode pills shown when the web surface is active.
 const WEB_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
  { value: 'all', labelKey: 'examples.modeAll' },
  { value: 'prototype-desktop', labelKey: 'examples.modePrototypeDesktop' },
  { value: 'prototype-mobile', labelKey: 'examples.modePrototypeMobile' },
  { value: 'deck', labelKey: 'examples.modeDeck' },
  { value: 'document', labelKey: 'examples.modeDocument' },
 ];
 // Media surfaces each expose "all" plus a single mode pill.
 const IMAGE_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
  { value: 'all', labelKey: 'examples.modeAll' },
  { value: 'image', labelKey: 'examples.modeImage' },
 ];
 const VIDEO_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
  { value: 'all', labelKey: 'examples.modeAll' },
  { value: 'video', labelKey: 'examples.modeVideo' },
 ];
 const AUDIO_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
  { value: 'all', labelKey: 'examples.modeAll' },
  { value: 'audio', labelKey: 'examples.modeAudio' },
 ];
 // Convenience — the union pill list for the "All surfaces" view: every web
 // pill (including its "all" entry) followed by the image / video / audio
 // mode pills, with the media tables' own "all" entries dropped.
 const ALL_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
  ...WEB_MODE_PILLS,
  ...[...IMAGE_MODE_PILLS, ...VIDEO_MODE_PILLS, ...AUDIO_MODE_PILLS].filter(
    (pill) => pill.value !== 'all',
  ),
 ];
 // Surface a skill belongs to: an explicit (truthy) `surface` wins;
 // otherwise the image / video / audio modes map to the surface of the
 // same name and every other mode counts as 'web'.
 function surfaceOf(skill: SkillSummary): Surface {
  if (skill.surface) return skill.surface;
  switch (skill.mode) {
    case 'image':
      return 'image';
    case 'video':
      return 'video';
    case 'audio':
      return 'audio';
    default:
      return 'web';
  }
 }
 // Pick the mode-pill table for the active surface filter. Anything that
 // is not one of the four concrete surfaces (i.e. 'all') gets the union
 // list.
 function pillsForSurface(surface: SurfaceFilter): { value: ModeFilter; labelKey: keyof Dict }[] {
  switch (surface) {
    case 'web':
      return WEB_MODE_PILLS;
    case 'image':
      return IMAGE_MODE_PILLS;
    case 'video':
      return VIDEO_MODE_PILLS;
    case 'audio':
      return AUDIO_MODE_PILLS;
    default:
      return ALL_MODE_PILLS;
  }
 }
 const SCENARIO_LABEL_KEY: Record<string, keyof Dict> = {
  general: 'examples.scenarioGeneral',
@@ -71,13 +129,22 @@ function matchesMode(skill: SkillSummary, filter: ModeFilter): boolean {
  if (filter === 'prototype-mobile')
    return skill.mode === 'prototype' && skill.platform === 'mobile';
  if (filter === 'document') return skill.mode === 'template';
  if (filter === 'image') return surfaceOf(skill) === 'image';
  if (filter === 'video') return surfaceOf(skill) === 'video';
  if (filter === 'audio') return surfaceOf(skill) === 'audio';
  return true;
 }
 // True when the skill should be shown under the given surface filter:
 // 'all' accepts every skill, otherwise the skill's resolved surface must
 // equal the filter.
 function matchesSurface(skill: SkillSummary, filter: SurfaceFilter): boolean {
  return filter === 'all' || surfaceOf(skill) === filter;
 }
 export function ExamplesTab({ skills, onUsePrompt }: Props) {
  const t = useT();
  // Hold preview HTML per skill across re-renders so cards never re-flicker.
  const [previews, setPreviews] = useState<Record<string, string | null>>({});
  const [surfaceFilter, setSurfaceFilter] = useState<SurfaceFilter>('all');
  const [modeFilter, setModeFilter] = useState<ModeFilter>('all');
  const [scenarioFilter, setScenarioFilter] = useState<ScenarioFilter>('all');
  const [previewSkillId, setPreviewSkillId] = useState<string | null>(null);
@@ -106,32 +173,46 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
    [skills, previewSkillId],
  );
-  const modeCounts = useMemo(() => {
+  const surfaceCounts = useMemo(() => {
-    const c: Record<ModeFilter, number> = {
+    const counts: Record<SurfaceFilter, number> = {
      all: skills.length,
-      'prototype-desktop': 0,
+      web: 0,
-      'prototype-mobile': 0,
+      image: 0,
-      deck: 0,
+      video: 0,
-      document: 0,
+      audio: 0,
    };
    for (const s of skills) {
-      if (matchesMode(s, 'prototype-desktop')) c['prototype-desktop']++;
+      const sf = surfaceOf(s);
-      if (matchesMode(s, 'prototype-mobile')) c['prototype-mobile']++;
+      counts[sf] = (counts[sf] ?? 0) + 1;
-      if (matchesMode(s, 'deck')) c.deck++;
+    }
-      if (matchesMode(s, 'document')) c.document++;
+    return counts;
  }, [skills]);
  const surfaceScopedSkills = useMemo(
    () => skills.filter((s) => matchesSurface(s, surfaceFilter)),
    [skills, surfaceFilter],
  );
  const modePills = useMemo(() => pillsForSurface(surfaceFilter), [surfaceFilter]);
  const modeCounts = useMemo(() => {
    const c: Record<string, number> = { all: surfaceScopedSkills.length };
    for (const p of modePills) {
      if (p.value === 'all') continue;
      c[p.value] = surfaceScopedSkills.filter((s) => matchesMode(s, p.value)).length;
    }
    return c;
-  }, [skills]);
+  }, [surfaceScopedSkills, modePills]);
  const scenarioCounts = useMemo(() => {
    const counts = new Map<string, number>();
-    for (const s of skills) {
+    for (const s of surfaceScopedSkills) {
      if (!matchesMode(s, modeFilter)) continue;
      const tag = s.scenario || 'general';
      counts.set(tag, (counts.get(tag) ?? 0) + 1);
    }
    return counts;
-  }, [skills, modeFilter]);
+  }, [surfaceScopedSkills, modeFilter]);
  const scenarioOptions = useMemo(() => {
    const have = new Set(scenarioCounts.keys());
@@ -142,7 +223,7 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
  }, [scenarioCounts]);
  const filtered = useMemo(() => {
-    const matched = skills.filter((s) => {
+    const matched = surfaceScopedSkills.filter((s) => {
      if (!matchesMode(s, modeFilter)) return false;
      if (scenarioFilter === 'all') return true;
      return (s.scenario || 'general') === scenarioFilter;
@@ -159,7 +240,7 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
        return a.idx - b.idx;
      })
      .map(({ s }) => s);
-  }, [skills, modeFilter, scenarioFilter]);
+  }, [surfaceScopedSkills, modeFilter, scenarioFilter]);
  if (skills.length === 0) {
    return <div className="tab-empty">{t('examples.emptyNoSkills')}</div>;
@@ -168,13 +249,38 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
  return (
    <div className="tab-panel examples-panel">
      <div className="examples-toolbar">
        <div
          className="examples-filter-row"
          role="tablist"
          aria-label={t('examples.surfaceLabel')}
        >
          <span className="examples-filter-label">{t('examples.surfaceLabel')}</span>
          {SURFACE_PILLS.map((p) => (
            <button
              key={p.value}
              type="button"
              role="tab"
              aria-selected={surfaceFilter === p.value}
              className={`filter-pill ${surfaceFilter === p.value ? 'active' : ''}`}
              onClick={() => {
                setSurfaceFilter(p.value);
                setModeFilter('all');
                setScenarioFilter('all');
              }}
            >
              {p.icon ? <Icon name={p.icon} size={12} /> : null}
              {t(p.labelKey)}
              <span className="filter-pill-count">{surfaceCounts[p.value]}</span>
            </button>
          ))}
        </div>
        <div
          className="examples-filter-row"
          role="tablist"
          aria-label={t('examples.typeLabel')}
        >
          <span className="examples-filter-label">{t('examples.typeLabel')}</span>
-          {MODE_PILLS.map((p) => (
+          {modePills.map((p) => (
            <button
              key={p.value}
              type="button"
@@ -187,7 +293,9 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
              }}
            >
              {t(p.labelKey)}
-              <span className="filter-pill-count">{modeCounts[p.value]}</span>
+              <span className="filter-pill-count">
                {p.value === 'all' ? surfaceScopedSkills.length : (modeCounts[p.value] ?? 0)}
              </span>
            </button>
          ))}
        </div>
@@ -445,6 +553,9 @@ function ExampleCard({
 }
 function tagForSkill(skill: SkillSummary, t: TranslateFn): string {
  if (skill.mode === 'image') return t('examples.tagImage');
  if (skill.mode === 'video') return t('examples.tagVideo');
  if (skill.mode === 'audio') return t('examples.tagAudio');
  if (skill.mode === 'deck') return t('examples.tagSlideDeck');
  if (skill.mode === 'template') return t('examples.tagTemplate');
  if (skill.mode === 'design-system') return t('examples.tagDesignSystem');
@@ -42,6 +42,12 @@ export function FileViewer({
  if (file.kind === 'sketch') {
    return <ImageViewer projectId={projectId} file={file} />;
  }
  if (file.kind === 'video') {
    return <VideoViewer projectId={projectId} file={file} />;
  }
  if (file.kind === 'audio') {
    return <AudioViewer projectId={projectId} file={file} />;
  }
  if (file.kind === 'text' || file.kind === 'code') {
    return <TextViewer projectId={projectId} file={file} />;
  }
@@ -679,6 +685,95 @@ function ImageViewer({
  );
 }
 /**
  * Viewer for a video project file: a toolbar showing the file size plus
  * download / open links, above a native `<video>` player.
  *
  * @param projectId - id of the project that owns the file.
  * @param file - the project file to play; `name`, `size` and `mtime` are read.
  */
 function VideoViewer({
  projectId,
  file,
 }: {
  projectId: string;
  file: ProjectFile;
 }) {
  const t = useT();
  // Bust the browser cache when the agent regenerates the file in place.
  // The versioned URL is shared by the player and by the download / open
  // links so none of them can hand back bytes cached before the rewrite.
  const url = `${projectFileUrl(projectId, file.name)}?v=${Math.round(file.mtime)}`;
  return (
    <div className="viewer video-viewer">
      <div className="viewer-toolbar">
        <div className="viewer-toolbar-left">
          <span className="viewer-meta">
            {t('fileViewer.videoMeta', { size: humanSize(file.size) })}
          </span>
        </div>
        <div className="viewer-toolbar-actions">
          {/* `download` supplies the suggested filename, independent of the `?v=` suffix. */}
          <a
            className="ghost-link"
            href={url}
            download={file.name}
          >
            {t('fileViewer.download')}
          </a>
          <a
            className="ghost-link"
            href={url}
            target="_blank"
            rel="noreferrer noopener"
          >
            {t('fileViewer.open')}
          </a>
        </div>
      </div>
      <div className="viewer-body video-body">
        <video src={url} controls preload="metadata" />
      </div>
    </div>
  );
 }
 /**
  * Viewer for an audio project file: a toolbar showing the file size plus
  * download / open links, above a card with the file name and a native
  * `<audio>` player.
  *
  * @param projectId - id of the project that owns the file.
  * @param file - the project file to play; `name`, `size` and `mtime` are read.
  */
 function AudioViewer({
  projectId,
  file,
 }: {
  projectId: string;
  file: ProjectFile;
 }) {
  const t = useT();
  // Version the URL with the file's mtime so a file rewritten in place is
  // re-fetched. The player and the download / open links all use the same
  // versioned URL so none of them can serve a stale cached copy.
  const url = `${projectFileUrl(projectId, file.name)}?v=${Math.round(file.mtime)}`;
  return (
    <div className="viewer audio-viewer">
      <div className="viewer-toolbar">
        <div className="viewer-toolbar-left">
          <span className="viewer-meta">
            {t('fileViewer.audioMeta', { size: humanSize(file.size) })}
          </span>
        </div>
        <div className="viewer-toolbar-actions">
          {/* `download` supplies the suggested filename, independent of the `?v=` suffix. */}
          <a
            className="ghost-link"
            href={url}
            download={file.name}
          >
            {t('fileViewer.download')}
          </a>
          <a
            className="ghost-link"
            href={url}
            target="_blank"
            rel="noreferrer noopener"
          >
            {t('fileViewer.open')}
          </a>
        </div>
      </div>
      <div className="viewer-body audio-body">
        <div className="audio-card">
          <Icon name="music" size={28} />
          <div className="audio-card-name">{file.name}</div>
          <audio src={url} controls preload="metadata" />
        </div>
      </div>
    </div>
  );
 }
 function TextViewer({
  projectId,
  file,
@@ -397,7 +397,7 @@ function Tab({
  onActivate: () => void;
  onClose?: () => void;
  closable?: boolean;
-  kind?: 'html' | 'image' | 'sketch' | 'text' | 'code' | 'binary';
+  kind?: 'html' | 'image' | 'video' | 'audio' | 'sketch' | 'text' | 'code' | 'binary';
 }) {
  const t = useT();
  const iconName = kindIconName(kind);
@@ -439,9 +439,13 @@ function kindIconName(
  | 'image'
  | 'pencil'
  | 'file'
  | 'video'
  | 'music'
  | null {
  if (kind === 'html') return 'file-code';
  if (kind === 'image') return 'image';
  if (kind === 'video') return 'video';
  if (kind === 'audio') return 'music';
  if (kind === 'sketch') return 'pencil';
  if (kind === 'code') return 'file-code';
  if (kind === 'text') return 'file';
@@ -24,6 +24,8 @@ type IconName =
  | 'link'
  | 'mic'
  | 'minus'
  | 'music'
  | 'video'
  | 'pencil'
  | 'plus'
  | 'play'
@@ -232,6 +234,21 @@ export function Icon({ name, size = 14, strokeWidth = 1.6, ...rest }: Props) {
          <path d="M5 12h14" />
        </svg>
      );
    case 'music':
      return (
        <svg {...common}>
          <path d="M9 18V5l12-2v13" />
          <circle cx="6" cy="18" r="3" />
          <circle cx="18" cy="16" r="3" />
        </svg>
      );
    case 'video':
      return (
        <svg {...common}>
          <rect x="2" y="6" width="14" height="12" rx="2" />
          <path d="m16 10 6-3v10l-6-3z" />
        </svg>
      );
    case 'pencil':
      return (
        <svg {...common}>
@@ -1,18 +1,31 @@
 import { useEffect, useMemo, useRef, useState } from 'react';
 import { useT } from '../i18n';
 import type { Dict } from '../i18n/types';
 import {
  AUDIO_MODELS_BY_KIND,
  DEFAULT_AUDIO_MODEL,
  DEFAULT_IMAGE_MODEL,
  DEFAULT_VIDEO_MODEL,
  IMAGE_MODELS,
  VIDEO_MODELS,
 } from '../media/models';
 import type {
  AudioKind,
  DesignSystemSummary,
  MediaAspect,
  ProjectKind,
  ProjectMetadata,
  ProjectTemplate,
  SkillSummary,
  Surface,
 } from '../types';
 import { Icon } from './Icon';
 import { Skeleton } from './Loading';
 type TranslateFn = (key: keyof Dict, vars?: Record<string, string | number>) => string;
 // Tabs that live INSIDE the Web surface. Image / Video / Audio surfaces
 // don't expose a tab row — they each have a single, dedicated form.
 export type CreateTab = 'prototype' | 'deck' | 'template' | 'other';
 export interface CreateInput {
@@ -38,6 +51,33 @@ const TAB_LABEL_KEYS: Record<CreateTab, keyof Dict> = {
  other: 'newproj.tabOther',
 };
 // Per-surface model lists are maintained in src/media/models.ts (and
 // daemon/media-models.js for the dispatcher). Both the picker below and
 // the agent's `od media generate --model …` invocation read the same
 // registry so the metadata captured here is what the daemon dispatches.
 // Surface vocab shared by the surface picker and the create-flow.
 // Listed in the order the surface picker renders its buttons.
 const SURFACES: Surface[] = ['web', 'image', 'video', 'audio'];
 // i18n key for each surface's display name; read by the surface picker
 // and by the auto-generated fallback project name.
 const SURFACE_LABEL_KEY: Record<Surface, keyof Dict> = {
  web: 'newproj.surfaceWeb',
  image: 'newproj.surfaceImage',
  video: 'newproj.surfaceVideo',
  audio: 'newproj.surfaceAudio',
 };
 // i18n key for the short hint line shown under each surface's label in
 // the surface picker.
 const SURFACE_HINT_KEY: Record<Surface, keyof Dict> = {
  web: 'newproj.surfaceWebHint',
  image: 'newproj.surfaceImageHint',
  video: 'newproj.surfaceVideoHint',
  audio: 'newproj.surfaceAudioHint',
 };
 // Icon name passed to <Icon> for each surface button in the picker.
 const SURFACE_ICON: Record<Surface, 'grid' | 'image' | 'video' | 'music'> = {
  web: 'grid',
  image: 'image',
  video: 'video',
  audio: 'music',
 };
 export function NewProjectPanel({
  skills,
  designSystems,
@@ -47,6 +87,10 @@ export function NewProjectPanel({
  loading = false,
 }: Props) {
  const t = useT();
  // Top-level surface — controls which sub-form renders below. We keep
  // it separate from the Web tab state so users can flip between
  // surfaces without losing their per-surface choices.
  const [surface, setSurface] = useState<Surface>('web');
  const [tab, setTab] = useState<CreateTab>('prototype');
  const [name, setName] = useState('');
  // Design-system selection is now an *array* internally so the same
@@ -64,12 +108,32 @@ export function NewProjectPanel({
  const [animations, setAnimations] = useState(false);
  const [templateId, setTemplateId] = useState<string | null>(null);
  // Image / Video / Audio metadata. Kept independently so flipping
  // surfaces preserves each surface's last pick instead of resetting.
  const [imageModel, setImageModel] = useState<string>(DEFAULT_IMAGE_MODEL);
  const [imageAspect, setImageAspect] = useState<MediaAspect>('1:1');
  const [imageStyle, setImageStyle] = useState('');
  const [videoModel, setVideoModel] = useState<string>(DEFAULT_VIDEO_MODEL);
  const [videoLength, setVideoLength] = useState<number>(5);
  const [videoAspect, setVideoAspect] = useState<MediaAspect>('16:9');
  const [audioKind, setAudioKind] = useState<AudioKind>('music');
  const [audioModel, setAudioModel] = useState<string>(DEFAULT_AUDIO_MODEL.music);
  const [audioDuration, setAudioDuration] = useState<number>(30);
  const [voice, setVoice] = useState('');
  // When the audio kind flips, reset the model to that kind's default.
  // This keeps users from accidentally creating a "music" project that
  // has `audioModel: minimax-tts` because they last visited speech.
  useEffect(() => {
    setAudioModel(DEFAULT_AUDIO_MODEL[audioKind]);
  }, [audioKind]);
  // When entering the template tab, snap to the first user-saved template
  // if there is one (and we don't already have a valid pick). The template
  // tab no longer offers a built-in fallback — the entire point is to
  // start from a template *the user* created via Share.
  useEffect(() => {
-    if (tab !== 'template') return;
+    if (surface !== 'web' || tab !== 'template') return;
    if (templates.length === 0) {
      setTemplateId(null);
      return;
@@ -77,12 +141,24 @@ export function NewProjectPanel({
    if (templateId == null || !templates.some((t) => t.id === templateId)) {
      setTemplateId(templates[0]!.id);
    }
-  }, [tab, templates, templateId]);
+  }, [surface, tab, templates, templateId]);
  // The skill the request still routes through — kept so prototype/deck
  // pick a default-rendered skill (so the agent gets the right SKILL.md
-  // body) without requiring the user to choose one explicitly.
+  // body) without requiring the user to choose one explicitly. For
  // image / video / audio surfaces we look up a skill that targets that
  // surface; if none ships yet the request still flies (skill_id null),
  // and the agent falls back to its base behavior + project metadata.
  const skillIdForTab = useMemo(() => {
    if (surface === 'image') {
      return pickDefaultSkill(skills, 'image');
    }
    if (surface === 'video') {
      return pickDefaultSkill(skills, 'video');
    }
    if (surface === 'audio') {
      return pickDefaultSkill(skills, 'audio');
    }
    if (tab === 'other') return null;
    if (tab === 'prototype') {
      const list = skills.filter((s) => s.mode === 'prototype');
@@ -97,16 +173,18 @@ export function NewProjectPanel({
        ?? null;
    }
    return null;
-  }, [tab, skills]);
+  }, [surface, tab, skills]);
-  const canCreate =
+  const canCreate = !loading && (
-    !loading && (tab !== 'template' || templateId != null);
+    surface !== 'web' || tab !== 'template' || templateId != null
  );
  function handleCreate() {
    if (!canCreate) return;
    const primaryDs = selectedDsIds[0] ?? null;
    const inspirations = selectedDsIds.slice(1);
    const metadata = buildMetadata({
      surface,
      tab,
      fidelity,
      speakerNotes,
@@ -114,32 +192,58 @@ export function NewProjectPanel({
      templateId,
      templates,
      inspirationIds: inspirations,
      imageModel,
      imageAspect,
      imageStyle,
      videoModel,
      videoLength,
      videoAspect,
      audioKind,
      audioModel,
      audioDuration,
      voice,
    });
    const fallbackName = surface === 'web'
      ? autoName(tab, t)
      : autoNameForSurface(surface, t);
    onCreate({
-      name: name.trim() || autoName(tab, t),
+      name: name.trim() || fallbackName,
      skillId: skillIdForTab,
      designSystemId: primaryDs,
      metadata,
    });
  }
  // Web surface needs a design-system picker; the media surfaces
  // currently don't bind tokens to a system so we hide it to reduce
  // noise. (When image/video DS surfaces ship, this will swap to a
  // surface-filtered picker variant.)
  const showDesignSystemPicker = surface === 'web';
  // Web surface still uses the four sub-tabs; the media surfaces
  // skip the row entirely because each has a single dedicated form.
  const showWebTabs = surface === 'web';
  return (
    <div className="newproj">
-      <div className="newproj-tabs" role="tablist">
+      <SurfacePicker value={surface} onChange={setSurface} />
-        {(Object.keys(TAB_LABEL_KEYS) as CreateTab[]).map((entry) => (
+      {showWebTabs ? (
-          <button
+        <div className="newproj-tabs" role="tablist">
-            key={entry}
+          {(Object.keys(TAB_LABEL_KEYS) as CreateTab[]).map((entry) => (
-            role="tab"
+            <button
-            aria-selected={tab === entry}
+              key={entry}
-            className={`newproj-tab ${tab === entry ? 'active' : ''}`}
+              role="tab"
-            onClick={() => setTab(entry)}
+              aria-selected={tab === entry}
-          >
+              className={`newproj-tab ${tab === entry ? 'active' : ''}`}
-            {t(TAB_LABEL_KEYS[entry])}
+              onClick={() => setTab(entry)}
-          </button>
+            >
-        ))}
+              {t(TAB_LABEL_KEYS[entry])}
-      </div>
+            </button>
          ))}
        </div>
      ) : null}
      <div className="newproj-body">
-        <h3 className="newproj-title">{titleForTab(tab, t)}</h3>
+        <h3 className="newproj-title">{titleForView(surface, tab, t)}</h3>
        <input
          className="newproj-name"
@@ -148,21 +252,23 @@ export function NewProjectPanel({
          onChange={(e) => setName(e.target.value)}
        />
-        <DesignSystemPicker
+        {showDesignSystemPicker ? (
-          designSystems={designSystems}
+          <DesignSystemPicker
-          defaultDesignSystemId={defaultDesignSystemId}
+            designSystems={designSystems}
-          selectedIds={selectedDsIds}
+            defaultDesignSystemId={defaultDesignSystemId}
-          multi={dsMulti}
+            selectedIds={selectedDsIds}
-          onChangeMulti={setDsMulti}
+            multi={dsMulti}
-          onChange={setSelectedDsIds}
+            onChangeMulti={setDsMulti}
-          loading={loading}
+            onChange={setSelectedDsIds}
-        />
+            loading={loading}
          />
        ) : null}
-        {tab === 'prototype' ? (
+        {surface === 'web' && tab === 'prototype' ? (
          <FidelityPicker value={fidelity} onChange={setFidelity} />
        ) : null}
-        {tab === 'deck' ? (
+        {surface === 'web' && tab === 'deck' ? (
          <ToggleRow
            label={t('newproj.toggleSpeakerNotes')}
            hint={t('newproj.toggleSpeakerNotesHint')}
@@ -171,7 +277,7 @@ export function NewProjectPanel({
          />
        ) : null}
-        {tab === 'template' ? (
+        {surface === 'web' && tab === 'template' ? (
          <>
            <TemplatePicker
              templates={templates}
@@ -187,19 +293,54 @@ export function NewProjectPanel({
          </>
        ) : null}
        {surface === 'image' ? (
          <ImageForm
            model={imageModel}
            onChangeModel={setImageModel}
            aspect={imageAspect}
            onChangeAspect={setImageAspect}
            style={imageStyle}
            onChangeStyle={setImageStyle}
          />
        ) : null}
        {surface === 'video' ? (
          <VideoForm
            model={videoModel}
            onChangeModel={setVideoModel}
            length={videoLength}
            onChangeLength={setVideoLength}
            aspect={videoAspect}
            onChangeAspect={setVideoAspect}
          />
        ) : null}
        {surface === 'audio' ? (
          <AudioForm
            kind={audioKind}
            onChangeKind={setAudioKind}
            model={audioModel}
            onChangeModel={setAudioModel}
            duration={audioDuration}
            onChangeDuration={setAudioDuration}
            voice={voice}
            onChangeVoice={setVoice}
          />
        ) : null}
        <button
          className="primary newproj-create"
          onClick={handleCreate}
          disabled={!canCreate}
          title={
-            tab === 'template' && templateId == null
+            surface === 'web' && tab === 'template' && templateId == null
              ? t('newproj.createDisabledTitle')
              : undefined
          }
        >
          <Icon name="plus" size={13} />
          <span>
-            {tab === 'template'
+            {surface === 'web' && tab === 'template'
              ? t('newproj.createFromTemplate')
              : t('newproj.create')}
          </span>
@@ -210,6 +351,290 @@ export function NewProjectPanel({
  );
 }
 /**
  * Choose the default skill id for a media surface.
  *
  * The first skill whose `mode` AND `surface` both equal the requested
  * surface wins. When no skill declares the surface explicitly, the first
  * skill whose `mode` alone matches is used, so skills authored without a
  * `surface` field are still picked up.
  *
  * @param skills - candidate skills, scanned in order.
  * @param surface - the surface to find a skill for.
  * @returns the chosen skill's id, or `null` when nothing matches.
  */
 function pickDefaultSkill(
  skills: SkillSummary[],
  surface: Surface,
 ): string | null {
  // One pass: remember the earliest mode-only candidate while looking for
  // an exact surface + mode hit, which takes precedence wherever it sits.
  let modeOnlyId: string | null = null;
  for (const skill of skills) {
    if (skill.mode !== surface) continue;
    if (skill.surface === surface) return skill.id;
    if (modeOnlyId === null) modeOnlyId = skill.id;
  }
  return modeOnlyId;
 }
 /**
  * Top-level surface switcher: one tab-style button per entry in SURFACES,
  * each showing the surface's icon, label and hint.
  *
  * @param value - the currently selected surface.
  * @param onChange - called with the surface whose button was clicked.
  */
 function SurfacePicker({
  value,
  onChange,
 }: {
  value: Surface;
  onChange: (s: Surface) => void;
 }) {
  const t = useT();
  const renderSurface = (entry: Surface) => {
    const selected = value === entry;
    return (
      <button
        key={entry}
        type="button"
        role="tab"
        aria-selected={selected}
        className={selected ? 'newproj-surface active' : 'newproj-surface'}
        onClick={() => onChange(entry)}
      >
        <Icon name={SURFACE_ICON[entry]} size={15} />
        <span className="newproj-surface-label">{t(SURFACE_LABEL_KEY[entry])}</span>
        <span className="newproj-surface-hint">{t(SURFACE_HINT_KEY[entry])}</span>
      </button>
    );
  };
  return (
    <div className="newproj-surfaces" role="tablist" aria-label={t('newproj.surfaceLabel')}>
      {SURFACES.map(renderSurface)}
    </div>
  );
 }
 /**
  * Sub-form for the Image surface: model picker, aspect-ratio picker and a
  * free-text style-notes field. Fully controlled — every value comes in via
  * props and every change is reported through the matching callback.
  *
  * @param model - id of the selected image model.
  * @param onChangeModel - called with the id of the model card clicked.
  * @param aspect - the selected aspect ratio.
  * @param onChangeAspect - called with the aspect card clicked.
  * @param style - current contents of the style-notes textarea.
  * @param onChangeStyle - called with the textarea's new value on each edit.
  */
 function ImageForm({
  model,
  onChangeModel,
  aspect,
  onChangeAspect,
  style,
  onChangeStyle,
 }: {
  model: string;
  onChangeModel: (id: string) => void;
  aspect: MediaAspect;
  onChangeAspect: (a: MediaAspect) => void;
  style: string;
  onChangeStyle: (s: string) => void;
 }) {
  const t = useT();
  const styleLabel = t('newproj.imageStyleLabel');
  return (
    <>
      <ModelPicker
        value={model}
        onChange={onChangeModel}
        options={IMAGE_MODELS}
      />
      <AspectPicker
        value={aspect}
        onChange={onChangeAspect}
        options={['1:1', '16:9', '9:16', '4:3', '3:4']}
      />
      <div className="newproj-section">
        <label className="newproj-label">{styleLabel}</label>
        {/* The <label> above neither wraps the textarea nor points at it via
            htmlFor, so give the control its accessible name directly. */}
        <textarea
          className="newproj-textarea"
          rows={3}
          aria-label={styleLabel}
          placeholder={t('newproj.imageStylePlaceholder')}
          value={style}
          onChange={(e) => onChangeStyle(e.target.value)}
        />
      </div>
    </>
  );
 }
 /**
  * Sub-form for the Video surface: model picker, clip-length pills and an
  * aspect-ratio picker. All state is owned by the caller.
  *
  * @param model - id of the selected video model.
  * @param onChangeModel - called with the id of the model card clicked.
  * @param length - selected clip length in seconds.
  * @param onChangeLength - called with the length pill clicked.
  * @param aspect - the selected aspect ratio.
  * @param onChangeAspect - called with the aspect card clicked.
  */
 function VideoForm({
  model,
  onChangeModel,
  length,
  onChangeLength,
  aspect,
  onChangeAspect,
 }: {
  model: string;
  onChangeModel: (id: string) => void;
  length: number;
  onChangeLength: (n: number) => void;
  aspect: MediaAspect;
  onChangeAspect: (a: MediaAspect) => void;
 }) {
  const t = useT();
  // Clip lengths (seconds) offered as pills.
  const lengthChoices = [3, 5, 10];
  const lengthPills = lengthChoices.map((seconds) => {
    const selected = length === seconds;
    return (
      <button
        key={seconds}
        type="button"
        className={selected ? 'pill-grid-btn active' : 'pill-grid-btn'}
        onClick={() => onChangeLength(seconds)}
        aria-pressed={selected}
      >
        {t('newproj.videoLengthSeconds', { n: seconds })}
      </button>
    );
  });
  return (
    <>
      <ModelPicker value={model} onChange={onChangeModel} options={VIDEO_MODELS} />
      <div className="newproj-section">
        <label className="newproj-label">{t('newproj.videoLengthLabel')}</label>
        <div className="pill-grid">{lengthPills}</div>
      </div>
      <AspectPicker
        value={aspect}
        onChange={onChangeAspect}
        options={['16:9', '9:16', '1:1']}
      />
    </>
  );
 }
 // Duration buckets (seconds) offered for an audio kind. Music tracks are
 // usually 30s-2min; speech / sfx work in shorter chunks. Three buckets per
 // kind so users don't have to free-form-input a number.
 function audioDurationChoices(kind: AudioKind): number[] {
  return kind === 'music' ? [30, 60, 120] : [10, 30, 60];
 }
 /**
  * Sub-form for the Audio surface: kind pills (music / speech / sfx), a
  * model picker scoped to the selected kind, duration pills, and — for
  * speech only — a free-text voice description. All state is owned by the
  * caller.
  *
  * @param kind - the selected audio kind.
  * @param onChangeKind - called with the kind pill clicked.
  * @param model - id of the selected audio model.
  * @param onChangeModel - called with the id of the model card clicked.
  * @param duration - selected duration in seconds.
  * @param onChangeDuration - called with the duration pill clicked, and with
  *   the nearest offered bucket when a kind switch leaves `duration` outside
  *   the new kind's buckets.
  * @param voice - current contents of the voice textarea.
  * @param onChangeVoice - called with the textarea's new value on each edit.
  */
 function AudioForm({
  kind,
  onChangeKind,
  model,
  onChangeModel,
  duration,
  onChangeDuration,
  voice,
  onChangeVoice,
 }: {
  kind: AudioKind;
  onChangeKind: (k: AudioKind) => void;
  model: string;
  onChangeModel: (id: string) => void;
  duration: number;
  onChangeDuration: (n: number) => void;
  voice: string;
  onChangeVoice: (v: string) => void;
 }) {
  const t = useT();
  const kinds: { id: AudioKind; labelKey: keyof Dict }[] = [
    { id: 'music', labelKey: 'newproj.audioKindMusic' },
    { id: 'speech', labelKey: 'newproj.audioKindSpeech' },
    { id: 'sfx', labelKey: 'newproj.audioKindSfx' },
  ];
  const durations = audioDurationChoices(kind);
  const voiceLabel = t('newproj.voiceLabel');
  // The buckets differ per kind, so a duration picked for one kind may not
  // exist for the next (120s music -> speech). Without a correction no
  // duration pill would be highlighted and the stale value would still be
  // submitted, so snap to the closest bucket the new kind offers.
  function selectKind(next: AudioKind) {
    onChangeKind(next);
    const offered = audioDurationChoices(next);
    if (offered.includes(duration)) return;
    // `offered` is never empty, so reduce without a seed is safe here; on a
    // tie the earlier (shorter) bucket is kept.
    const nearest = offered.reduce((best, candidate) =>
      Math.abs(candidate - duration) < Math.abs(best - duration) ? candidate : best,
    );
    onChangeDuration(nearest);
  }
  return (
    <>
      <div className="newproj-section">
        <label className="newproj-label">{t('newproj.audioKindLabel')}</label>
        <div className="pill-grid">
          {kinds.map((k) => (
            <button
              key={k.id}
              type="button"
              className={`pill-grid-btn${kind === k.id ? ' active' : ''}`}
              onClick={() => selectKind(k.id)}
              aria-pressed={kind === k.id}
            >
              {t(k.labelKey)}
            </button>
          ))}
        </div>
      </div>
      <ModelPicker
        value={model}
        onChange={onChangeModel}
        options={AUDIO_MODELS_BY_KIND[kind]}
      />
      <div className="newproj-section">
        <label className="newproj-label">{t('newproj.audioDurationLabel')}</label>
        <div className="pill-grid">
          {durations.map((s) => (
            <button
              key={s}
              type="button"
              className={`pill-grid-btn${duration === s ? ' active' : ''}`}
              onClick={() => onChangeDuration(s)}
              aria-pressed={duration === s}
            >
              {t('newproj.audioDurationSeconds', { n: s })}
            </button>
          ))}
        </div>
      </div>
      {kind === 'speech' ? (
        <div className="newproj-section">
          <label className="newproj-label">{voiceLabel}</label>
          {/* The <label> above is not bound to the textarea (no htmlFor, not
              wrapping it), so name the control directly. */}
          <textarea
            className="newproj-textarea"
            rows={2}
            aria-label={voiceLabel}
            placeholder={t('newproj.voicePlaceholder')}
            value={voice}
            onChange={(e) => onChangeVoice(e.target.value)}
          />
        </div>
      ) : null}
    </>
  );
 }
 /**
  * Grid of model cards shared by the image / video / audio forms. Each
  * card shows the model's label and hint; the card whose id equals `value`
  * is marked active.
  *
  * @param value - id of the currently selected model.
  * @param onChange - called with the id of the card clicked.
  * @param options - models to render, in display order.
  */
 function ModelPicker({
  value,
  onChange,
  options,
 }: {
  value: string;
  onChange: (id: string) => void;
  options: { id: string; label: string; hint: string }[];
 }) {
  const t = useT();
  const cards = options.map(({ id, label, hint }) => {
    const selected = value === id;
    return (
      <button
        key={id}
        type="button"
        className={selected ? 'model-card active' : 'model-card'}
        onClick={() => onChange(id)}
        aria-pressed={selected}
      >
        <span className="model-card-name">{label}</span>
        <span className="model-card-hint">{hint}</span>
      </button>
    );
  });
  return (
    <div className="newproj-section">
      <label className="newproj-label">{t('newproj.modelLabel')}</label>
      <div className="model-grid">{cards}</div>
    </div>
  );
 }
 /**
  * Grid of aspect-ratio cards. Each card pairs a thumbnail (styled through
  * an `aspect-thumb-<w>x<h>` class) with a translated label.
  *
  * @param value - the currently selected aspect ratio.
  * @param onChange - called with the aspect of the card clicked.
  * @param options - aspect ratios to offer, in display order.
  */
 function AspectPicker({
  value,
  onChange,
  options,
 }: {
  value: MediaAspect;
  onChange: (a: MediaAspect) => void;
  options: MediaAspect[];
 }) {
  const t = useT();
  // Translation key for every aspect ratio the type allows.
  const aspectLabelKeys: Record<MediaAspect, keyof Dict> = {
    '1:1': 'newproj.aspectSquare',
    '16:9': 'newproj.aspectLandscape',
    '9:16': 'newproj.aspectPortrait',
    '4:3': 'newproj.aspect43',
    '3:4': 'newproj.aspect34',
  };
  const cards = options.map((ratio) => {
    const selected = value === ratio;
    // "16:9" -> "aspect-thumb-16x9": ':' is swapped for 'x' in the class name.
    const thumbClass = `aspect-thumb aspect-thumb-${ratio.replace(':', 'x')}`;
    return (
      <button
        key={ratio}
        type="button"
        className={selected ? 'aspect-card active' : 'aspect-card'}
        onClick={() => onChange(ratio)}
        aria-pressed={selected}
      >
        <span className={thumbClass} aria-hidden />
        <span className="aspect-label">{t(aspectLabelKeys[ratio])}</span>
      </button>
    );
  });
  return (
    <div className="newproj-section">
      <label className="newproj-label">{t('newproj.aspectLabel')}</label>
      <div className="aspect-grid">{cards}</div>
    </div>
  );
 }
 function FidelityPicker({
  value,
  onChange,
@@ -764,6 +1189,7 @@ function fallbackSwatches(seed: string): string[] {
 }
 function buildMetadata(input: {
  surface: Surface;
  tab: CreateTab;
  fidelity: 'wireframe' | 'high-fidelity';
  speakerNotes: boolean;
@@ -771,11 +1197,54 @@ function buildMetadata(input: {
  templateId: string | null;
  templates: ProjectTemplate[];
  inspirationIds: string[];
  imageModel: string;
  imageAspect: MediaAspect;
  imageStyle: string;
  videoModel: string;
  videoLength: number;
  videoAspect: MediaAspect;
  audioKind: AudioKind;
  audioModel: string;
  audioDuration: number;
  voice: string;
 }): ProjectMetadata {
  const kind: ProjectKind = input.tab;
  const inspirations = input.inspirationIds.length > 0
    ? { inspirationDesignSystemIds: input.inspirationIds }
    : {};
  if (input.surface === 'image') {
    return {
      kind: 'image',
      imageModel: input.imageModel,
      imageAspect: input.imageAspect,
      imageStyle: input.imageStyle.trim() || undefined,
      ...inspirations,
    };
  }
  if (input.surface === 'video') {
    return {
      kind: 'video',
      videoModel: input.videoModel,
      videoLength: input.videoLength,
      videoAspect: input.videoAspect,
      ...inspirations,
    };
  }
  if (input.surface === 'audio') {
    return {
      kind: 'audio',
      audioKind: input.audioKind,
      audioModel: input.audioModel,
      audioDuration: input.audioDuration,
      voice:
        input.audioKind === 'speech' && input.voice.trim()
          ? input.voice.trim()
          : undefined,
      ...inspirations,
    };
  }
  const kind: ProjectKind = input.tab;
  if (input.tab === 'prototype') {
    return { kind, fidelity: input.fidelity, ...inspirations };
  }
@@ -800,7 +1269,10 @@ function buildMetadata(input: {
  return { kind: 'other', ...inspirations };
 }
-function titleForTab(tab: CreateTab, t: TranslateFn): string {
+function titleForView(surface: Surface, tab: CreateTab, t: TranslateFn): string {
  if (surface === 'image') return t('newproj.titleImage');
  if (surface === 'video') return t('newproj.titleVideo');
  if (surface === 'audio') return t('newproj.titleAudio');
  switch (tab) {
    case 'prototype':
      return t('newproj.titlePrototype');
@@ -817,3 +1289,8 @@ function autoName(tab: CreateTab, t: TranslateFn): string {
  const stamp = new Date().toLocaleDateString();
  return `${t(TAB_LABEL_KEYS[tab])} · ${stamp}`;
 }
 /**
  * Fallback project name for a surface: the surface's translated label
  * followed by today's date in the runtime's locale format.
  *
  * @param surface - surface whose label is used.
  * @param t - translation function.
  * @returns a name of the form `<label> · <date>`.
  */
 function autoNameForSurface(surface: Surface, t: TranslateFn): string {
  const today = new Date().toLocaleDateString();
  const label = t(SURFACE_LABEL_KEY[surface]);
  return [label, today].join(' · ');
 }
@@ -92,6 +92,16 @@ export const en: Dict = {
  'entry.resizeAria': 'Resize sidebar',
  'entry.loadingWorkspace': 'Loading workspace…',
  'newproj.surfaceLabel': 'Surface',
  'newproj.surfaceWeb': 'Web',
  'newproj.surfaceImage': 'Image',
  'newproj.surfaceVideo': 'Video',
  'newproj.surfaceAudio': 'Audio',
  'newproj.surfaceWebHint': 'Prototypes, decks, docs',
  'newproj.surfaceImageHint': 'Posters, illustrations, art',
  'newproj.surfaceVideoHint': 'Short-form clips, motion',
  'newproj.surfaceAudioHint': 'Music, voice, sfx',
  'newproj.tabPrototype': 'Prototype',
  'newproj.tabDeck': 'Slide deck',
  'newproj.tabTemplate': 'From template',
@@ -100,6 +110,32 @@ export const en: Dict = {
  'newproj.titleDeck': 'New slide deck',
  'newproj.titleTemplate': 'Start from a template',
  'newproj.titleOther': 'New project',
  'newproj.titleImage': 'New image',
  'newproj.titleVideo': 'New video',
  'newproj.titleAudio': 'New audio',
  'newproj.modelLabel': 'Model',
  'newproj.modelHint': 'Pick the upstream provider the agent should call.',
  'newproj.aspectLabel': 'Aspect ratio',
  'newproj.aspectSquare': 'Square · 1:1',
  'newproj.aspectLandscape': 'Landscape · 16:9',
  'newproj.aspectPortrait': 'Portrait · 9:16',
  'newproj.aspect43': 'Wide · 4:3',
  'newproj.aspect34': 'Tall · 3:4',
  'newproj.imageStyleLabel': 'Style notes (optional)',
  'newproj.imageStylePlaceholder':
    'e.g. editorial photography, muted earth tones, soft daylight',
  'newproj.videoLengthLabel': 'Length',
  'newproj.videoLengthSeconds': '{n}s',
  'newproj.audioKindLabel': 'What are we making?',
  'newproj.audioKindMusic': 'Music',
  'newproj.audioKindSpeech': 'Voice / TTS',
  'newproj.audioKindSfx': 'SFX / foley',
  'newproj.audioDurationLabel': 'Duration',
  'newproj.audioDurationSeconds': '{n}s',
  'newproj.voiceLabel': 'Voice (TTS only)',
  'newproj.voicePlaceholder':
    'e.g. warm female narrator, British English, calm pacing',
  'newproj.namePlaceholder': 'Project name',
  'newproj.fidelityLabel': 'Fidelity',
  'newproj.fidelityWireframe': 'Wireframe',
@@ -156,6 +192,17 @@ export const en: Dict = {
  'examples.modePrototypeMobile': 'Prototypes · Mobile',
  'examples.modeDeck': 'Slides',
  'examples.modeDocument': 'Docs & templates',
  'examples.modeImage': 'Images',
  'examples.modeVideo': 'Videos',
  'examples.modeAudio': 'Audio',
  'examples.surfaceLabel': 'Surface',
  'examples.surfaceWeb': 'Web',
  'examples.surfaceImage': 'Image',
  'examples.surfaceVideo': 'Video',
  'examples.surfaceAudio': 'Audio',
  'examples.tagImage': 'Image',
  'examples.tagVideo': 'Video',
  'examples.tagAudio': 'Audio',
  'examples.scenarioGeneral': 'General',
  'examples.scenarioEngineering': 'Engineering',
  'examples.scenarioProduct': 'Product',
@@ -197,6 +244,11 @@ export const en: Dict = {
  'ds.categoryUncategorized': 'Uncategorized',
  'ds.showcase': 'Showcase',
  'ds.tokens': 'Tokens',
  'ds.surfaceLabel': 'Surface',
  'ds.surfaceWeb': 'Web',
  'ds.surfaceImage': 'Image',
  'ds.surfaceVideo': 'Video',
  'ds.surfaceAudio': 'Audio',
  'avatar.title': 'Account & settings',
  'avatar.localCli': 'Local CLI',
@@ -351,6 +403,8 @@ export const en: Dict = {
  'fileViewer.open': 'Open',
  'fileViewer.imageMeta': 'Image · {size}',
  'fileViewer.sketchMeta': 'Sketch · {size}',
  'fileViewer.videoMeta': 'Video · {size}',
  'fileViewer.audioMeta': 'Audio · {size}',
  'fileViewer.reload': 'Reload',
  'fileViewer.reloadDisk': 'Reload from disk',
  'fileViewer.copy': 'Copy',
@@ -91,6 +91,16 @@ export const zhCN: Dict = {
  'entry.resizeAria': '调整侧边栏宽度',
  'entry.loadingWorkspace': '正在加载工作区…',
  'newproj.surfaceLabel': '类型',
  'newproj.surfaceWeb': '网页',
  'newproj.surfaceImage': '图片',
  'newproj.surfaceVideo': '视频',
  'newproj.surfaceAudio': '音频',
  'newproj.surfaceWebHint': '原型 / 幻灯 / 文档',
  'newproj.surfaceImageHint': '海报 / 插画 / 设计稿',
  'newproj.surfaceVideoHint': '短视频 / 动效',
  'newproj.surfaceAudioHint': '音乐 / 配音 / 音效',
  'newproj.tabPrototype': '原型',
  'newproj.tabDeck': '幻灯片',
  'newproj.tabTemplate': '从模板',
@@ -99,6 +109,30 @@ export const zhCN: Dict = {
  'newproj.titleDeck': '新建幻灯片',
  'newproj.titleTemplate': '从模板开始',
  'newproj.titleOther': '新建项目',
  'newproj.titleImage': '新建图片',
  'newproj.titleVideo': '新建视频',
  'newproj.titleAudio': '新建音频',
  'newproj.modelLabel': '模型',
  'newproj.modelHint': '选择代理调用的上游模型。',
  'newproj.aspectLabel': '画幅比例',
  'newproj.aspectSquare': '方形 · 1:1',
  'newproj.aspectLandscape': '横版 · 16:9',
  'newproj.aspectPortrait': '竖版 · 9:16',
  'newproj.aspect43': '宽屏 · 4:3',
  'newproj.aspect34': '高屏 · 3:4',
  'newproj.imageStyleLabel': '风格备注（可选）',
  'newproj.imageStylePlaceholder': '例如：编辑摄影、低饱和大地色、柔光日光',
  'newproj.videoLengthLabel': '时长',
  'newproj.videoLengthSeconds': '{n}秒',
  'newproj.audioKindLabel': '生成什么？',
  'newproj.audioKindMusic': '音乐',
  'newproj.audioKindSpeech': '配音 / TTS',
  'newproj.audioKindSfx': '音效 / 拟音',
  'newproj.audioDurationLabel': '时长',
  'newproj.audioDurationSeconds': '{n}秒',
  'newproj.voiceLabel': '声线（仅 TTS）',
  'newproj.voicePlaceholder': '例如：温暖女声旁白，普通话，平稳语速',
  'newproj.namePlaceholder': '项目名称',
  'newproj.fidelityLabel': '精度',
  'newproj.fidelityWireframe': '线框图',
@@ -153,6 +187,17 @@ export const zhCN: Dict = {
  'examples.modePrototypeMobile': '原型 · 移动端',
  'examples.modeDeck': '幻灯片',
  'examples.modeDocument': '文档与模板',
  'examples.modeImage': '图片',
  'examples.modeVideo': '视频',
  'examples.modeAudio': '音频',
  'examples.surfaceLabel': '类型',
  'examples.surfaceWeb': '网页',
  'examples.surfaceImage': '图片',
  'examples.surfaceVideo': '视频',
  'examples.surfaceAudio': '音频',
  'examples.tagImage': '图片',
  'examples.tagVideo': '视频',
  'examples.tagAudio': '音频',
  'examples.scenarioGeneral': '通用',
  'examples.scenarioEngineering': '工程',
  'examples.scenarioProduct': '产品',
@@ -194,6 +239,11 @@ export const zhCN: Dict = {
  'ds.categoryUncategorized': '未分类',
  'ds.showcase': '展示',
  'ds.tokens': 'Token',
  'ds.surfaceLabel': '类型',
  'ds.surfaceWeb': '网页',
  'ds.surfaceImage': '图片',
  'ds.surfaceVideo': '视频',
  'ds.surfaceAudio': '音频',
  'avatar.title': '账户与设置',
  'avatar.localCli': '本机 CLI',
@@ -342,6 +392,8 @@ export const zhCN: Dict = {
  'fileViewer.open': '打开',
  'fileViewer.imageMeta': '图片 · {size}',
  'fileViewer.sketchMeta': '草图 · {size}',
  'fileViewer.videoMeta': '视频 · {size}',
  'fileViewer.audioMeta': '音频 · {size}',
  'fileViewer.reload': '重新加载',
  'fileViewer.reloadDisk': '从磁盘重新加载',
  'fileViewer.copy': '复制',
@@ -104,6 +104,19 @@ export interface Dict {
  'entry.loadingWorkspace': string;
  // New project panel
  // Top-level surface picker — sits above the existing tabs and switches
  // the form between Web (prototype/deck/template/other), Image, Video,
  // and Audio surfaces.
  'newproj.surfaceLabel': string;
  'newproj.surfaceWeb': string;
  'newproj.surfaceImage': string;
  'newproj.surfaceVideo': string;
  'newproj.surfaceAudio': string;
  'newproj.surfaceWebHint': string;
  'newproj.surfaceImageHint': string;
  'newproj.surfaceVideoHint': string;
  'newproj.surfaceAudioHint': string;
  'newproj.tabPrototype': string;
  'newproj.tabDeck': string;
  'newproj.tabTemplate': string;
@@ -112,6 +125,31 @@ export interface Dict {
  'newproj.titleDeck': string;
  'newproj.titleTemplate': string;
  'newproj.titleOther': string;
  'newproj.titleImage': string;
  'newproj.titleVideo': string;
  'newproj.titleAudio': string;
  // Media-specific labels for the Image / Video / Audio forms.
  'newproj.modelLabel': string;
  'newproj.modelHint': string;
  'newproj.aspectLabel': string;
  'newproj.aspectSquare': string;
  'newproj.aspectLandscape': string;
  'newproj.aspectPortrait': string;
  'newproj.aspect43': string;
  'newproj.aspect34': string;
  'newproj.imageStyleLabel': string;
  'newproj.imageStylePlaceholder': string;
  'newproj.videoLengthLabel': string;
  'newproj.videoLengthSeconds': string;
  'newproj.audioKindLabel': string;
  'newproj.audioKindMusic': string;
  'newproj.audioKindSpeech': string;
  'newproj.audioKindSfx': string;
  'newproj.audioDurationLabel': string;
  'newproj.audioDurationSeconds': string;
  'newproj.voiceLabel': string;
  'newproj.voicePlaceholder': string;
  'newproj.namePlaceholder': string;
  'newproj.fidelityLabel': string;
  'newproj.fidelityWireframe': string;
@@ -167,6 +205,17 @@ export interface Dict {
  'examples.modePrototypeMobile': string;
  'examples.modeDeck': string;
  'examples.modeDocument': string;
  'examples.modeImage': string;
  'examples.modeVideo': string;
  'examples.modeAudio': string;
  'examples.surfaceLabel': string;
  'examples.surfaceWeb': string;
  'examples.surfaceImage': string;
  'examples.surfaceVideo': string;
  'examples.surfaceAudio': string;
  'examples.tagImage': string;
  'examples.tagVideo': string;
  'examples.tagAudio': string;
  'examples.scenarioGeneral': string;
  'examples.scenarioEngineering': string;
  'examples.scenarioProduct': string;
@@ -209,6 +258,12 @@ export interface Dict {
  'ds.categoryUncategorized': string;
  'ds.showcase': string;
  'ds.tokens': string;
  // Surface filter row in the Design systems tab.
  'ds.surfaceLabel': string;
  'ds.surfaceWeb': string;
  'ds.surfaceImage': string;
  'ds.surfaceVideo': string;
  'ds.surfaceAudio': string;
  // Avatar menu (project topbar)
  'avatar.title': string;
@@ -358,6 +413,8 @@ export interface Dict {
  'fileViewer.open': string;
  'fileViewer.imageMeta': string;
  'fileViewer.sketchMeta': string;
  'fileViewer.videoMeta': string;
  'fileViewer.audioMeta': string;
  'fileViewer.reload': string;
  'fileViewer.reloadDisk': string;
  'fileViewer.copy': string;
@@ -1091,6 +1091,212 @@ code {
  text-align: center;
 }
 /* -------- Surface picker (top-level Web/Image/Video/Audio) ----------- */
 /* Container: two equal-width columns of surface cards, closed off from
    what follows by a bottom border. */
 .newproj-surfaces {
  display: grid;
  grid-template-columns: repeat(2, 1fr);
  gap: 6px;
  padding: 10px 10px 8px;
  border-bottom: 1px solid var(--border);
 }
 /* A single surface card: children stack vertically and align to the left. */
 .newproj-surface {
  display: flex;
  flex-direction: column;
  align-items: flex-start;
  gap: 4px;
  padding: 10px 10px 9px;
  background: var(--bg-panel);
  border: 1px solid var(--border);
  border-radius: var(--radius-sm);
  cursor: pointer;
  text-align: left;
  color: var(--text);
  transition: border-color 120ms ease, background 120ms ease, box-shadow 120ms ease;
  /* Overrides the automatic minimum size a grid item would otherwise get,
     so the card can shrink with its 1fr track instead of growing to fit
     non-wrapping content. */
  min-width: 0;
 }
 /* Hover feedback is suppressed while the card is :disabled. */
 .newproj-surface:hover:not(:disabled) { border-color: var(--border-strong); }
 /* Selected card: accent border plus a 1px ring of the same colour, on the
    accent-tinted background. */
 .newproj-surface.active {
  border-color: var(--accent);
  background: var(--accent-tint);
  box-shadow: 0 0 0 1px var(--accent);
 }
 /* A direct-child <svg> is coloured muted by default and accent when the
    card is active. */
 .newproj-surface > svg { color: var(--text-muted); }
 .newproj-surface.active > svg { color: var(--accent); }
 .newproj-surface-label {
  font-size: 12.5px;
  font-weight: 600;
 }
 /* Single-line hint: overflow is clipped with an ellipsis rather than
    wrapping, and the element never exceeds its container's width. */
 .newproj-surface-hint {
  font-size: 10.5px;
  color: var(--text-muted);
  line-height: 1.3;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 100%;
 }
 /* -------- Model / aspect / pill grids (image/video/audio forms) ----- */
 /* Free-text field used alongside the grids below. It can be resized
    vertically only and never drops below 60px tall; the typeface is
    inherited from the parent, then the size is pinned to 13px. */
 .newproj-textarea {
  width: 100%;
  resize: vertical;
  min-height: 60px;
  padding: 10px 12px;
  border: 1px solid var(--border);
  border-radius: var(--radius-sm);
  background: var(--bg-panel);
  font: inherit;
  font-size: 13px;
  color: var(--text);
  line-height: 1.45;
 }
 /* Replaces the browser focus outline with an accent border plus a 1px
    accent ring. */
 .newproj-textarea:focus {
  outline: none;
  border-color: var(--accent);
  box-shadow: 0 0 0 1px var(--accent);
 }
 /* Model picker: two equal-width columns of model cards. */
 .model-grid {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 6px;
 }
 /* One model card: children stacked vertically, left-aligned. */
 .model-card {
  display: flex;
  flex-direction: column;
  align-items: flex-start;
  gap: 2px;
  padding: 8px 10px;
  background: var(--bg-panel);
  border: 1px solid var(--border);
  border-radius: var(--radius-sm);
  cursor: pointer;
  text-align: left;
  transition: border-color 120ms ease, background 120ms ease;
 }
 /* Hover feedback is suppressed while the card is :disabled. */
 .model-card:hover:not(:disabled) { border-color: var(--border-strong); }
 /* Selected card: accent border on the accent-tinted background (no ring). */
 .model-card.active {
  border-color: var(--accent);
  background: var(--accent-tint);
 }
 /* Model name is set in monospace; the listed system font stack is the
    fallback used when --font-mono is not defined. */
 .model-card-name {
  font-size: 12.5px;
  font-weight: 600;
  color: var(--text);
  font-family: var(--font-mono, ui-monospace, SFMono-Regular, Menlo, monospace);
 }
 .model-card-hint {
  font-size: 10.5px;
  color: var(--text-muted);
 }
 /* Aspect-ratio picker: as many equal columns as fit, each at least 64px
    wide; auto-fit collapses empty tracks so the cards fill the row. */
 .aspect-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(64px, 1fr));
  gap: 6px;
 }
 /* One aspect card: children stacked vertically and centred. */
 .aspect-card {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 6px;
  padding: 8px 6px 9px;
  background: var(--bg-panel);
  border: 1px solid var(--border);
  border-radius: var(--radius-sm);
  cursor: pointer;
  transition: border-color 120ms ease, background 120ms ease;
 }
 /* Hover feedback is suppressed while the card is :disabled. */
 .aspect-card:hover:not(:disabled) { border-color: var(--border-strong); }
 .aspect-card.active {
  border-color: var(--accent);
  background: var(--accent-tint);
 }
 /* Shared look for the ratio thumbnail; the size comes from one of the
    modifier classes below. */
 .aspect-thumb {
  background: var(--bg-subtle);
  border: 1px solid var(--border-soft);
  border-radius: 3px;
  display: block;
 }
 /* Fixed pixel boxes whose width:height equals the ratio in the class name
    (24:24 = 1:1, 32:18 = 16:9, 18:32 = 9:16, 28:21 = 4:3, 21:28 = 3:4). */
 .aspect-thumb-1x1 { width: 24px; height: 24px; }
 .aspect-thumb-16x9 { width: 32px; height: 18px; }
 .aspect-thumb-9x16 { width: 18px; height: 32px; }
 .aspect-thumb-4x3 { width: 28px; height: 21px; }
 .aspect-thumb-3x4 { width: 21px; height: 28px; }
 /* Caption stays on one line and is muted until its card is active. */
 .aspect-label {
  font-size: 10.5px;
  color: var(--text-muted);
  text-align: center;
  white-space: nowrap;
 }
 .aspect-card.active .aspect-label { color: var(--text); }
 /* Row of pill buttons that wraps onto further lines when it runs out of
    width. */
 .pill-grid {
  display: flex;
  flex-wrap: wrap;
  gap: 6px;
 }
 /* One pill; the 999px radius rounds the ends fully at any height. */
 .pill-grid-btn {
  padding: 6px 12px;
  background: var(--bg-panel);
  border: 1px solid var(--border);
  border-radius: 999px;
  font-size: 12px;
  color: var(--text);
  cursor: pointer;
  transition: border-color 120ms ease, background 120ms ease;
 }
 /* Hover feedback is suppressed while the pill is :disabled. */
 .pill-grid-btn:hover:not(:disabled) { border-color: var(--border-strong); }
 /* Selected pill: accent border and tint, slightly heavier text. */
 .pill-grid-btn.active {
  border-color: var(--accent);
  background: var(--accent-tint);
  color: var(--text);
  font-weight: 500;
 }
 /* -------- Video / audio viewers -------------------------------------- */
 /* Viewer body: centres its content both ways on a subtle background.
    `flex: 1` with `min-height: 0` lets it fill, and shrink within, the
    available space; this presumes the parent is a flex container, which
    is not visible from this stylesheet section. */
 .video-body, .audio-body {
  display: flex;
  align-items: center;
  justify-content: center;
  padding: 24px;
  background: var(--bg-subtle);
  min-height: 0;
  flex: 1;
 }
 /* The player never exceeds the body in either dimension. The black
    background shows wherever the element is not covered by video frames,
    and the literal shadow is the fallback when --shadow-md is not defined. */
 .video-body video {
  max-width: 100%;
  max-height: 100%;
  border-radius: var(--radius-sm);
  background: #000;
  box-shadow: var(--shadow-md, 0 8px 28px rgba(0, 0, 0, 0.18));
 }
 /* Audio card: a centred vertical stack that takes the full available
    width, clamped between 280px and 480px. */
 .audio-card {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 10px;
  padding: 28px 32px;
  background: var(--bg-panel);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  min-width: 280px;
  max-width: 480px;
  width: 100%;
  box-shadow: var(--shadow-xs);
  color: var(--text-muted);
 }
 /* break-all lets a long unbroken name wrap at any character instead of
    overflowing the card. */
 .audio-card-name {
  font-size: 13px;
  font-weight: 500;
  color: var(--text);
  word-break: break-all;
  text-align: center;
 }
 /* The native player stretches to the card's width. */
 .audio-card audio {
  width: 100%;
 }
 /* -------- Fidelity cards (prototype tab) ---------------------------- */
 .fidelity-grid {
  display: grid;
@@ -0,0 +1,107 @@
 /**
 * Single source of truth for the media-generation model registry.
 *
 * Both the frontend (NewProjectPanel model pickers) and the daemon
 * (od media generate dispatcher) consume this list. When you add a new
 * model entry here, the picker shows it AND the daemon can route to it —
 * the unifying contract is "skills + metadata + prompt → code agent →
 * od media generate", and this file pins down what `--model` IDs the
 * agent is allowed to pass.
 *
 * The daemon imports the JSON view of this file via fs.readFile so we
 * don't fork the registry between frontend and Node code paths.
 */
 import type { AudioKind, MediaAspect } from '../types';
 /**
  * One entry in the media model registry. The same record shape is used
  * for image, video and audio models; which surface a model belongs to is
  * determined by the list it appears in, not by a field on the record.
  */
 export interface MediaModel {
  /** Stable ID used in metadata.imageModel / videoModel / audioModel. */
  id: string;
  /** Short label shown in pickers — usually equals id. */
  label: string;
  /** Vendor / context hint shown under the label. */
  hint: string;
  /**
   * Capabilities the agent may rely on when planning. Used downstream by
   * the dispatcher to decide which provider call to make.
   *
   * Free-form tokens. The registry in this file uses: 't2i', 'i2i',
   * 'inpaint', 't2v', 'i2v', 'music', 'tts', 'voice-clone' and 'sfx'.
   * Optional, so a record may omit it.
   */
  caps?: string[];
 }
 /**
  * Registered image models. Order matters: the first entry is the one
  * exported as DEFAULT_IMAGE_MODEL, which is why its hint says "default".
  */
 export const IMAGE_MODELS: MediaModel[] = [
  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
 ];
 /**
  * Registered video models. Order matters: the first entry is the one
  * exported as DEFAULT_VIDEO_MODEL, which is why its hint says "default".
  */
 export const VIDEO_MODELS: MediaModel[] = [
  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
 ];
 /**
  * Registered audio models, keyed by the kind of sound being generated.
  *
  * Order matters within each list: the first entry is what
  * DEFAULT_AUDIO_MODEL resolves to for that kind. Each of those first
  * entries carries the "· default" marker in its hint, matching the image
  * and video lists, so the default is identifiable from the hint text.
  */
 export const AUDIO_MODELS_BY_KIND: Record<AudioKind, MediaModel[]> = {
  music: [
    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
  ],
  speech: [
    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
  ],
  sfx: [
    // This is the sfx default (first entry), so it is labelled like every
    // other default in the registry.
    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX · default', caps: ['sfx'] },
    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
  ],
 };
 /** The aspect-ratio vocabulary as a runtime list (same five values as the MediaAspect union). */
 export const MEDIA_ASPECTS: MediaAspect[] = ['1:1', '16:9', '9:16', '4:3', '3:4'];
 /** Video clip lengths, in seconds. NOTE(review): presumably the options offered by the new-project form — confirm against the picker. */
 export const VIDEO_LENGTHS_SEC: number[] = [3, 5, 8, 10, 15, 30];
 /** Audio durations, in seconds. NOTE(review): presumably the options offered by the new-project form — confirm against the picker. */
 export const AUDIO_DURATIONS_SEC: number[] = [5, 10, 15, 30, 60, 120];
 // Defaults are simply the first entry of each list, so reordering a list
 // changes its default. The `!` assertions hold because every list above is
 // a non-empty literal.
 export const DEFAULT_IMAGE_MODEL = IMAGE_MODELS[0]!.id;
 export const DEFAULT_VIDEO_MODEL = VIDEO_MODELS[0]!.id;
 /** Default audio model ID for each audio kind. */
 export const DEFAULT_AUDIO_MODEL: Record<AudioKind, string> = {
  music: AUDIO_MODELS_BY_KIND.music[0]!.id,
  speech: AUDIO_MODELS_BY_KIND.speech[0]!.id,
  sfx: AUDIO_MODELS_BY_KIND.sfx[0]!.id,
 };
 /**
  * Resolve a model ID to its registry record, whichever surface it
  * belongs to.
  *
  * Groups are scanned in a fixed order — image, video, then audio music,
  * speech and sfx — and the first record whose `id` matches is returned.
  * An unrecognised ID yields `null` rather than a default, so the caller
  * can report the bad ID instead of silently substituting another model.
  */
 export function findMediaModel(id: string): MediaModel | null {
  const groups: MediaModel[][] = [
    IMAGE_MODELS,
    VIDEO_MODELS,
    AUDIO_MODELS_BY_KIND.music,
    AUDIO_MODELS_BY_KIND.speech,
    AUDIO_MODELS_BY_KIND.sfx,
  ];
  for (const group of groups) {
    for (const model of group) {
      if (model.id === id) {
        return model;
      }
    }
  }
  return null;
 }
 /**
  * Registered model IDs grouped by surface, with audio further split by
  * kind. Used for prompt-side disclosure. Every call builds fresh arrays,
  * in registry order.
  */
 export function modelIdsBySurface(): {
  image: string[];
  video: string[];
  audio: { music: string[]; speech: string[]; sfx: string[] };
 } {
  // Project a list of registry records down to their IDs.
  const idsOf = (models: MediaModel[]): string[] => models.map((model) => model.id);
  const { music, speech, sfx } = AUDIO_MODELS_BY_KIND;
  return {
    image: idsOf(IMAGE_MODELS),
    video: idsOf(VIDEO_MODELS),
    audio: {
      music: idsOf(music),
      speech: idsOf(speech),
      sfx: idsOf(sfx),
    },
  };
 }
@@ -0,0 +1,135 @@
 /**
 * Media generation contract. Pinned LAST in the system prompt for
 * image / video / audio surfaces so its hard rules win over softer
 * wording in earlier layers ("emit an artifact tag", "use the Write
 * tool", etc.).
 *
 * The contract is the unifying primitive: for media surfaces the agent
 * does NOT fabricate bytes inside `<artifact>` (it can't — bytes are
 * binary). Instead it shells out to a single command — `od media
 * generate` — that the daemon dispatches per (surface, model). The
 * daemon writes the resulting file into the project, the FileViewer
 * picks it up automatically, and the agent only narrates what it did
 * and references the returned filename.
 *
 * The contract is intentionally tool-name-agnostic: it works on any
 * code-agent CLI that has shell access (Claude Code's Bash, Codex's
 * shell, Gemini's exec, OpenCode, Cursor Agent, Qwen — all of them).
 * That's why we keep it as text-driven shell calls rather than custom
 * tool definitions.
 */
 import {
  AUDIO_MODELS_BY_KIND,
  IMAGE_MODELS,
  VIDEO_MODELS,
 } from '../media/models';
 /**
  * Render IDs as a comma-separated run of Markdown inline-code spans,
  * e.g. ['a', 'b'] becomes "`a`, `b`". An empty list yields ''.
  */
 function fmtList(ids: string[]): string {
  const quoted: string[] = [];
  for (const id of ids) {
    quoted.push('`' + id + '`');
  }
  return quoted.join(', ');
 }
 // Pre-formatted ID lists, computed once at module load from the registry
 // and interpolated into the "Allowed model IDs" section of the contract
 // below. Audio gets one list per audio kind.
 const IMAGE_IDS = fmtList(IMAGE_MODELS.map((m) => m.id));
 const VIDEO_IDS = fmtList(VIDEO_MODELS.map((m) => m.id));
 const AUDIO_MUSIC_IDS = fmtList(AUDIO_MODELS_BY_KIND.music.map((m) => m.id));
 const AUDIO_SPEECH_IDS = fmtList(AUDIO_MODELS_BY_KIND.speech.map((m) => m.id));
 const AUDIO_SFX_IDS = fmtList(AUDIO_MODELS_BY_KIND.sfx.map((m) => m.id));
 /**
  * System-prompt fragment for image / video / audio projects.
  *
  * Built once at module load. Only the "Allowed model IDs" section is
  * dynamic: it interpolates IMAGE_IDS, VIDEO_IDS and the three AUDIO_*_IDS
  * lists, so it follows whatever the model registry exports. Everything
  * else is static Markdown.
  *
  * Escapes inside the template literal: a doubled backslash emits one
  * literal backslash (the shell line continuations in the bash example),
  * and a backslash-escaped backtick emits a literal backtick (Markdown
  * code spans and fences).
  */
 export const MEDIA_GENERATION_CONTRACT = `
 ---
 ## Media generation contract (load-bearing — overrides softer wording above)
 This project is a **non-web** surface (image / video / audio). The unifying
 contract is: skill workflow + project metadata tell you WHAT to make; one
 shell command — \`od media generate\` — is HOW you actually produce bytes.
 Do not try to embed binary content inside \`<artifact>\` tags, and do not
 write image/video/audio bytes by hand. Always call out to the dispatcher.
 ### Environment the daemon injected for you
 The daemon spawns you with these env vars set (verify with \`echo\`):
 - \`OD_BIN\`         — absolute path to the \`od\` CLI script. Run with \`node "$OD_BIN" …\`.
 - \`OD_PROJECT_ID\`  — the active project's id. Pass it as \`--project "$OD_PROJECT_ID"\`.
 - \`OD_PROJECT_DIR\` — the project's files folder (your cwd). Generated files land here.
 - \`OD_DAEMON_URL\`  — base URL of the local daemon, e.g. \`http://127.0.0.1:7456\`.
 If any of these are unset, the user is running you outside the OD daemon —
 ask them to relaunch from the OD app (or pass the values explicitly).
 ### Invocation
 Run via your shell tool (Bash on Claude Code, exec on Codex/Gemini, etc.):
 \`\`\`bash
 node "$OD_BIN" media generate \\
  --project "$OD_PROJECT_ID" \\
  --surface <image|video|audio> \\
  --model <model-id> \\
  --output <filename> \\
  --prompt "<full prompt>" \\
  [--aspect 1:1|16:9|9:16|4:3|3:4] \\
  [--length <seconds>]              # video only
  [--duration <seconds>]            # audio only
  [--audio-kind music|speech|sfx]   # audio only
  [--voice <voice-id>]              # audio:speech only
 \`\`\`
 The command prints a single line of JSON describing the written file:
 \`\`\`json
 { "file": { "name": "poster.png", "size": 12345, "kind": "image", "mime": "image/png", ... } }
 \`\`\`
 Save the \`file.name\` and reference it in your reply ("I generated
 \`poster.png\`."). The user's FileViewer renders it automatically.
 ### Allowed model IDs (per surface)
 - **image**:   ${IMAGE_IDS}
 - **video**:   ${VIDEO_IDS}
 - **audio · music**:  ${AUDIO_MUSIC_IDS}
 - **audio · speech**: ${AUDIO_SPEECH_IDS}
 - **audio · sfx**:    ${AUDIO_SFX_IDS}
 If the user requests a model that is not in this list, surface a warning
 in your reply and either (a) ask them to pick a registered ID or (b)
 proceed with the project metadata's default model and explain the
 substitution. Do not silently fall back.
 ### Workflow rules
 1. **Read project metadata first.** The "Project metadata" block above
   tells you the user's pre-selected model, aspect, length, voice, audio
   kind, etc. Treat those as authoritative defaults — only override if
   the user's chat message explicitly contradicts them.
 2. **One discovery turn before generating.** Even with metadata defaults
   present, restate what you're about to make and ask one targeted
   question if anything is ambiguous (subject, mood, brand, voice). The
   discovery rules from the philosophy layer still apply — emit a
   question form on turn 1 unless the user's prompt already pins every
   variable.
 3. **Generate by shell, narrate in chat.** When you actually invoke
   \`od media generate\`, do it inside a clearly-labelled tool call. After
   it returns, write a short reply: what was produced, the filename,
   and any notes (model substitutions, retries, follow-up suggestions).
 4. **Iterate by re-running.** To revise, call \`od media generate\` again
   with a new \`--output\` filename (or omit \`--output\` to auto-name).
   Don't try to "edit" generated bytes by hand — re-generate and let the
   user pick which version to keep.
 5. **Don't emit \`<artifact>\` blocks for media.** They're for HTML/text
   artifacts. For media surfaces your "artifact" is the file written by
   the dispatcher. The artifact lint and PDF-stitching layers don't
   apply.
 6. **Filenames are slugged.** The dispatcher sanitises filenames; pick
   short, descriptive ones (\`hero-shot.png\`, \`intro-jingle.mp3\`,
   \`teaser-15s.mp4\`) so the user's file list stays readable.
 ### Stub-provider note
 The provider integrations behind specific models (gpt-image-2,
 seedance-2, suno-v5, …) may still be stubs in this build — the
 dispatcher will return success and a placeholder file. That's fine: the
 contract you follow is the same; the bytes get sharper as real
 provider integrations land. The user has been told to expect this.
 `;
@@ -33,13 +33,22 @@ import type { ProjectMetadata, ProjectTemplate } from '../types';
 import { OFFICIAL_DESIGNER_PROMPT } from './official-system';
 import { DISCOVERY_AND_PHILOSOPHY } from './discovery';
 import { DECK_FRAMEWORK_DIRECTIVE } from './deck-framework';
 import { MEDIA_GENERATION_CONTRACT } from './media-contract';
 export const BASE_SYSTEM_PROMPT = OFFICIAL_DESIGNER_PROMPT;
 export interface ComposeInput {
  skillBody?: string | undefined;
  skillName?: string | undefined;
-  skillMode?: 'prototype' | 'deck' | 'template' | 'design-system' | undefined;
+  skillMode?:
    | 'prototype'
    | 'deck'
    | 'template'
    | 'design-system'
    | 'image'
    | 'video'
    | 'audio'
    | undefined;
  designSystemBody?: string | undefined;
  designSystemTitle?: string | undefined;
  // Project-level metadata captured by the new-project panel. Drives the
@@ -111,6 +120,24 @@ export function composeSystemPrompt({
    parts.push(`\n\n---\n\n${DECK_FRAMEWORK_DIRECTIVE}`);
  }
  // Image / video / audio surfaces share one invocation contract:
  // `od media generate`. We pin it LAST (and only when the project is
  // actually a media surface) so its rules ("don't fabricate bytes",
  // "shell out to OD_BIN", "reference the returned filename") override
  // any softer wording earlier in the stack about emitting <artifact>
  // tags. We fire on either skillMode OR metadata.kind so a media
  // project without a bound skill still gets the contract.
  const isMediaSurface =
    skillMode === 'image' ||
    skillMode === 'video' ||
    skillMode === 'audio' ||
    metadata?.kind === 'image' ||
    metadata?.kind === 'video' ||
    metadata?.kind === 'audio';
  if (isMediaSurface) {
    parts.push(MEDIA_GENERATION_CONTRACT);
  }
  return parts.join('');
 }
@@ -145,6 +172,56 @@ function renderMetadataBlock(
      lines.push(`- **template**: ${metadata.templateLabel}`);
    }
  }
  if (metadata.kind === 'image') {
    lines.push(
      `- **imageModel**: ${metadata.imageModel ?? '(unknown — ask: which image model to use)'}`,
    );
    lines.push(
      `- **aspectRatio**: ${metadata.imageAspect ?? '(unknown — ask: 1:1, 16:9, 9:16, 4:3, 3:4)'}`,
    );
    if (metadata.imageStyle) {
      lines.push(`- **styleNotes**: ${metadata.imageStyle}`);
    }
    lines.push('');
    lines.push(
      'This is an **image** project. Plan the prompt carefully — describe subject, composition, lighting, palette, and references — then dispatch via the **media generation contract** (see the contract block at the end of this prompt) using `od media generate --surface image --model <imageModel>`. Reference the returned filename in your reply. Do NOT emit `<artifact>` HTML for media surfaces.',
    );
  }
  if (metadata.kind === 'video') {
    lines.push(
      `- **videoModel**: ${metadata.videoModel ?? '(unknown — ask: which video model to use)'}`,
    );
    lines.push(
      `- **lengthSeconds**: ${typeof metadata.videoLength === 'number' ? metadata.videoLength : '(unknown — ask: 3s / 5s / 10s)'}`,
    );
    lines.push(
      `- **aspectRatio**: ${metadata.videoAspect ?? '(unknown — ask: 16:9, 9:16, 1:1)'}`,
    );
    lines.push('');
    lines.push(
      'This is a **video** project. Plan the shotlist (1-3 shots for short clips), describe motion + camera, then dispatch via the **media generation contract** using `od media generate --surface video --model <videoModel> --length <seconds> --aspect <ratio>`. If the active workspace also ships a hyperframes-style interactive-video skill, prefer composing several shorter clips into a timeline rather than one monolithic generation. Do NOT emit `<artifact>` HTML.',
    );
  }
  if (metadata.kind === 'audio') {
    lines.push(
      `- **audioKind**: ${metadata.audioKind ?? '(unknown — ask: music / speech / sfx)'}`,
    );
    lines.push(
      `- **audioModel**: ${metadata.audioModel ?? '(unknown — ask: which audio model to use)'}`,
    );
    lines.push(
      `- **durationSeconds**: ${typeof metadata.audioDuration === 'number' ? metadata.audioDuration : '(unknown — ask: target duration)'}`,
    );
    if (metadata.voice) {
      lines.push(`- **voice**: ${metadata.voice}`);
    } else if (metadata.audioKind === 'speech') {
      lines.push('- **voice**: (unknown — ask: voice / accent / pacing)');
    }
    lines.push('');
    lines.push(
      'This is an **audio** project. Music: lock genre + tempo + instrumentation. Speech: confirm script + voice + pacing. SFX: be precise about texture (impact, ambience, foley layer). Then dispatch via the **media generation contract** using `od media generate --surface audio --audio-kind <kind> --model <audioModel> --duration <seconds>` (add `--voice <voice-id>` for speech). Do NOT emit `<artifact>` HTML.',
    );
  }
  if (metadata.inspirationDesignSystemIds && metadata.inspirationDesignSystemIds.length > 0) {
    lines.push(
@@ -72,12 +72,34 @@ export interface AgentInfo {
  version?: string | null;
 }
 // The four "surfaces" Open Design now produces. Web covers HTML
 // prototypes, decks, docs and templates; Image / Video / Audio cover
 // generated visual / motion / sound artifacts respectively. Skills and
 // design systems carry it in an optional `surface` field, and one that
 // does not declare a surface is treated as 'web'. The Examples and
 // Design-systems tabs filter by it so users can navigate the multi-modal
 // catalog without scrolling past surfaces they're not interested in.
 export type Surface = 'web' | 'image' | 'video' | 'audio';
 export interface SkillSummary {
  id: string;
  name: string;
  description: string;
  triggers: string[];
-  mode: 'prototype' | 'deck' | 'template' | 'design-system';
+  // 'design-system' is a meta-mode used by the design-systems registry,
  // not by user-facing skills; the rest map 1:1 onto ProjectKind. Image
  // / video / audio modes drive the matching project kind so the
  // 'Use this prompt' fast-create produces a coherent media project.
  mode:
    | 'prototype'
    | 'deck'
    | 'template'
    | 'design-system'
    | 'image'
    | 'video'
    | 'audio';
  /** Which output surface the skill targets — defaults to 'web' for
   *  backward compatibility when SKILL.md doesn't declare `od.surface`. */
  surface?: Surface;
  platform?: 'desktop' | 'mobile' | null;
  scenario?: string | null;
  previewType: string;
@@ -113,6 +135,10 @@ export interface DesignSystemSummary {
  /** 4 representative hex strings extracted from DESIGN.md: [bg, support, fg, accent].
   *  Empty when DESIGN.md doesn't expose its tokens in the bold-and-hex format. */
  swatches?: string[];
  /** Which surface the system targets. Web is the default — most ship
   *  HTML/CSS tokens. Image / video / audio systems carry palettes,
   *  shotlists, voice presets etc. that drive non-web generations. */
  surface?: Surface;
 }
 export interface DesignSystemDetail extends DesignSystemSummary {
@@ -122,6 +148,8 @@ export interface DesignSystemDetail extends DesignSystemSummary {
 export type ProjectFileKind =
  | 'html'
  | 'image'
  | 'video'
  | 'audio'
  | 'sketch'
  | 'text'
  | 'code'
@@ -147,7 +175,28 @@ export interface ProjectFile {
 // Per-project metadata captured at creation time. The agent reads this
 // during chat (via the system prompt) and the question-form re-asks for
 // any field that's missing. Each `kind` carries a different shape.
-export type ProjectKind = 'prototype' | 'deck' | 'template' | 'other';
+//
 // 'prototype' / 'deck' / 'template' / 'other' all live on the Web
 // surface; 'image' / 'video' / 'audio' are the new media surfaces.
 export type ProjectKind =
  | 'prototype'
  | 'deck'
  | 'template'
  | 'other'
  | 'image'
  | 'video'
  | 'audio';
 // Aspect ratios offered to image / video projects, written as
 // "width:height" strings. Kept as a small fixed vocabulary (vs free-form
 // WxH) so the system prompt can describe them to the agent in concrete
 // terms, and so we can render fixed thumbnails in the picker without a
 // custom-input branch. Used by the imageAspect and videoAspect fields of
 // ProjectMetadata.
 export type MediaAspect = '1:1' | '16:9' | '9:16' | '4:3' | '3:4';
 // Audio kind — what *kind* of sound the user wants. The model + prompt
 // pattern differ noticeably between music (Suno-style), TTS (MiniMax,
 // Fish), and SFX/foley, so we capture the intent at create time.
 // 'speech' is the TTS case. Stored on ProjectMetadata.audioKind.
 export type AudioKind = 'music' | 'speech' | 'sfx';
 export interface ProjectMetadata {
  kind: ProjectKind;
@@ -172,6 +221,35 @@ export interface ProjectMetadata {
  // generated artifact should *also* draw from. Empty / undefined when the
  // user stayed in single-select mode.
  inspirationDesignSystemIds?: string[];
  // -- Image projects ------------------------------------------------
  // The model the user wants generations to flow through. We keep this
  // as a free-form string (rather than a strict enum) so new providers
  // can be wired up by editing skills alone, without a frontend change.
  imageModel?: string;
  // Aspect ratio. Defaults to 1:1 if unset. Drives the canvas the agent
  // requests from the underlying image API.
  imageAspect?: MediaAspect;
  // Free-form palette / mood hint. Carried into the system prompt so the
  // agent can echo the user's style intent into the upstream prompt.
  imageStyle?: string;
  // -- Video projects ------------------------------------------------
  videoModel?: string;
  // Length in seconds. Most providers cap at 10s today; we don't enforce
  // that cap here — the skill body is the right place to clamp by model.
  videoLength?: number;
  videoAspect?: MediaAspect;
  // -- Audio projects ------------------------------------------------
  audioKind?: AudioKind;
  audioModel?: string;
  // Duration in seconds. Music generators interpret this as song length;
  // TTS uses it as an upper bound on the spoken passage.
  audioDuration?: number;
  // Free-form voice description for TTS (e.g. "warm female narrator,
  // British English"). Ignored for music / SFX.
  voice?: string;
 }
 export interface Project {
Author	SHA1	Message	Date
pftom	8719c082ea	Merge PR #12 (cursor/47ca13ab) into cursor/289994c1 Bring in the parallel media-surfaces branch from PR #12. Tree is already identical to HEAD (same od media generate work landed independently), so this is a history-only merge to consolidate the two branches.	2026-04-28 22:46:20 +08:00
pftom	976a6eadf2	feat(media): add image/video/audio project kinds via od media generate Introduce non-web media surfaces (image, video, audio) as first-class project kinds. The unifying contract is "skill workflow + project metadata tell the agent WHAT to make; one shell command — od media generate — is HOW bytes are produced", so any code-agent CLI with shell access can drive it without bespoke tools. - Frontend: New Project panel gains Image/Video/Audio tabs with model picker, aspect/length/duration controls, and audio kind/voice selection. Examples and Design Systems tabs gain layered sections. FileViewer renders the generated image/video/audio files. - Shared registry: src/media/models.ts is the single source of truth for image/video/audio model IDs, aspects, and defaults — consumed by the picker AND the daemon dispatcher. - Prompts: media-contract.ts is pinned LAST in the system prompt for media surfaces so its hard rules (call od media generate, don't emit binary in <artifact>, allowed model IDs) win over softer earlier wording. - Daemon: new media.js dispatcher + media-models.js JSON view of the registry; cli.js gets the `od media generate` subcommand wired up via server.js / projects.js so the daemon writes files back into the project dir. - Skills: audio-jingle, image-poster, video-shortform seed examples for the three surfaces. Made-with: Cursor	2026-04-28 22:41:14 +08:00
pftom	ac70719d4d	feat(media): add image / video / audio surfaces with unified od media generate dispatcher Extends Open Design from web-only to a multi-modal creation tool. The unifying contract is one code-agent loop driven by skills + project metadata + prompt constraints; for non-web surfaces the agent shells out to a single dispatcher (`od media generate`) that the daemon routes per (surface, model). - Types: new Surface union, MediaAspect / AudioKind, image/video/audio ProjectKind + ProjectMetadata fields, video/audio ProjectFileKind. - NewProjectPanel: top-level surface picker + Image / Video / Audio forms with model, aspect, length, duration, voice, audio-kind pickers. - ExamplesTab + DesignSystemsTab: surface filter row that scopes before mode / scenario / category filters. - FileViewer / FileWorkspace: native <video> and <audio> previews and matching tab icons. - Daemon: parses `od.surface` and `> Surface:` blockquotes; recognises mp4 / webm / mov / mp3 / wav / ogg / m4a / flac extensions; spawns agents with OD_BIN / OD_DAEMON_URL / OD_PROJECT_ID / OD_PROJECT_DIR env so any code-agent CLI with shell access can call the dispatcher. - daemon/media.js + daemon/media-models.js: surface-agnostic dispatcher with stub providers that emit deterministic placeholder bytes (1x1 PNG, valid mp4 ftyp, mp3 frame / silent WAV) so the framework works without API keys; real provider integrations slot in later. - daemon/cli.js: `od media generate --surface ... --model ...` subcommand routes to POST /api/projects/:id/media/generate and prints one JSON line for the agent to parse. - prompts/media-contract.ts: hard contract pinned LAST in the system prompt for image/video/audio surfaces — env vars, exact invocation, registered model IDs per surface, six workflow rules. system.ts metadata block updated to point at the contract. 
- Seed skills: image-poster, video-shortform, audio-jingle each ship a SKILL.md with `mode/surface: image\|video\|audio` and a stylized example.html preview, and instruct the agent to dispatch via the contract. Made-with: Cursor	2026-04-28 22:40:58 +08:00
pftom	0b61be5d96	Merge remote-tracking branch 'origin/main' into cursor/289994c1	2026-04-28 22:31:29 +08:00
pftom	bc7c057216	Merge remote-tracking branch 'origin/main' into cursor/47ca13ab	2026-04-28 22:31:20 +08:00
pftom	5a63d09f2f	Enhance README and add star promotion assets - Added a "Star us" section in both English and Chinese README files to encourage users to star the project on GitHub. - Included a new image asset for the star promotion. - Introduced a new HTML file for a dedicated star promotion page. - Updated .gitignore to exclude new cursor-related files.	2026-04-28 20:32:39 +08:00
pftom	19b5272f38	Merge branch 'main' into feat/optimize-naming	2026-04-28 16:23:44 +08:00
pftom	1337907df3	Merge branch 'main' of github.com:nexu-io/open-design	2026-04-28 16:20:14 +08:00
pftom	490bbe29c9	Merge branch 'feat/optimize-naming' of github.com:nexu-io/open-design into feat/optimize-naming	2026-04-28 16:16:51 +08:00
pftom	0eef347336	Update README and documentation for deck framework directives - Clarified DECK_FRAMEWORK_DIRECTIVE description in both English and Chinese README files to specify conditions for deck kind without a skill seed. - Added detailed workflow instructions in deck-framework.ts to emphasize the importance of copying the framework before adding content. - Enhanced discovery.ts to reinforce the framework-first approach for deck projects. - Updated system.ts to ensure proper handling of deck projects with and without bound skills, preventing re-authorship of scaling and navigation logic.	2026-04-28 16:11:46 +08:00
pftom	243e611eeb	Update README and documentation for deck framework directives - Clarified DECK_FRAMEWORK_DIRECTIVE description in both English and Chinese README files to specify conditions for deck kind without a skill seed. - Added detailed workflow instructions in deck-framework.ts to emphasize the importance of copying the framework before adding content. - Enhanced discovery.ts to reinforce the framework-first approach for deck projects. - Updated system.ts to ensure proper handling of deck projects with and without bound skills, preventing re-authorship of scaling and navigation logic.	2026-04-28 16:07:52 +08:00
pftom	985238403f	Add contributing guidelines in English and Chinese - Introduced CONTRIBUTING.md and CONTRIBUTING.zh-CN.md to provide clear instructions for contributors. - Outlined contribution types, local setup instructions, and merging criteria for skills and design systems. - Enhanced README files to reference the new contributing guidelines.	2026-04-28 16:02:17 +08:00
pftom	af3f96379a	Refactor project name from "Open Claude Design" to "Open Design" - Updated project name in package.json, package-lock.json, and README files. - Changed CLI commands and references from "ocd" to "od". - Adjusted file structure references in documentation and code to reflect new naming conventions. - Enhanced .gitignore to include new runtime data files. - Updated metadata in LICENSE file to match new project name.	2026-04-28 14:48:45 +08:00