diff --git a/daemon/cli.js b/daemon/cli.js
index 2cb1deb..29fcef1 100644
--- a/daemon/cli.js
+++ b/daemon/cli.js
@@ -1,24 +1,44 @@
 #!/usr/bin/env node
 import { startServer } from './server.js';
 
-const args = process.argv.slice(2);
+const argv = process.argv.slice(2);
+
+// ---- Subcommand router ----------------------------------------------------
+//
+// `od` is two CLIs glued together:
+//   - default mode: starts the daemon + opens the web UI.
+//   - `od media …`: a thin client that POSTs to the running daemon. This
+//     is what the code agent invokes from inside a chat to actually
+//     produce image / video / audio bytes (the unifying contract).
+//
+// We dispatch on the first positional argument so flags like --port keep
+// working unchanged. Subcommand routing is keyword-based; flags are
+// parsed inside each handler.
+
+const SUBCOMMAND_MAP = {
+  media: runMedia,
+};
+
+const first = argv.find((a) => !a.startsWith('-'));
+if (first && SUBCOMMAND_MAP[first]) {
+  const idx = argv.indexOf(first);
+  const rest = [...argv.slice(0, idx), ...argv.slice(idx + 1)];
+  await SUBCOMMAND_MAP[first](rest);
+  process.exit(0);
+}
+
+// Default: daemon mode.
 let port = Number(process.env.OD_PORT) || 7456;
 let open = true;
 
-for (let i = 0; i < args.length; i++) {
-  const a = args[i];
+for (let i = 0; i < argv.length; i++) {
+  const a = argv[i];
   if (a === '-p' || a === '--port') {
-    port = Number(args[++i]);
+    port = Number(argv[++i]);
   } else if (a === '--no-open') {
     open = false;
   } else if (a === '-h' || a === '--help') {
-    console.log(`Usage: od [--port <n>] [--no-open]
-
-Starts a local daemon that:
-  * scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
-  * serves a tiny web chat UI at http://localhost:<port>
-  * proxies messages (text + images) to the selected agent via child-process spawn
-`);
+    printRootHelp();
     process.exit(0);
   }
 }
@@ -34,3 +54,134 @@ startServer({ port }).then(url => {
     });
   }
 });
+
+function printRootHelp() { // Print top-level usage (daemon mode and the media subcommand) to stdout.
+  console.log(`Usage:
+  od [--port <n>] [--no-open]
+      Start the local daemon and open the web UI.
+
+  od media generate --surface <image|video|audio> --model <id> [opts]
+      Generate a media artifact and write it into the active project.
+      Designed to be invoked by a code agent — picks up OD_DAEMON_URL
+      and OD_PROJECT_ID from the env that the daemon injected on spawn.
+
+What the daemon does:
+  * scans PATH for installed code-agent CLIs (claude, codex, gemini, opencode, cursor-agent, ...)
+  * serves the chat UI at http://localhost:<port>
+  * proxies messages (text + images) to the selected agent via child-process spawn
+  * exposes /api/projects/:id/media/generate — the unified image/video/audio
+    dispatcher that the agent calls via \`od media generate\`.`);
+}
+
+// ---------------------------------------------------------------------------
+// Subcommand: od media …
+// ---------------------------------------------------------------------------
+
+async function runMedia(args) {
+  const sub = args.find((a) => !a.startsWith('-')) || '';
+  if (sub === 'help' || sub === '-h' || sub === '--help' || sub === '') {
+    printMediaHelp();
+    return;
+  }
+  if (sub !== 'generate') {
+    console.error(`unknown subcommand: od media ${sub}`);
+    printMediaHelp();
+    process.exit(1);
+  }
+
+  const idx = args.indexOf(sub);
+  const flags = parseFlags([...args.slice(0, idx), ...args.slice(idx + 1)]);
+
+  const daemonUrl = flags['daemon-url'] || process.env.OD_DAEMON_URL || 'http://127.0.0.1:7456';
+  const projectId = flags.project || process.env.OD_PROJECT_ID;
+  if (!projectId) {
+    console.error(
+      'project id required. Pass --project <id> or set OD_PROJECT_ID. The daemon injects this when it spawns the code agent.',
+    );
+    process.exit(2);
+  }
+
+  const surface = flags.surface;
+  if (!surface || !['image', 'video', 'audio'].includes(surface)) {
+    console.error('--surface must be one of: image | video | audio');
+    process.exit(2);
+  }
+  if (!flags.model) {
+    console.error('--model required (see http://<daemon>/api/media/models)');
+    process.exit(2);
+  }
+
+  const body = {
+    surface,
+    model: flags.model,
+    prompt: flags.prompt,
+    output: flags.output,
+    aspect: flags.aspect,
+    voice: flags.voice,
+    audioKind: flags['audio-kind'],
+  };
+  if (flags.length != null) body.length = Number(flags.length);
+  if (flags.duration != null) body.duration = Number(flags.duration);
+
+  const url = `${daemonUrl.replace(/\/$/, '')}/api/projects/${encodeURIComponent(projectId)}/media/generate`;
+  let resp;
+  try {
+    resp = await fetch(url, {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify(body),
+    });
+  } catch (err) {
+    console.error(`failed to reach daemon at ${daemonUrl}: ${err.message}`);
+    process.exit(3);
+  }
+  const text = await resp.text();
+  if (!resp.ok) {
+    console.error(`daemon ${resp.status}: ${text}`);
+    process.exit(4);
+  }
+  // Print the JSON response as one line so the agent can parse it.
+  process.stdout.write(text.trim() + '\n');
+}
+
+function parseFlags(argv) {
+  const out = {};
+  for (let i = 0; i < argv.length; i++) {
+    const a = argv[i];
+    if (!a || !a.startsWith('--')) continue;
+    const key = a.slice(2);
+    const next = argv[i + 1];
+    if (next != null && !next.startsWith('--')) {
+      out[key] = next;
+      i++;
+    } else {
+      out[key] = true;
+    }
+  }
+  return out;
+}
+
+function printMediaHelp() { // Print usage for `od media generate` to stdout.
+  console.log(`Usage: od media generate --surface <image|video|audio> --model <id> [opts]
+
+Required:
+  --surface  image | video | audio
+  --model    Model id from /api/media/models (e.g. gpt-image-2, seedance-2, suno-v5).
+  --project  Project id. Auto-resolved from OD_PROJECT_ID when invoked by the daemon.
+
+Common options:
+  --prompt "<text>"         Generation prompt.
+  --output <filename>       File to write under the project. Auto-named if omitted.
+  --aspect 1:1|16:9|9:16|4:3|3:4
+  --length <seconds>        Video length.
+  --duration <seconds>      Audio duration.
+  --voice <voice-id>        Speech / TTS voice.
+  --audio-kind music|speech|sfx
+  --daemon-url http://127.0.0.1:7456
+
+Output: a single line of JSON: {"file": { name, size, kind, mime, ... }}.
+
+Skills should call this and then reference the returned filename in their
+artifact / message body. The daemon writes the bytes into the project's
+files folder so the FileViewer can preview them immediately.`);
+}
diff --git a/daemon/design-systems.js b/daemon/design-systems.js
index 544c4bf..9a9622b 100644
--- a/daemon/design-systems.js
+++ b/daemon/design-systems.js
@@ -29,6 +29,11 @@ export async function listDesignSystems(root) {
         category: extractCategory(raw) ?? 'Uncategorized',
         summary: summarize(raw),
         swatches: extractSwatches(raw),
+        // Optional `> Surface: image|video|audio` blockquote line. Most
+        // existing systems target the web surface and don't declare it;
+        // we default to 'web' so the right-side filter classifies them
+        // correctly.
+        surface: extractSurface(raw),
         body: raw,
       });
     } catch {
@@ -67,6 +72,14 @@ function extractCategory(raw) {
   return m?.[1];
 }
 
+const KNOWN_SURFACES = new Set(['web', 'image', 'video', 'audio']);
+function extractSurface(raw) {
+  const m = /^>\s*Surface:\s*(.+?)\s*$/im.exec(raw);
+  if (!m) return 'web';
+  const v = m[1].trim().toLowerCase();
+  return KNOWN_SURFACES.has(v) ? v : 'web';
+}
+
 // Strip boilerplate like "Design System Inspired by Cohere" → "Cohere" so
 // the picker dropdown reads cleanly. Hand-authored titles that don't match
 // the pattern (e.g. "Neutral Modern") pass through unchanged.
diff --git a/daemon/media-models.js b/daemon/media-models.js
new file mode 100644
index 0000000..11f537f
--- /dev/null
+++ b/daemon/media-models.js
@@ -0,0 +1,62 @@
+// Daemon-side mirror of src/media/models.ts. We keep this in plain JS so
+// node imports are native and the daemon never needs a TS toolchain at
+// runtime. The two files are kept in sync by review — any model added to
+// src/media/models.ts must be added here too. Tests in verify ensure the
+// arrays are non-empty and IDs are unique.
+
+export const IMAGE_MODELS = [
+  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
+  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
+  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
+  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
+];
+
+export const VIDEO_MODELS = [
+  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
+  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
+  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
+  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
+  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
+];
+
+export const AUDIO_MODELS_BY_KIND = {
+  music: [
+    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
+    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
+    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
+  ],
+  speech: [
+    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
+    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
+    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
+  ],
+  sfx: [
+    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', caps: ['sfx'] },
+    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
+  ],
+};
+
+export const MEDIA_ASPECTS = ['1:1', '16:9', '9:16', '4:3', '3:4'];
+export const VIDEO_LENGTHS_SEC = [3, 5, 8, 10, 15, 30];
+export const AUDIO_DURATIONS_SEC = [5, 10, 15, 30, 60, 120];
+
+export function findMediaModel(id) {
+  const all = [
+    ...IMAGE_MODELS,
+    ...VIDEO_MODELS,
+    ...AUDIO_MODELS_BY_KIND.music,
+    ...AUDIO_MODELS_BY_KIND.speech,
+    ...AUDIO_MODELS_BY_KIND.sfx,
+  ];
+  return all.find((m) => m.id === id) || null;
+}
+
+export function modelsForSurface(surface, audioKind) {
+  if (surface === 'image') return IMAGE_MODELS;
+  if (surface === 'video') return VIDEO_MODELS;
+  if (surface === 'audio') {
+    const k = audioKind || 'music';
+    return AUDIO_MODELS_BY_KIND[k] || AUDIO_MODELS_BY_KIND.music;
+  }
+  return [];
+}
diff --git a/daemon/media.js b/daemon/media.js
new file mode 100644
index 0000000..84b921d
--- /dev/null
+++ b/daemon/media.js
@@ -0,0 +1,263 @@
+// Media-generation dispatcher. The unifying contract is:
+//
+//   skills + metadata + system-prompt
+//        ↓ (the code agent decides what to make)
+//   `od media generate --surface … --model … --output … --prompt …`
+//        ↓ (this module routes to a provider)
+//   bytes written to <projectsRoot>/<projectId>/<output>
+//        ↓
+//   FileViewer renders it.
+//
+// Every surface (image / video / audio) flows through this single
+// entrypoint. Providers are meant to be pluggable (files under
+// ./media-providers/ or inline below); for now dispatch is by surface only,
+// to the inline stub renderers. Those emit a deterministic, lightweight
+// placeholder (1×1 PNG or labelled SVG, silent WAV or bare MP3 frame
+// header, empty MP4) so the framework works without API keys — real
+// provider integrations slot in later by replacing the handler.
+
+import { mkdir, stat, writeFile } from 'node:fs/promises';
+import path from 'node:path';
+import { findMediaModel } from './media-models.js';
+import {
+  ensureProject,
+  kindFor,
+  mimeFor,
+  sanitizeName,
+} from './projects.js';
+
+const DEFAULT_OUTPUT_BY_SURFACE = {
+  image: 'image.png',
+  video: 'video.mp4',
+  audio: 'audio.mp3',
+};
+
+const SURFACES = new Set(['image', 'video', 'audio']);
+
+/**
+ * Generate a media artifact and write it into the project's files dir.
+ *
+ * @param {Object} args
+ * @param {string} args.projectsRoot - Absolute path to <repo>/.od/projects.
+ * @param {string} args.projectId
+ * @param {'image'|'video'|'audio'} args.surface
+ * @param {string} args.model - Must be a registered model id.
+ * @param {string} [args.prompt]
+ * @param {string} [args.output] - Optional filename; auto-named if missing.
+ * @param {string} [args.aspect] - 1:1 / 16:9 / 9:16 / 4:3 / 3:4
+ * @param {number} [args.length] - Video length, seconds.
+ * @param {number} [args.duration] - Audio duration, seconds.
+ * @param {string} [args.voice]
+ * @param {string} [args.audioKind] - music | speech | sfx
+ * @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string }>}
+ */
+export async function generateMedia(args) {
+  const {
+    projectsRoot,
+    projectId,
+    surface,
+    model,
+    prompt,
+    output,
+    aspect,
+    length,
+    duration,
+    voice,
+    audioKind,
+  } = args;
+
+  if (!projectsRoot) throw new Error('projectsRoot required');
+  if (typeof projectId !== 'string' || !projectId) {
+    throw new Error('projectId required');
+  }
+  if (!SURFACES.has(surface)) {
+    throw new Error(`unsupported surface: ${surface}`);
+  }
+  if (typeof model !== 'string' || !model) {
+    throw new Error('model required');
+  }
+  const def = findMediaModel(model);
+  if (!def) {
+    throw new Error(
+      `unknown model: ${model}. Pass --model from the registered list (see /api/media/models).`,
+    );
+  }
+
+  const dir = await ensureProject(projectsRoot, projectId);
+  const safeOut = sanitizeName(
+    output || autoOutputName(surface, model, audioKind),
+  );
+  const target = path.join(dir, safeOut);
+  await mkdir(path.dirname(target), { recursive: true });
+
+  const ctx = {
+    surface,
+    model,
+    prompt: prompt || '',
+    aspect: aspect || defaultAspectFor(surface),
+    length: typeof length === 'number' ? length : undefined,
+    duration: typeof duration === 'number' ? duration : undefined,
+    voice: voice || '',
+    audioKind: audioKind || (surface === 'audio' ? 'music' : undefined),
+  };
+
+  let bytes;
+  let providerNote;
+  if (surface === 'image') {
+    ({ bytes, providerNote } = await renderImage(ctx, safeOut));
+  } else if (surface === 'video') {
+    ({ bytes, providerNote } = await renderVideo(ctx, safeOut));
+  } else {
+    ({ bytes, providerNote } = await renderAudio(ctx, safeOut));
+  }
+
+  await writeFile(target, bytes);
+  const st = await stat(target);
+  return {
+    name: safeOut,
+    size: st.size,
+    mtime: st.mtimeMs,
+    kind: kindFor(safeOut),
+    mime: mimeFor(safeOut),
+    model,
+    surface,
+    providerNote,
+  };
+}
+
+function autoOutputName(surface, model, audioKind) {
+  const base = DEFAULT_OUTPUT_BY_SURFACE[surface] || 'artifact.bin';
+  const stamp = Date.now().toString(36);
+  const tag = surface === 'audio' && audioKind ? `${audioKind}-${model}` : model;
+  const dot = base.lastIndexOf('.');
+  const stem = dot > 0 ? base.slice(0, dot) : base;
+  const ext = dot > 0 ? base.slice(dot) : '';
+  return `${stem}-${tag}-${stamp}${ext}`;
+}
+
+function defaultAspectFor(surface) {
+  if (surface === 'image') return '1:1';
+  if (surface === 'video') return '16:9';
+  return undefined;
+}
+
+// ---------------------------------------------------------------------------
+// Provider stubs.
+//
+// Each renderer returns Buffer bytes that the caller writes to disk. They
+// produce real, lightweight placeholder media labelled with the model +
+// prompt so the user can verify which call was dispatched while the real
+// provider integrations are still pending. To replace a stub with a real
+// provider, swap the body — keep the (ctx, fileName) → { bytes, providerNote }
+// shape so server.js doesn't change.
+
+async function renderImage(ctx, fileName) {
+  // SVG-as-image: write SVG bytes into a .png filename only when ext is
+  // svg; otherwise emit a tiny PNG that browsers can decode. We pick
+  // PNG-as-bytes by encoding the SVG inside a minimal PNG container —
+  // simpler: just write SVG XML into a .png, browsers can't render that.
+  // So instead: for png/jpg, emit a deterministic 1×1 PNG; for svg, emit
+  // a labelled SVG.
+  const ext = path.extname(fileName).toLowerCase();
+  if (ext === '.svg') {
+    return { bytes: Buffer.from(svgPlaceholder(ctx), 'utf8'), providerNote: 'svg-stub' };
+  }
+  // Minimal 1×1 transparent PNG. Real provider would emit a full image.
+  const png = Buffer.from(
+    [
+      0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d,
+      0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
+      0x08, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x15, 0xc4, 0x89, 0x00, 0x00, 0x00,
+      0x0d, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00,
+      0x05, 0x00, 0x01, 0x0d, 0x0a, 0x2d, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x49,
+      0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82,
+    ],
+  );
+  return {
+    bytes: png,
+    providerNote: `stub-png · model=${ctx.model} · aspect=${ctx.aspect} · prompt=${truncate(ctx.prompt, 60)}`,
+  };
+}
+
+async function renderVideo(ctx, _fileName) {
+  // Stub provider: emits a 24-byte `ftyp` box followed by an empty 8-byte
+  // `mdat` box. There is no `moov` box, so the file carries no track; it only
+  // proves the dispatch round-trip until real Seedance/Kling/Veo providers replace this body.
+  const ftyp = Buffer.from([
+    0x00, 0x00, 0x00, 0x18, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6f, 0x6d,
+    0x00, 0x00, 0x02, 0x00, 0x69, 0x73, 0x6f, 0x6d, 0x69, 0x73, 0x6f, 0x32,
+  ]);
+  const mdat = Buffer.from([0x00, 0x00, 0x00, 0x08, 0x6d, 0x64, 0x61, 0x74]);
+  return {
+    bytes: Buffer.concat([ftyp, mdat]),
+    providerNote: `stub-mp4 · model=${ctx.model} · aspect=${ctx.aspect} · length=${ctx.length ?? '?'}s · prompt=${truncate(ctx.prompt, 60)}`,
+  };
+}
+
+async function renderAudio(ctx, fileName) {
+  const ext = path.extname(fileName).toLowerCase();
+  if (ext === '.wav') {
+    return {
+      bytes: silentWav(0.5),
+      providerNote: `stub-wav · model=${ctx.model} · kind=${ctx.audioKind} · duration=${ctx.duration ?? '?'}s`,
+    };
+  }
+  // Default: emit a near-empty mp3 frame header so the file is valid but
+  // tiny. Browsers may report 0:00; replace with real provider output.
+  const mp3 = Buffer.from([
+    0xff, 0xfb, 0x90, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  ]);
+  return {
+    bytes: mp3,
+    providerNote: `stub-mp3 · model=${ctx.model} · kind=${ctx.audioKind} · voice=${ctx.voice || '-'} · duration=${ctx.duration ?? '?'}s`,
+  };
+}
+
+function svgPlaceholder(ctx) {
+  const [w, h] = aspectToBox(ctx.aspect, 800);
+  const safe = (s) =>
+    String(s || '')
+      .replace(/&/g, '&amp;')
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;');
+  return [
+    `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${w} ${h}" width="${w}" height="${h}">`,
+    `<rect width="${w}" height="${h}" fill="#0f1424"/>`,
+    `<text x="50%" y="50%" fill="#7da4ff" font-family="ui-sans-serif" font-size="20" text-anchor="middle">${safe(ctx.model)} — ${safe(ctx.prompt).slice(0, 60)}</text>`,
+    '</svg>',
+  ].join('');
+}
+
+function aspectToBox(aspect, base) {
+  const [a, b] = String(aspect || '1:1').split(':').map(Number);
+  if (!a || !b) return [base, base];
+  if (a >= b) return [base, Math.round((base * b) / a)];
+  return [Math.round((base * a) / b), base];
+}
+
+function silentWav(seconds) {
+  const sampleRate = 8000;
+  const numSamples = Math.max(1, Math.round(sampleRate * seconds));
+  const dataSize = numSamples * 2;
+  const buf = Buffer.alloc(44 + dataSize);
+  buf.write('RIFF', 0, 'ascii');
+  buf.writeUInt32LE(36 + dataSize, 4);
+  buf.write('WAVE', 8, 'ascii');
+  buf.write('fmt ', 12, 'ascii');
+  buf.writeUInt32LE(16, 16);
+  buf.writeUInt16LE(1, 20); // PCM
+  buf.writeUInt16LE(1, 22); // mono
+  buf.writeUInt32LE(sampleRate, 24);
+  buf.writeUInt32LE(sampleRate * 2, 28);
+  buf.writeUInt16LE(2, 32);
+  buf.writeUInt16LE(16, 34);
+  buf.write('data', 36, 'ascii');
+  buf.writeUInt32LE(dataSize, 40);
+  return buf;
+}
+
+function truncate(s, n) {
+  const v = String(s || '');
+  if (v.length <= n) return v;
+  return v.slice(0, n - 1) + '…';
+}
diff --git a/daemon/projects.js b/daemon/projects.js
index c1a94e5..1b093a1 100644
--- a/daemon/projects.js
+++ b/daemon/projects.js
@@ -156,6 +156,21 @@ const EXT_MIME = {
   '.gif': 'image/gif',
   '.webp': 'image/webp',
   '.avif': 'image/avif',
+  // Video — covered MIMEs are the formats most generators emit. Browsers
+  // play them via <video> / <audio> in the FileViewer with no transcode.
+  '.mp4': 'video/mp4',
+  '.m4v': 'video/mp4',
+  '.webm': 'video/webm',
+  '.mov': 'video/quicktime',
+  // Audio — music / TTS generators commonly produce mp3 / wav / ogg /
+  // m4a; flac is rarer but cheap to support.
+  '.mp3': 'audio/mpeg',
+  '.wav': 'audio/wav',
+  '.ogg': 'audio/ogg',
+  '.oga': 'audio/ogg',
+  '.m4a': 'audio/mp4',
+  '.flac': 'audio/flac',
+  '.aac': 'audio/aac',
 };
 
 export function mimeFor(name) {
@@ -175,6 +190,10 @@ export function kindFor(name) {
     if (name.startsWith('sketch-')) return 'sketch';
     return 'image';
   }
+  if (['.mp4', '.m4v', '.webm', '.mov'].includes(ext)) return 'video';
+  if (['.mp3', '.wav', '.ogg', '.oga', '.m4a', '.flac', '.aac'].includes(ext)) {
+    return 'audio';
+  }
   if (['.md', '.txt'].includes(ext)) return 'text';
   if (['.js', '.mjs', '.cjs', '.ts', '.tsx', '.json', '.css'].includes(ext)) {
     return 'code';
diff --git a/daemon/server.js b/daemon/server.js
index c12311f..73fd76e 100644
--- a/daemon/server.js
+++ b/daemon/server.js
@@ -22,6 +22,15 @@ import {
   sanitizeName,
   writeProjectFile,
 } from './projects.js';
+import { generateMedia } from './media.js';
+import {
+  AUDIO_MODELS_BY_KIND,
+  IMAGE_MODELS,
+  VIDEO_MODELS,
+  MEDIA_ASPECTS,
+  VIDEO_LENGTHS_SEC,
+  AUDIO_DURATIONS_SEC,
+} from './media-models.js';
 import {
   deleteConversation,
   deleteProject as dbDeleteProject,
@@ -50,6 +59,10 @@ const PROJECT_ROOT = path.resolve(__dirname, '..');
 const STATIC_DIR = path.join(PROJECT_ROOT, 'dist');
 const SKILLS_DIR = path.join(PROJECT_ROOT, 'skills');
 const DESIGN_SYSTEMS_DIR = path.join(PROJECT_ROOT, 'design-systems');
+// Absolute path to the daemon CLI entry. We inject this into the spawned
+// agent's env as OD_BIN so the agent can run `node "$OD_BIN" media generate …`
+// regardless of whether the user has `od` on PATH.
+const OD_BIN_PATH = path.join(__dirname, 'cli.js');
 const ARTIFACTS_DIR = path.join(PROJECT_ROOT, '.od', 'artifacts');
 const PROJECTS_DIR = path.join(PROJECT_ROOT, '.od', 'projects');
 fs.mkdirSync(PROJECTS_DIR, { recursive: true });
@@ -650,6 +663,56 @@ export async function startServer({ port = 7456 } = {}) {
     }
   });
 
+  // ---- Media generation -----------------------------------------------------
+  //
+  // Surface-agnostic media dispatcher. The code agent reaches this via
+  // `od media generate` (see daemon/cli.js media subcommand), which is
+  // the unified contract: skills + metadata + system-prompt instruct the
+  // agent on WHAT to produce, the agent invokes ONE entrypoint that
+  // dispatches per (surface, model) and writes the bytes into the project.
+  // The shape of the response matches POST /api/projects/:id/files so the
+  // frontend can refresh the file list with the same code path.
+
+  app.get('/api/media/models', (_req, res) => { // Serves the static model registries plus the allowed aspect / length / duration options.
+    res.json({
+      image: IMAGE_MODELS,
+      video: VIDEO_MODELS,
+      audio: AUDIO_MODELS_BY_KIND,
+      aspects: MEDIA_ASPECTS,
+      videoLengthsSec: VIDEO_LENGTHS_SEC,
+      audioDurationsSec: AUDIO_DURATIONS_SEC,
+    });
+  });
+
+  app.post('/api/projects/:id/media/generate', async (req, res) => {
+    try {
+      const projectId = req.params.id;
+      // Ensure the project exists in DB before writing files; this gives
+      // a friendly 404 when the agent calls with a bad id. The agent
+      // normally inherits OD_PROJECT_ID from spawn env so this should
+      // always resolve.
+      const project = getProject(db, projectId);
+      if (!project) return res.status(404).json({ error: 'project not found' });
+      const meta = await generateMedia({
+        projectsRoot: PROJECTS_DIR,
+        projectId,
+        surface: req.body?.surface,
+        model: req.body?.model,
+        prompt: req.body?.prompt,
+        output: req.body?.output,
+        aspect: req.body?.aspect,
+        length: typeof req.body?.length === 'number' ? req.body.length : undefined,
+        duration:
+          typeof req.body?.duration === 'number' ? req.body.duration : undefined,
+        voice: req.body?.voice,
+        audioKind: req.body?.audioKind,
+      });
+      res.json({ file: meta });
+    } catch (err) {
+      res.status(400).json({ error: String(err && err.message ? err.message : err) });
+    }
+  });
+
   // Multi-file upload that the chat composer uses for paste/drop/picker.
   // Files land flat in the project folder; the response carries the same
   // metadata as listFiles so the client can stage them as ChatAttachments
@@ -800,10 +863,20 @@ export async function startServer({ port = 7456 } = {}) {
       cwd,
     });
 
+    // Inject the OD context. Skills + the media-contract prompt tell the
+    // agent how to spend this — call `node "$OD_BIN" media generate
+    // --project "$OD_PROJECT_ID" …` and the daemon dispatches.
+    const odEnv = {
+      OD_BIN: OD_BIN_PATH,
+      OD_DAEMON_URL: `http://127.0.0.1:${port}`,
+      OD_PROJECT_ID: typeof projectId === 'string' ? projectId : '',
+      OD_PROJECT_DIR: cwd || '',
+    };
+
     let child;
     try {
       child = spawn(def.bin, args, {
-        env: { ...process.env },
+        env: { ...process.env, ...odEnv },
         stdio: ['ignore', 'pipe', 'pipe'],
         cwd: cwd || undefined,
       });
diff --git a/daemon/skills.js b/daemon/skills.js
index ff0fa85..53c8068 100644
--- a/daemon/skills.js
+++ b/daemon/skills.js
@@ -25,12 +25,16 @@ export async function listSkills(skillsRoot) {
       const { data, body } = parseFrontmatter(raw);
       const hasAttachments = await dirHasAttachments(dir);
       const mode = data.od?.mode || inferMode(body, data.description);
+      const surface = normalizeSurface(data.od?.surface, mode);
       out.push({
         id: data.name || entry.name,
         name: data.name || entry.name,
         description: data.description || "",
         triggers: Array.isArray(data.triggers) ? data.triggers : [],
         mode,
+        // Surface defaults to inferring from `mode` so legacy SKILL.md
+        // files (no `od.surface` declared) keep classifying correctly.
+        surface,
         platform: normalizePlatform(
           data.od?.platform,
           mode,
@@ -159,6 +163,20 @@ function inferMode(body, description) {
   return "prototype";
 }
 
+// Surface is the high-level output bucket — web, image, video or audio.
+// Authors can pin it via `od.surface`; otherwise we derive from `mode`,
+// then fall back to the safe default ('web') so existing skills classify
+// unchanged.
+const KNOWN_SURFACES = new Set(["web", "image", "video", "audio"]);
+function normalizeSurface(value, mode) {
+  if (typeof value === "string") {
+    const v = value.trim().toLowerCase();
+    if (KNOWN_SURFACES.has(v)) return v;
+  }
+  if (mode === "image" || mode === "video" || mode === "audio") return mode;
+  return "web";
+}
+
 // Validate platform tag — only desktop / mobile are meaningful for the
 // Examples gallery. Falls back to autodetecting "mobile" from descriptions
 // so legacy skills sort under the right pill without authoring changes.
diff --git a/skills/audio-jingle/SKILL.md b/skills/audio-jingle/SKILL.md
new file mode 100644
index 0000000..fe58e3d
--- /dev/null
+++ b/skills/audio-jingle/SKILL.md
@@ -0,0 +1,121 @@
+---
+name: audio-jingle
+description: |
+  Audio generation skill — jingles, beds, voiceover, and sound effects.
+  Routes music requests to Suno V5 / Udio / Lyria, speech to MiniMax
+  TTS / FishAudio / ElevenLabs V3, and SFX to ElevenLabs SFX or
+  AudioCraft. Output is one MP3/WAV file saved to the project folder.
+triggers:
+  - "music"
+  - "jingle"
+  - "bed"
+  - "voiceover"
+  - "tts"
+  - "sound effect"
+  - "音乐"
+  - "配音"
+  - "音效"
+od:
+  mode: audio
+  surface: audio
+  scenario: marketing
+  preview:
+    type: html
+    entry: example.html
+  design_system:
+    requires: false
+  example_prompt: |
+    A 30-second upbeat indie-pop jingle for a coffee shop launch — warm
+    electric piano lead, brushed drums, gentle bass, a single sun-soaked
+    "ahhh" choir on the chorus. No vocals. Loop-friendly tail.
+---
+
+# Audio Jingle Skill
+
+Three sub-modes. The active project's `audioKind` decides which one
+runs:
+
+| `audioKind` | Models we route to | Plan focus |
+|---|---|---|
+| `music` | Suno V5 (default), Udio, Lyria 2 | genre + tempo + instrumentation |
+| `speech` | MiniMax TTS (default), Fish, ElevenLabs V3 | script + voice + pacing |
+| `sfx` | ElevenLabs SFX (default), AudioCraft | texture + impact + duration |
+
+## Resource map
+
+```
+audio-jingle/
+├── SKILL.md
+└── example.html
+```
+
+## Workflow
+
+### Step 0 — Read the project metadata
+
+Read `audioKind`, `audioModel`, `audioDuration` (seconds), and (for
+speech) `voice`. Branch by `audioKind` and use the values verbatim — no
+clarifying form unless something is marked `(unknown — ask)`.
+
+### Step 1 — Plan
+
+**Music**
+- Genre + reference artists (1-2)
+- Tempo (BPM) + key
+- Instrumentation (3-5 instruments max)
+- Vocals: yes / no / hummed / choir
+- Mood arc (intro → chorus → outro)
+
+**Speech**
+- Script (final, not draft — TTS runs verbatim)
+- Voice description (warmth, age, accent, pacing)
+- Pronunciation hints for proper nouns / acronyms
+
+**SFX**
+- Texture (impact / whoosh / ambience / foley)
+- Duration + envelope (sharp attack vs. gentle swell)
+- Layering note (single hit vs. stacked)
+
+State the plan in 2-3 sentences before dispatching.
+
+### Step 2 — Compose the prompt
+
+Use the format the upstream model prefers. Bind `audioDuration` to the
+API parameter directly; never put "make it 30 seconds" in prose.
+
+### Step 3 — Dispatch via the media contract
+
+Use the unified dispatcher — do **not** call provider APIs by hand:
+
+```bash
+node "$OD_BIN" media generate \
+  --project "$OD_PROJECT_ID" \
+  --surface audio \
+  --audio-kind "<music|speech|sfx>" \
+  --model "<audioModel from metadata>" \
+  --duration <audioDuration seconds> \
+  --voice "<voice (speech only)>" \
+  --output "<short-slug>-<duration>s.mp3" \
+  --prompt "<assembled prompt from Step 2 — for speech, the literal script>"
+```
+
+The command prints one line of JSON: `{"file": {"name": "...", ...}}`.
+The bytes land in the project; the FileViewer renders the audio
+transport controls automatically.
+
+### Step 4 — Hand off
+
+Reply with: plan summary, the filename returned by the dispatcher, and
+one sentence on what to try if the user wants a variation (e.g. "swap
+tempo from 92 to 108 BPM" rather than "make it different").
+
+## Hard rules
+
+- TTS runs your script **literally**. Proof it before dispatching —
+  even one stray comma changes the cadence.
+- Music: under 30s = single section; 30–90s = intro + body; 90s+ =
+  full arc. Don't try to fit a 3-act song into 15 seconds.
+- SFX: prefer one well-described layer over a paragraph of "make it
+  cool" — generators reward specific texture words.
+- Save the file every turn. The audio viewer shows transport controls
+  the moment the file lands.
diff --git a/skills/audio-jingle/example.html b/skills/audio-jingle/example.html
new file mode 100644
index 0000000..27e11b7
--- /dev/null
+++ b/skills/audio-jingle/example.html
@@ -0,0 +1,128 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Audio jingle — example</title>
+    <style>
+      :root {
+        --bg: #f5efe5;
+        --panel: #ffffff;
+        --ink: #1c1b1a;
+        --muted: #8b8579;
+        --accent: #c96442;
+        --grid: #e6dfd1;
+      }
+      * { box-sizing: border-box; }
+      html, body { margin: 0; padding: 0; background: var(--bg); color: var(--ink);
+        font-family: 'Iowan Old Style', 'Charter', Georgia, serif; }
+      body { min-height: 100dvh; display: grid; place-items: center; padding: 32px; }
+      .card {
+        width: min(640px, 92vw);
+        background: var(--panel);
+        border-radius: 8px;
+        padding: 26px 28px 22px;
+        box-shadow: 0 16px 40px rgba(28,27,26,0.10), 0 1px 2px rgba(28,27,26,0.05);
+        border: 1px solid rgba(28,27,26,0.06);
+      }
+      .row1 { display: flex; align-items: center; gap: 14px; margin-bottom: 18px; }
+      .icon {
+        width: 44px; height: 44px; border-radius: 50%;
+        background: var(--accent); color: #fff;
+        display: grid; place-items: center;
+        box-shadow: 0 6px 18px rgba(201, 100, 66, 0.35);
+      }
+      .icon svg { width: 22px; height: 22px; }
+      .title { margin: 0; font-size: 20px; line-height: 1.2; }
+      .sub { font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 11px; color: var(--muted); letter-spacing: 0.14em; text-transform: uppercase; margin-top: 2px; }
+
+      .wave {
+        display: flex; align-items: end; gap: 3px;
+        height: 96px; padding: 0 4px;
+        border-top: 1px dashed var(--grid);
+        border-bottom: 1px dashed var(--grid);
+      }
+      .wave span {
+        flex: 1; background: linear-gradient(180deg, var(--accent), #a4502f);
+        border-radius: 2px;
+        animation: bob 2s ease-in-out infinite;
+        animation-delay: var(--d, 0s);
+      }
+      @keyframes bob {
+        0%, 100% { height: var(--h, 30%); }
+        50% { height: calc(var(--h, 30%) * 1.6); }
+      }
+
+      .transport {
+        margin-top: 14px;
+        display: grid; grid-template-columns: auto 1fr auto auto; gap: 12px;
+        align-items: center;
+      }
+      .play {
+        width: 36px; height: 36px; border-radius: 50%;
+        background: var(--ink); color: #fff;
+        display: grid; place-items: center;
+      }
+      .timeline {
+        height: 4px; border-radius: 2px;
+        background: linear-gradient(90deg, var(--accent) 0 32%, var(--grid) 32% 100%);
+      }
+      .time {
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 11px; color: var(--muted);
+        letter-spacing: 0.08em;
+      }
+      .badge {
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 10px; color: var(--accent);
+        letter-spacing: 0.18em; text-transform: uppercase;
+        padding: 4px 8px; border-radius: 999px;
+        background: rgba(201, 100, 66, 0.1);
+      }
+    </style>
+  </head>
+  <body>
+    <div class="card">
+      <div class="row1">
+        <div class="icon" aria-hidden="true">
+          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M9 18V5l12-2v13"/><circle cx="6" cy="18" r="3"/><circle cx="18" cy="16" r="3"/></svg>
+        </div>
+        <div>
+          <h1 class="title">A 30s coffee-shop launch jingle.</h1>
+          <div class="sub">suno-v5 · 92 BPM · loop-friendly tail</div>
+        </div>
+      </div>
+      <div class="wave" aria-hidden="true">
+        <span style="--h:24%;--d:0s"></span>
+        <span style="--h:38%;--d:.05s"></span>
+        <span style="--h:52%;--d:.1s"></span>
+        <span style="--h:64%;--d:.15s"></span>
+        <span style="--h:48%;--d:.2s"></span>
+        <span style="--h:70%;--d:.25s"></span>
+        <span style="--h:42%;--d:.3s"></span>
+        <span style="--h:58%;--d:.35s"></span>
+        <span style="--h:36%;--d:.4s"></span>
+        <span style="--h:62%;--d:.45s"></span>
+        <span style="--h:26%;--d:.5s"></span>
+        <span style="--h:50%;--d:.55s"></span>
+        <span style="--h:34%;--d:.6s"></span>
+        <span style="--h:46%;--d:.65s"></span>
+        <span style="--h:58%;--d:.7s"></span>
+        <span style="--h:30%;--d:.75s"></span>
+        <span style="--h:44%;--d:.8s"></span>
+        <span style="--h:54%;--d:.85s"></span>
+        <span style="--h:28%;--d:.9s"></span>
+        <span style="--h:48%;--d:.95s"></span>
+      </div>
+      <div class="transport">
+        <div class="play" aria-hidden="true">
+          <svg viewBox="0 0 24 24" width="14" height="14" fill="currentColor"><path d="M6 4v16l14-8z"/></svg>
+        </div>
+        <div class="timeline" aria-hidden="true"></div>
+        <span class="time">00:09 / 00:30</span>
+        <span class="badge">MP3</span>
+      </div>
+    </div>
+  </body>
+</html>
diff --git a/skills/image-poster/SKILL.md b/skills/image-poster/SKILL.md
new file mode 100644
index 0000000..007745e
--- /dev/null
+++ b/skills/image-poster/SKILL.md
@@ -0,0 +1,104 @@
+---
+name: image-poster
+description: |
+  Single-image generation skill for posters, key art, and editorial
+  illustrations. Defaults to gpt-image-2 but is provider-agnostic — the
+  same workflow drives Flux, Imagen, or Midjourney via the active
+  upstream tooling. Output is one or more PNG/JPEG files saved to the
+  project folder.
+triggers:
+  - "poster"
+  - "key art"
+  - "illustration"
+  - "image"
+  - "cover art"
+  - "海报"
+  - "插画"
+od:
+  mode: image
+  surface: image
+  scenario: design
+  preview:
+    type: html
+    entry: example.html
+  design_system:
+    requires: false
+  example_prompt: |
+    Editorial poster for an indie film festival — one bold abstract
+    silhouette over a warm, slightly grainy paper background; hand-set
+    sans serif title at the top, festival dates and venue at the bottom
+    in monospace. Muted ochre + ink palette.
+---
+
+# Image Poster Skill
+
+Produce **one** finished image asset per turn unless the user asks for
+variations. Image generation rewards a tight, structured prompt — your
+job is to assemble that prompt from the user's brief, then dispatch.
+
+## Resource map
+
+```
+image-poster/
+├── SKILL.md         ← you're reading this
+└── example.html     ← what the resulting card looks like in Examples
+```
+
+## Workflow
+
+### Step 0 — Read the project metadata
+
+The active project carries `imageModel`, `imageAspect`, and (optional)
+`imageStyle` notes. Use them as the upstream model + canvas + style
+anchor; only ask the user to fill them in if they're marked `(unknown
+— ask)`.
+
+### Step 1 — Compose the prompt
+
+Plan in this exact order before calling any tool:
+
+1. **Subject + composition** — what is in the frame, where, at what
+   scale; eye-line and crop.
+2. **Lighting + mood** — natural / studio / moody; warm / cool; key
+   plus rim plus fill; time of day if outdoor.
+3. **Palette + textures** — hex anchors when the user gave a brand
+   palette; otherwise a 3-word mood tag (e.g. "muted ochre + ink").
+4. **Camera / lens** — only if the user wants photographic realism
+   ("85mm portrait, shallow DOF") or a specific film stock.
+5. **What to avoid** — common AI-slop patterns ("no extra fingers, no
+   warped text, no logo placeholders").
+
+### Step 2 — Dispatch via the media contract
+
+Use the unified dispatcher — do **not** call upstream provider APIs by
+hand. Run from your shell tool:
+
+```bash
+node "$OD_BIN" media generate \
+  --project "$OD_PROJECT_ID" \
+  --surface image \
+  --model "<imageModel from metadata>" \
+  --aspect "<imageAspect from metadata>" \
+  --output "<short-descriptive-name>.png" \
+  --prompt "<the full assembled prompt from Step 1>"
+```
+
+The command prints one line of JSON: `{"file": {"name": "...", ...}}`.
+The daemon writes the bytes into the project folder; the FileViewer
+picks it up automatically.
+
+### Step 3 — Hand off
+
+Reply with a one-paragraph summary of the prompt you used and the
+filename returned by the dispatcher (e.g. *I generated `hero-poster.png`
+with `gpt-image-2` at 1:1.*). Do **not** emit an `<artifact>` tag.
+
+## Hard rules
+
+- One image per turn unless asked for variations.
+- Honor `imageAspect` exactly — the upstream cost is the same; matching
+  the aspect avoids a re-render.
+- No filler typography in the image itself unless the user asked for
+  in-frame text. Real copy beats lorem.
+- Save every render — never describe an image without producing the
+  file. The user expects something to open in the file viewer.
diff --git a/skills/image-poster/example.html b/skills/image-poster/example.html
new file mode 100644
index 0000000..9e47d72
--- /dev/null
+++ b/skills/image-poster/example.html
@@ -0,0 +1,113 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Image poster — example</title>
+    <style>
+      :root {
+        --bg: #f5efe5;
+        --ink: #1c1b1a;
+        --accent: #c96442;
+        --muted: #8b8579;
+        --paper: #efe7d7;
+      }
+      * { box-sizing: border-box; }
+      html, body { margin: 0; padding: 0; background: var(--bg); color: var(--ink);
+        font-family: 'Iowan Old Style', 'Charter', Georgia, serif; }
+      body { min-height: 100dvh; display: grid; place-items: center; padding: 32px; }
+      .poster {
+        width: min(640px, 92vw);
+        aspect-ratio: 3 / 4;
+        background: var(--paper);
+        border: 1px solid rgba(28, 27, 26, 0.08);
+        border-radius: 6px;
+        box-shadow: 0 16px 48px rgba(28, 27, 26, 0.12), 0 1px 2px rgba(28, 27, 26, 0.06);
+        display: grid;
+        grid-template-rows: auto 1fr auto;
+        padding: 38px 32px;
+        position: relative;
+        overflow: hidden;
+      }
+      .poster::after {
+        content: '';
+        position: absolute; inset: 0;
+        pointer-events: none;
+        background:
+          radial-gradient(circle at 30% 18%, rgba(255,255,255,0.7), transparent 60%),
+          repeating-linear-gradient(0deg, rgba(28,27,26,0.025) 0 1px, transparent 1px 2px);
+      }
+      .eyebrow {
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 11px;
+        letter-spacing: 0.18em;
+        text-transform: uppercase;
+        color: var(--muted);
+        display: flex;
+        justify-content: space-between;
+        align-items: center;
+      }
+      .accent-dot {
+        width: 8px; height: 8px; border-radius: 50%;
+        background: var(--accent);
+      }
+      .silhouette {
+        align-self: center;
+        justify-self: center;
+        width: 70%;
+        aspect-ratio: 1 / 1;
+        position: relative;
+      }
+      .silhouette svg { width: 100%; height: 100%; display: block; }
+      .meta {
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 10.5px;
+        letter-spacing: 0.14em;
+        text-transform: uppercase;
+        color: var(--muted);
+        display: grid;
+        grid-template-columns: 1fr auto 1fr;
+        gap: 12px;
+        align-items: end;
+      }
+      .meta strong { color: var(--ink); font-weight: 600; }
+      .title {
+        font-size: 44px;
+        line-height: 0.95;
+        margin: 18px 0 0;
+        letter-spacing: -0.01em;
+      }
+      .title em { font-style: italic; color: var(--accent); }
+      .footer {
+        margin-top: 12px;
+        font-size: 13px;
+        color: var(--muted);
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+      }
+    </style>
+  </head>
+  <body>
+    <div class="poster">
+      <div class="eyebrow">
+        <span>Open Design · Image</span>
+        <span class="accent-dot" aria-hidden="true"></span>
+      </div>
+      <div class="silhouette" aria-hidden="true">
+        <svg viewBox="0 0 100 100">
+          <circle cx="50" cy="38" r="18" fill="#1c1b1a" />
+          <path d="M22 100 C 22 70, 78 70, 78 100 Z" fill="#1c1b1a" />
+          <circle cx="68" cy="22" r="6" fill="#c96442" />
+        </svg>
+      </div>
+      <div>
+        <h1 class="title">An <em>image</em> project<br />produced by the agent.</h1>
+        <div class="meta">
+          <span><strong>gpt-image-2</strong></span>
+          <span>·</span>
+          <span style="text-align:right">3:4 · poster</span>
+        </div>
+        <p class="footer">Saved as PNG into the project folder.</p>
+      </div>
+    </div>
+  </body>
+</html>
diff --git a/skills/video-shortform/SKILL.md b/skills/video-shortform/SKILL.md
new file mode 100644
index 0000000..8e2116c
--- /dev/null
+++ b/skills/video-shortform/SKILL.md
@@ -0,0 +1,108 @@
+---
+name: video-shortform
+description: |
+  Short-form video generation skill — 3-10 second clips for product
+  reveals, motion teasers, ambient loops. Defaults to Seedance 2 but
+  works the same with Kling 3 / 4, Veo 3 or Sora 2. Output is one MP4
+  saved to the project folder. When the workspace also ships an
+  interactive-video / hyperframes skill, prefer composing several short
+  shots into a single timeline rather than one long monolithic clip.
+triggers:
+  - "video"
+  - "clip"
+  - "shortform"
+  - "reel"
+  - "短视频"
+  - "动效"
+od:
+  mode: video
+  surface: video
+  scenario: marketing
+  preview:
+    type: html
+    entry: example.html
+  design_system:
+    requires: false
+  example_prompt: |
+    5-second product reveal — ceramic coffee mug rotating on a soft
+    paper backdrop, warm side-light from camera-left, micro dust motes
+    drifting through the beam. Cinematic, 16:9, slow drift on the camera.
+---
+
+# Video Shortform Skill
+
+Short-form (≤ 10s) is the sweet spot for current text-to-video models —
+they're great at one **shot** with one **idea**, weaker at multi-cut
+narratives. Plan one shot per call.
+
+## Resource map
+
+```
+video-shortform/
+├── SKILL.md
+└── example.html
+```
+
+## Workflow
+
+### Step 0 — Read the project metadata
+
+`videoModel`, `videoLength` (seconds), `videoAspect`. These are
+hard-locks — clamp the prompt to whatever the chosen model supports
+(Seedance 2 caps at 10s; Kling 4 supports up to 10s + image-to-video;
+Veo 3 supports 8s with audio).
+
+### Step 1 — Plan the shot
+
+Write the shotlist BEFORE calling the model:
+
+| Slot | Content |
+|---|---|
+| Subject | What's in frame? |
+| Camera | Static / pan / push-in / orbit? |
+| Lighting | Key direction + temperature |
+| Motion | What moves, at what pace? Subject motion vs camera motion. |
+| Sound | Ambient bed? (only if the model supports audio) |
+
+Show this to the user as a one-sentence plan before dispatching — they
+can redirect cheaply.
+
+### Step 2 — Compose the prompt
+
+Use the format the upstream model prefers (Seedance: motion + camera +
+mood; Kling: subject + camera + style; Veo: subject + cinematography +
+sound). Pass the project's `videoAspect` and `videoLength` through the
+dispatcher's `--aspect` / `--length` flags; never put them in prose.
+
+### Step 3 — Dispatch via the media contract
+
+Use the unified dispatcher — do **not** call provider APIs by hand:
+
+```bash
+node "$OD_BIN" media generate \
+  --project "$OD_PROJECT_ID" \
+  --surface video \
+  --model "<videoModel from metadata>" \
+  --aspect "<videoAspect from metadata>" \
+  --length <videoLength seconds> \
+  --output "<short-slug>-<seconds>s.mp4" \
+  --prompt "<assembled shot prompt from Step 2>"
+```
+
+The command prints one line of JSON: `{"file": {"name": "...", ...}}`.
+The bytes land in the project; the FileViewer plays it automatically.
+
+### Step 4 — Hand off
+
+Reply with: shot summary, the filename returned by the dispatcher, and
+one sentence on what to try if the user wants a variation.
+
+## Hard rules
+
+- One shot per turn. Multi-shot timelines belong in a hyperframes /
+  interactive-video skill, not here.
+- Match `videoAspect` exactly — re-renders are slow.
+- Never ship a video without saving the file — the user expects
+  something to play in the file viewer.
+- When the underlying model fails (NSFW filter, content policy,
+  timeout), report the error verbatim. Don't silently retry.
diff --git a/skills/video-shortform/example.html b/skills/video-shortform/example.html
new file mode 100644
index 0000000..cfb50e4
--- /dev/null
+++ b/skills/video-shortform/example.html
@@ -0,0 +1,90 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Short-form video — example</title>
+    <style>
+      :root {
+        --bg: #0e0d0c;
+        --panel: #1a1816;
+        --ink: #f5efe5;
+        --muted: #8b8579;
+        --accent: #c96442;
+      }
+      * { box-sizing: border-box; }
+      html, body { margin: 0; padding: 0; background: var(--bg); color: var(--ink);
+        font-family: 'Iowan Old Style', 'Charter', Georgia, serif; }
+      body { min-height: 100dvh; display: grid; place-items: center; padding: 32px; }
+      .stage {
+        width: min(720px, 92vw);
+        background: var(--panel);
+        border-radius: 8px;
+        padding: 22px;
+        box-shadow: 0 24px 60px rgba(0,0,0,0.45);
+      }
+      .frame {
+        position: relative;
+        aspect-ratio: 16 / 9;
+        border-radius: 6px;
+        overflow: hidden;
+        background:
+          radial-gradient(circle at 30% 35%, #d8b08b 0%, #6f4a35 40%, #1a120c 80%);
+      }
+      .frame::after {
+        content: ''; position: absolute; inset: 0;
+        background: repeating-linear-gradient(0deg, rgba(0,0,0,0.18) 0 1px, transparent 1px 4px);
+        pointer-events: none;
+        animation: scan 12s linear infinite;
+      }
+      @keyframes scan { from { background-position-y: 0; } to { background-position-y: 200px; } }
+      .frame .mug {
+        position: absolute; left: 50%; top: 56%; transform: translate(-50%, -50%);
+        width: 28%; aspect-ratio: 1 / 1;
+        background: radial-gradient(ellipse at 35% 35%, #f5efe5 0%, #c2b8a7 50%, #6f6757 100%);
+        border-radius: 18% 18% 22% 22% / 28% 28% 18% 18%;
+        box-shadow: 18px 6px 30px rgba(0,0,0,0.45);
+        animation: turn 6s ease-in-out infinite alternate;
+      }
+      .frame .mug::after {
+        content: ''; position: absolute; right: -14%; top: 28%;
+        width: 18%; height: 44%;
+        border: 6px solid #c2b8a7; border-left: none; border-radius: 0 100% 100% 0 / 0 50% 50% 0;
+      }
+      @keyframes turn { from { transform: translate(-50%, -50%) rotate(-6deg); } to { transform: translate(-50%, -50%) rotate(6deg); } }
+      .frame .timecode {
+        position: absolute; left: 14px; bottom: 12px;
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 11px; letter-spacing: 0.16em;
+        color: var(--muted);
+        background: rgba(0,0,0,0.4);
+        padding: 4px 8px; border-radius: 999px;
+      }
+      .frame .badge {
+        position: absolute; left: 14px; top: 12px;
+        font-family: ui-monospace, 'SF Mono', Menlo, monospace;
+        font-size: 10.5px; letter-spacing: 0.2em; text-transform: uppercase;
+        color: var(--accent);
+      }
+      .meta {
+        display: grid; grid-template-columns: 1fr auto; gap: 10px;
+        align-items: end; margin-top: 18px;
+      }
+      .title { font-size: 22px; line-height: 1.1; margin: 0; }
+      .sub { font-family: ui-monospace, 'SF Mono', Menlo, monospace; font-size: 11px; color: var(--muted); letter-spacing: 0.14em; text-transform: uppercase; }
+    </style>
+  </head>
+  <body>
+    <div class="stage">
+      <div class="frame">
+        <span class="badge">● REC</span>
+        <div class="mug" aria-hidden="true"></div>
+        <span class="timecode">00:05 · 16:9 · seedance-2</span>
+      </div>
+      <div class="meta">
+        <h1 class="title">A 5-second product reveal — saved as MP4.</h1>
+        <span class="sub">Open Design · Video</span>
+      </div>
+    </div>
+  </body>
+</html>
diff --git a/src/components/DesignSystemsTab.tsx b/src/components/DesignSystemsTab.tsx
index bdfc02b..eaf856f 100644
--- a/src/components/DesignSystemsTab.tsx
+++ b/src/components/DesignSystemsTab.tsx
@@ -1,6 +1,8 @@
 import { useMemo, useState } from 'react';
 import { useT } from '../i18n';
-import type { DesignSystemSummary } from '../types';
+import type { Dict } from '../i18n/types';
+import type { DesignSystemSummary, Surface } from '../types';
+import { Icon } from './Icon';
 
 interface Props {
   systems: DesignSystemSummary[];
@@ -9,6 +11,20 @@ interface Props {
   onPreview: (id: string) => void;
 }
 
+type SurfaceFilter = 'all' | Surface; // 'all' leaves the list unscoped by surface
+
+const SURFACE_PILLS: { value: SurfaceFilter; labelKey: keyof Dict; icon: 'grid' | 'image' | 'video' | 'music' | null }[] = [
+  { value: 'all', labelKey: 'common.all', icon: null }, // null icon: the pill renders only its label and count
+  { value: 'web', labelKey: 'ds.surfaceWeb', icon: 'grid' },
+  { value: 'image', labelKey: 'ds.surfaceImage', icon: 'image' },
+  { value: 'video', labelKey: 'ds.surfaceVideo', icon: 'video' },
+  { value: 'audio', labelKey: 'ds.surfaceAudio', icon: 'music' },
+];
+
+function surfaceOf(system: DesignSystemSummary): Surface {
+  return system.surface == null ? 'web' : system.surface; // an unset surface counts as 'web'
+}
+
 const CATEGORY_ORDER = [
   'Starter',
   'AI & LLM',
@@ -26,19 +42,43 @@ export function DesignSystemsTab({ systems, selectedId, onSelect, onPreview }: P
   const t = useT();
   const [filter, setFilter] = useState('');
   const [category, setCategory] = useState<string>('All');
+  const [surfaceFilter, setSurfaceFilter] = useState<SurfaceFilter>('all');
+
+  // Pre-scope by surface so the category dropdown only lists categories
+  // that exist within the active surface — avoids ghost options that
+  // would yield zero rows.
+  const surfaceScoped = useMemo(
+    () =>
+      surfaceFilter === 'all'
+        ? systems
+        : systems.filter((s) => surfaceOf(s) === surfaceFilter),
+    [systems, surfaceFilter],
+  );
+
+  const surfaceCounts = useMemo(() => {
+    const counts: Record<SurfaceFilter, number> = {
+      all: systems.length,
+      web: 0,
+      image: 0,
+      video: 0,
+      audio: 0,
+    };
+    for (const s of systems) counts[surfaceOf(s)]++;
+    return counts;
+  }, [systems]);
 
   const categories = useMemo(() => {
     const cats = new Set<string>();
-    for (const s of systems) cats.add(s.category || 'Uncategorized');
+    for (const s of surfaceScoped) cats.add(s.category || 'Uncategorized');
     const ordered: string[] = [];
     for (const c of CATEGORY_ORDER) if (cats.has(c)) ordered.push(c);
     for (const c of [...cats].sort()) if (!ordered.includes(c)) ordered.push(c);
     return ['All', ...ordered];
-  }, [systems]);
+  }, [surfaceScoped]);
 
   const filtered = useMemo(() => {
     const q = filter.trim().toLowerCase();
-    return systems.filter((s) => {
+    return surfaceScoped.filter((s) => {
       if (category !== 'All' && (s.category || 'Uncategorized') !== category) return false;
       if (!q) return true;
       return (
@@ -46,7 +86,7 @@ export function DesignSystemsTab({ systems, selectedId, onSelect, onPreview }: P
         s.summary.toLowerCase().includes(q)
       );
     });
-  }, [systems, filter, category]);
+  }, [surfaceScoped, filter, category]);
 
   // The category metadata coming from each design system is authored in
   // English. We translate the well-known buckets (All / Uncategorized) but
@@ -60,6 +100,30 @@ export function DesignSystemsTab({ systems, selectedId, onSelect, onPreview }: P
 
   return (
     <div className="tab-panel">
+      <div
+        className="examples-filter-row"
+        role="tablist"
+        aria-label={t('ds.surfaceLabel')}
+      >
+        <span className="examples-filter-label">{t('ds.surfaceLabel')}</span>
+        {SURFACE_PILLS.map((p) => (
+          <button
+            key={p.value}
+            type="button"
+            role="tab"
+            aria-selected={surfaceFilter === p.value}
+            className={`filter-pill ${surfaceFilter === p.value ? 'active' : ''}`}
+            onClick={() => {
+              setSurfaceFilter(p.value);
+              setCategory('All');
+            }}
+          >
+            {p.icon ? <Icon name={p.icon} size={12} /> : null}
+            {t(p.labelKey)}
+            <span className="filter-pill-count">{surfaceCounts[p.value]}</span>
+          </button>
+        ))}
+      </div>
       <div className="tab-panel-toolbar">
         <input
           placeholder={t('ds.searchPlaceholder')}
diff --git a/src/components/EntryView.tsx b/src/components/EntryView.tsx
index 29dd1a8..5f63e11 100644
--- a/src/components/EntryView.tsx
+++ b/src/components/EntryView.tsx
@@ -330,6 +330,30 @@ function metadataForSkill(skill: SkillSummary): ProjectMetadata {
         typeof skill.animations === 'boolean' ? skill.animations : false,
     };
   }
+  // Media surfaces — defaults match the new-project form so the
+  // 'Use this prompt' fast-create produces sensible metadata even
+  // when the SKILL.md doesn't pin a specific model. Skills can pin
+  // a model later via `od.image_model` etc.; for now we fall back to
+  // the surface's first default.
+  if (kind === 'image') {
+    return { kind, imageModel: 'gpt-image-2', imageAspect: '1:1' };
+  }
+  if (kind === 'video') {
+    return {
+      kind,
+      videoModel: 'seedance-2',
+      videoLength: 5,
+      videoAspect: '16:9',
+    };
+  }
+  if (kind === 'audio') {
+    return {
+      kind,
+      audioKind: 'music',
+      audioModel: 'suno-v5',
+      audioDuration: 30,
+    };
+  }
   return { kind: 'other' };
 }
 
@@ -337,5 +361,8 @@ function kindForSkill(skill: SkillSummary): ProjectKind {
   if (skill.mode === 'deck') return 'deck';
   if (skill.mode === 'prototype') return 'prototype';
   if (skill.mode === 'template') return 'template';
+  if (skill.mode === 'image') return 'image';
+  if (skill.mode === 'video') return 'video';
+  if (skill.mode === 'audio') return 'audio';
   return 'other';
 }
diff --git a/src/components/ExamplesTab.tsx b/src/components/ExamplesTab.tsx
index d2f251f..8f61c3f 100644
--- a/src/components/ExamplesTab.tsx
+++ b/src/components/ExamplesTab.tsx
@@ -4,7 +4,8 @@ import type { Dict } from '../i18n/types';
 import { fetchSkillExample } from '../providers/registry';
 import { exportAsHtml, exportAsPdf, exportAsZip } from '../runtime/exports';
 import { buildSrcdoc } from '../runtime/srcdoc';
-import type { SkillSummary } from '../types';
+import type { SkillSummary, Surface } from '../types';
+import { Icon } from './Icon';
 import { PreviewModal } from './PreviewModal';
 
 type TranslateFn = (key: keyof Dict, vars?: Record<string, string | number>) => string;
@@ -14,16 +15,73 @@ interface Props {
   onUsePrompt: (skill: SkillSummary) => void;
 }
 
-type ModeFilter = 'all' | 'prototype-desktop' | 'prototype-mobile' | 'deck' | 'document';
+type SurfaceFilter = 'all' | Surface; // 'all' matches every skill (see matchesSurface)
+type ModeFilter =
+  | 'all'
+  | 'prototype-desktop'
+  | 'prototype-mobile'
+  | 'deck'
+  | 'document'
+  | 'image'
+  | 'video'
+  | 'audio'; // the three media filters are matched via surfaceOf(skill) in matchesMode
 type ScenarioFilter = string;
 
-const MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
+// Each surface gets its own mode pills. `pillsForSurface` branches on
+// the active surface filter so the mode row reflects what makes sense
+// there (web has the most granularity; image / video / audio collapse
+// to "all" plus a single mode pill so the pill count stays reasonable).
+const SURFACE_PILLS: { value: SurfaceFilter; labelKey: keyof Dict; icon: 'grid' | 'image' | 'video' | 'music' | null }[] = [
+  { value: 'all', labelKey: 'examples.modeAll', icon: null },
+  { value: 'web', labelKey: 'examples.surfaceWeb', icon: 'grid' },
+  { value: 'image', labelKey: 'examples.surfaceImage', icon: 'image' },
+  { value: 'video', labelKey: 'examples.surfaceVideo', icon: 'video' },
+  { value: 'audio', labelKey: 'examples.surfaceAudio', icon: 'music' },
+];
+
+const WEB_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [ // also spread into ALL_MODE_PILLS
   { value: 'all', labelKey: 'examples.modeAll' },
   { value: 'prototype-desktop', labelKey: 'examples.modePrototypeDesktop' },
   { value: 'prototype-mobile', labelKey: 'examples.modePrototypeMobile' },
   { value: 'deck', labelKey: 'examples.modeDeck' },
   { value: 'document', labelKey: 'examples.modeDocument' },
 ];
+const IMAGE_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [ // media surfaces: 'all' plus one mode pill each
+  { value: 'all', labelKey: 'examples.modeAll' },
+  { value: 'image', labelKey: 'examples.modeImage' },
+];
+const VIDEO_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
+  { value: 'all', labelKey: 'examples.modeAll' },
+  { value: 'video', labelKey: 'examples.modeVideo' },
+];
+const AUDIO_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
+  { value: 'all', labelKey: 'examples.modeAll' },
+  { value: 'audio', labelKey: 'examples.modeAudio' },
+];
+
+// Convenience — the union pill list for the "All surfaces" view: the web pills, then one pill per media surface.
+const ALL_MODE_PILLS: { value: ModeFilter; labelKey: keyof Dict }[] = [
+  ...WEB_MODE_PILLS,
+  { value: 'image', labelKey: 'examples.modeImage' },
+  { value: 'video', labelKey: 'examples.modeVideo' },
+  { value: 'audio', labelKey: 'examples.modeAudio' },
+];
+
+// An explicit `surface` wins; otherwise a media mode doubles as the surface, else 'web'.
+function surfaceOf(skill: SkillSummary): Surface {
+  const { surface, mode } = skill;
+  if (surface) return surface;
+  if (mode === 'image' || mode === 'video' || mode === 'audio') return mode;
+  return 'web';
+}
+
+function pillsForSurface(surface: SurfaceFilter): { value: ModeFilter; labelKey: keyof Dict }[] {
+  return surface === 'web' ? WEB_MODE_PILLS
+    : surface === 'image' ? IMAGE_MODE_PILLS
+    : surface === 'video' ? VIDEO_MODE_PILLS
+    : surface === 'audio' ? AUDIO_MODE_PILLS
+    : ALL_MODE_PILLS; // anything else (i.e. 'all') gets the union list
+}
 
 const SCENARIO_LABEL_KEY: Record<string, keyof Dict> = {
   general: 'examples.scenarioGeneral',
@@ -71,13 +129,22 @@ function matchesMode(skill: SkillSummary, filter: ModeFilter): boolean {
   if (filter === 'prototype-mobile')
     return skill.mode === 'prototype' && skill.platform === 'mobile';
   if (filter === 'document') return skill.mode === 'template';
+  if (filter === 'image') return surfaceOf(skill) === 'image';
+  if (filter === 'video') return surfaceOf(skill) === 'video';
+  if (filter === 'audio') return surfaceOf(skill) === 'audio';
   return true;
 }
 
+function matchesSurface(skill: SkillSummary, filter: SurfaceFilter): boolean {
+  // 'all' is the pass-through filter; otherwise compare against the skill's resolved surface.
+  return filter === 'all' || surfaceOf(skill) === filter;
+}
+
 export function ExamplesTab({ skills, onUsePrompt }: Props) {
   const t = useT();
   // Hold preview HTML per skill across re-renders so cards never re-flicker.
   const [previews, setPreviews] = useState<Record<string, string | null>>({});
+  const [surfaceFilter, setSurfaceFilter] = useState<SurfaceFilter>('all');
   const [modeFilter, setModeFilter] = useState<ModeFilter>('all');
   const [scenarioFilter, setScenarioFilter] = useState<ScenarioFilter>('all');
   const [previewSkillId, setPreviewSkillId] = useState<string | null>(null);
@@ -106,32 +173,46 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
     [skills, previewSkillId],
   );
 
-  const modeCounts = useMemo(() => {
-    const c: Record<ModeFilter, number> = {
+  const surfaceCounts = useMemo(() => {
+    const counts: Record<SurfaceFilter, number> = {
       all: skills.length,
-      'prototype-desktop': 0,
-      'prototype-mobile': 0,
-      deck: 0,
-      document: 0,
+      web: 0,
+      image: 0,
+      video: 0,
+      audio: 0,
     };
     for (const s of skills) {
-      if (matchesMode(s, 'prototype-desktop')) c['prototype-desktop']++;
-      if (matchesMode(s, 'prototype-mobile')) c['prototype-mobile']++;
-      if (matchesMode(s, 'deck')) c.deck++;
-      if (matchesMode(s, 'document')) c.document++;
+      const sf = surfaceOf(s);
+      counts[sf] = (counts[sf] ?? 0) + 1;
+    }
+    return counts;
+  }, [skills]);
+
+  const surfaceScopedSkills = useMemo(
+    () => skills.filter((s) => matchesSurface(s, surfaceFilter)),
+    [skills, surfaceFilter],
+  );
+
+  const modePills = useMemo(() => pillsForSurface(surfaceFilter), [surfaceFilter]);
+
+  const modeCounts = useMemo(() => {
+    const c: Record<string, number> = { all: surfaceScopedSkills.length };
+    for (const p of modePills) {
+      if (p.value === 'all') continue;
+      c[p.value] = surfaceScopedSkills.filter((s) => matchesMode(s, p.value)).length;
     }
     return c;
-  }, [skills]);
+  }, [surfaceScopedSkills, modePills]);
 
   const scenarioCounts = useMemo(() => {
     const counts = new Map<string, number>();
-    for (const s of skills) {
+    for (const s of surfaceScopedSkills) {
       if (!matchesMode(s, modeFilter)) continue;
       const tag = s.scenario || 'general';
       counts.set(tag, (counts.get(tag) ?? 0) + 1);
     }
     return counts;
-  }, [skills, modeFilter]);
+  }, [surfaceScopedSkills, modeFilter]);
 
   const scenarioOptions = useMemo(() => {
     const have = new Set(scenarioCounts.keys());
@@ -142,7 +223,7 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
   }, [scenarioCounts]);
 
   const filtered = useMemo(() => {
-    const matched = skills.filter((s) => {
+    const matched = surfaceScopedSkills.filter((s) => {
       if (!matchesMode(s, modeFilter)) return false;
       if (scenarioFilter === 'all') return true;
       return (s.scenario || 'general') === scenarioFilter;
@@ -159,7 +240,7 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
         return a.idx - b.idx;
       })
       .map(({ s }) => s);
-  }, [skills, modeFilter, scenarioFilter]);
+  }, [surfaceScopedSkills, modeFilter, scenarioFilter]);
 
   if (skills.length === 0) {
     return <div className="tab-empty">{t('examples.emptyNoSkills')}</div>;
@@ -168,13 +249,38 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
   return (
     <div className="tab-panel examples-panel">
       <div className="examples-toolbar">
+        <div
+          className="examples-filter-row"
+          role="tablist"
+          aria-label={t('examples.surfaceLabel')}
+        >
+          <span className="examples-filter-label">{t('examples.surfaceLabel')}</span>
+          {SURFACE_PILLS.map((p) => (
+            <button
+              key={p.value}
+              type="button"
+              role="tab"
+              aria-selected={surfaceFilter === p.value}
+              className={`filter-pill ${surfaceFilter === p.value ? 'active' : ''}`}
+              onClick={() => {
+                setSurfaceFilter(p.value);
+                setModeFilter('all');
+                setScenarioFilter('all');
+              }}
+            >
+              {p.icon ? <Icon name={p.icon} size={12} /> : null}
+              {t(p.labelKey)}
+              <span className="filter-pill-count">{surfaceCounts[p.value]}</span>
+            </button>
+          ))}
+        </div>
         <div
           className="examples-filter-row"
           role="tablist"
           aria-label={t('examples.typeLabel')}
         >
           <span className="examples-filter-label">{t('examples.typeLabel')}</span>
-          {MODE_PILLS.map((p) => (
+          {modePills.map((p) => (
             <button
               key={p.value}
               type="button"
@@ -187,7 +293,9 @@ export function ExamplesTab({ skills, onUsePrompt }: Props) {
               }}
             >
               {t(p.labelKey)}
-              <span className="filter-pill-count">{modeCounts[p.value]}</span>
+              <span className="filter-pill-count">
+                {p.value === 'all' ? surfaceScopedSkills.length : (modeCounts[p.value] ?? 0)}
+              </span>
             </button>
           ))}
         </div>
@@ -445,6 +553,9 @@ function ExampleCard({
 }
 
 function tagForSkill(skill: SkillSummary, t: TranslateFn): string {
+  if (skill.mode === 'image') return t('examples.tagImage');
+  if (skill.mode === 'video') return t('examples.tagVideo');
+  if (skill.mode === 'audio') return t('examples.tagAudio');
   if (skill.mode === 'deck') return t('examples.tagSlideDeck');
   if (skill.mode === 'template') return t('examples.tagTemplate');
   if (skill.mode === 'design-system') return t('examples.tagDesignSystem');
diff --git a/src/components/FileViewer.tsx b/src/components/FileViewer.tsx
index d3b2aae..5c1fd53 100644
--- a/src/components/FileViewer.tsx
+++ b/src/components/FileViewer.tsx
@@ -42,6 +42,12 @@ export function FileViewer({
   if (file.kind === 'sketch') {
     return <ImageViewer projectId={projectId} file={file} />;
   }
+  if (file.kind === 'video') {
+    return <VideoViewer projectId={projectId} file={file} />;
+  }
+  if (file.kind === 'audio') {
+    return <AudioViewer projectId={projectId} file={file} />;
+  }
   if (file.kind === 'text' || file.kind === 'code') {
     return <TextViewer projectId={projectId} file={file} />;
   }
@@ -679,6 +685,95 @@ function ImageViewer({
   );
 }
 
+// Viewer pane for `video` project files: a toolbar with the file size plus
+// download / open-in-new-tab links, above an inline <video> player.
+function VideoViewer({ projectId, file }: { projectId: string; file: ProjectFile }) {
+  const t = useT();
+  const fileUrl = projectFileUrl(projectId, file.name);
+  // Only the player gets the mtime query string; it busts the browser cache
+  // when the agent regenerates the file in place. Toolbar links use the bare URL.
+  const playerSrc = `${fileUrl}?v=${Math.round(file.mtime)}`;
+  const sizeLabel = t('fileViewer.videoMeta', { size: humanSize(file.size) });
+  const toolbarLinks = (
+    <div className="viewer-toolbar-actions">
+      <a
+        className="ghost-link"
+        href={fileUrl}
+        download={file.name}
+      >
+        {t('fileViewer.download')}
+      </a>
+      <a
+        className="ghost-link"
+        href={fileUrl}
+        target="_blank"
+        rel="noreferrer noopener"
+      >
+        {t('fileViewer.open')}
+      </a>
+    </div>
+  );
+  return (
+    <div className="viewer video-viewer">
+      <div className="viewer-toolbar">
+        <div className="viewer-toolbar-left">
+          <span className="viewer-meta">{sizeLabel}</span>
+        </div>
+        {toolbarLinks}
+      </div>
+      <div className="viewer-body video-body">
+        <video src={playerSrc} controls preload="metadata" />
+      </div>
+    </div>
+  );
+}
+
+// Viewer pane for `audio` project files: a toolbar with the file size plus download /
+// open-in-new-tab links, above a card with a music icon, the file name and an <audio> player.
+function AudioViewer({ projectId, file }: { projectId: string; file: ProjectFile }) {
+  const t = useT();
+  const fileUrl = projectFileUrl(projectId, file.name);
+  // Player URL carries the rounded mtime as `?v=`; toolbar links use the bare file URL.
+  const playerSrc = `${fileUrl}?v=${Math.round(file.mtime)}`;
+  const sizeLabel = t('fileViewer.audioMeta', { size: humanSize(file.size) });
+  const toolbarLinks = (
+    <div className="viewer-toolbar-actions">
+      <a
+        className="ghost-link"
+        href={fileUrl}
+        download={file.name}
+      >
+        {t('fileViewer.download')}
+      </a>
+      <a
+        className="ghost-link"
+        href={fileUrl}
+        target="_blank"
+        rel="noreferrer noopener"
+      >
+        {t('fileViewer.open')}
+      </a>
+    </div>
+  );
+  return (
+    <div className="viewer audio-viewer">
+      <div className="viewer-toolbar">
+        <div className="viewer-toolbar-left">
+          <span className="viewer-meta">{sizeLabel}</span>
+        </div>
+        {toolbarLinks}
+      </div>
+      <div className="viewer-body audio-body">
+        <div className="audio-card">
+          <Icon name="music" size={28} />
+          <div className="audio-card-name">{file.name}</div>
+          <audio src={playerSrc} controls preload="metadata" />
+        </div>
+      </div>
+    </div>
+  );
+}
+
 function TextViewer({
   projectId,
   file,
diff --git a/src/components/FileWorkspace.tsx b/src/components/FileWorkspace.tsx
index c761698..03ef12a 100644
--- a/src/components/FileWorkspace.tsx
+++ b/src/components/FileWorkspace.tsx
@@ -397,7 +397,7 @@ function Tab({
   onActivate: () => void;
   onClose?: () => void;
   closable?: boolean;
-  kind?: 'html' | 'image' | 'sketch' | 'text' | 'code' | 'binary';
+  kind?: 'html' | 'image' | 'video' | 'audio' | 'sketch' | 'text' | 'code' | 'binary';
 }) {
   const t = useT();
   const iconName = kindIconName(kind);
@@ -439,9 +439,13 @@ function kindIconName(
   | 'image'
   | 'pencil'
   | 'file'
+  | 'video'
+  | 'music'
   | null {
   if (kind === 'html') return 'file-code';
   if (kind === 'image') return 'image';
+  if (kind === 'video') return 'video';
+  if (kind === 'audio') return 'music';
   if (kind === 'sketch') return 'pencil';
   if (kind === 'code') return 'file-code';
   if (kind === 'text') return 'file';
diff --git a/src/components/Icon.tsx b/src/components/Icon.tsx
index f90ede5..96596bb 100644
--- a/src/components/Icon.tsx
+++ b/src/components/Icon.tsx
@@ -24,6 +24,8 @@ type IconName =
   | 'link'
   | 'mic'
   | 'minus'
+  | 'music'
+  | 'video'
   | 'pencil'
   | 'plus'
   | 'play'
@@ -232,6 +234,21 @@ export function Icon({ name, size = 14, strokeWidth = 1.6, ...rest }: Props) {
           <path d="M5 12h14" />
         </svg>
       );
+    case 'music':
+      return (
+        <svg {...common}>
+          <path d="M9 18V5l12-2v13" />
+          <circle cx="6" cy="18" r="3" />
+          <circle cx="18" cy="16" r="3" />
+        </svg>
+      );
+    case 'video':
+      return (
+        <svg {...common}>
+          <rect x="2" y="6" width="14" height="12" rx="2" />
+          <path d="m16 10 6-3v10l-6-3z" />
+        </svg>
+      );
     case 'pencil':
       return (
         <svg {...common}>
diff --git a/src/components/NewProjectPanel.tsx b/src/components/NewProjectPanel.tsx
index cde451a..806e972 100644
--- a/src/components/NewProjectPanel.tsx
+++ b/src/components/NewProjectPanel.tsx
@@ -1,18 +1,31 @@
 import { useEffect, useMemo, useRef, useState } from 'react';
 import { useT } from '../i18n';
 import type { Dict } from '../i18n/types';
+import {
+  AUDIO_MODELS_BY_KIND,
+  DEFAULT_AUDIO_MODEL,
+  DEFAULT_IMAGE_MODEL,
+  DEFAULT_VIDEO_MODEL,
+  IMAGE_MODELS,
+  VIDEO_MODELS,
+} from '../media/models';
 import type {
+  AudioKind,
   DesignSystemSummary,
+  MediaAspect,
   ProjectKind,
   ProjectMetadata,
   ProjectTemplate,
   SkillSummary,
+  Surface,
 } from '../types';
 import { Icon } from './Icon';
 import { Skeleton } from './Loading';
 
 type TranslateFn = (key: keyof Dict, vars?: Record<string, string | number>) => string;
 
+// Tabs that live INSIDE the Web surface. Image / Video / Audio surfaces
+// don't expose a tab row — they each have a single, dedicated form.
 export type CreateTab = 'prototype' | 'deck' | 'template' | 'other';
 
 export interface CreateInput {
@@ -38,6 +51,33 @@ const TAB_LABEL_KEYS: Record<CreateTab, keyof Dict> = {
   other: 'newproj.tabOther',
 };
 
+// Per-surface model lists are maintained in src/media/models.ts (and
+// daemon/media-models.js for the dispatcher). Both the picker below and
+// the agent's `od media generate --model …` invocation read the same
+// registry so the metadata captured here is what the daemon dispatches.
+
+// Surface vocab shared by the surface picker and the create-flow.
+const SURFACES: Surface[] = ['web', 'image', 'video', 'audio']; // order in which SurfacePicker renders its buttons
+
+const SURFACE_LABEL_KEY: Record<Surface, keyof Dict> = { // i18n key for each surface's display name (picker label, auto-generated project name)
+  web: 'newproj.surfaceWeb',
+  image: 'newproj.surfaceImage',
+  video: 'newproj.surfaceVideo',
+  audio: 'newproj.surfaceAudio',
+};
+const SURFACE_HINT_KEY: Record<Surface, keyof Dict> = { // i18n key for the hint text shown on each surface button
+  web: 'newproj.surfaceWebHint',
+  image: 'newproj.surfaceImageHint',
+  video: 'newproj.surfaceVideoHint',
+  audio: 'newproj.surfaceAudioHint',
+};
+const SURFACE_ICON: Record<Surface, 'grid' | 'image' | 'video' | 'music'> = { // Icon name drawn on each surface button
+  web: 'grid',
+  image: 'image',
+  video: 'video',
+  audio: 'music',
+};
+
 export function NewProjectPanel({
   skills,
   designSystems,
@@ -47,6 +87,10 @@ export function NewProjectPanel({
   loading = false,
 }: Props) {
   const t = useT();
+  // Top-level surface — controls which sub-form renders below. We keep
+  // it separate from the Web tab state so users can flip between
+  // surfaces without losing their per-surface choices.
+  const [surface, setSurface] = useState<Surface>('web');
   const [tab, setTab] = useState<CreateTab>('prototype');
   const [name, setName] = useState('');
   // Design-system selection is now an *array* internally so the same
@@ -64,12 +108,32 @@ export function NewProjectPanel({
   const [animations, setAnimations] = useState(false);
   const [templateId, setTemplateId] = useState<string | null>(null);
 
+  // Image / Video / Audio metadata. Kept independently so flipping
+  // surfaces preserves each surface's last pick instead of resetting.
+  const [imageModel, setImageModel] = useState<string>(DEFAULT_IMAGE_MODEL);
+  const [imageAspect, setImageAspect] = useState<MediaAspect>('1:1');
+  const [imageStyle, setImageStyle] = useState('');
+  const [videoModel, setVideoModel] = useState<string>(DEFAULT_VIDEO_MODEL);
+  const [videoLength, setVideoLength] = useState<number>(5);
+  const [videoAspect, setVideoAspect] = useState<MediaAspect>('16:9');
+  const [audioKind, setAudioKind] = useState<AudioKind>('music');
+  const [audioModel, setAudioModel] = useState<string>(DEFAULT_AUDIO_MODEL.music);
+  const [audioDuration, setAudioDuration] = useState<number>(30);
+  const [voice, setVoice] = useState('');
+
+  // When the audio kind flips, reset the model to that kind's default.
+  // This keeps users from accidentally creating a "music" project that
+  // has `audioModel: minimax-tts` because they last visited speech.
+  useEffect(() => {
+    setAudioModel(DEFAULT_AUDIO_MODEL[audioKind]);
+  }, [audioKind]);
+
   // When entering the template tab, snap to the first user-saved template
   // if there is one (and we don't already have a valid pick). The template
   // tab no longer offers a built-in fallback — the entire point is to
   // start from a template *the user* created via Share.
   useEffect(() => {
-    if (tab !== 'template') return;
+    if (surface !== 'web' || tab !== 'template') return;
     if (templates.length === 0) {
       setTemplateId(null);
       return;
@@ -77,12 +141,24 @@ export function NewProjectPanel({
     if (templateId == null || !templates.some((t) => t.id === templateId)) {
       setTemplateId(templates[0]!.id);
     }
-  }, [tab, templates, templateId]);
+  }, [surface, tab, templates, templateId]);
 
   // The skill the request still routes through — kept so prototype/deck
   // pick a default-rendered skill (so the agent gets the right SKILL.md
-  // body) without requiring the user to choose one explicitly.
+  // body) without requiring the user to choose one explicitly. For
+  // image / video / audio surfaces we look up a skill that targets that
+  // surface; if none ships yet the request still flies (skill_id null),
+  // and the agent falls back to its base behavior + project metadata.
   const skillIdForTab = useMemo(() => {
+    if (surface === 'image') {
+      return pickDefaultSkill(skills, 'image');
+    }
+    if (surface === 'video') {
+      return pickDefaultSkill(skills, 'video');
+    }
+    if (surface === 'audio') {
+      return pickDefaultSkill(skills, 'audio');
+    }
     if (tab === 'other') return null;
     if (tab === 'prototype') {
       const list = skills.filter((s) => s.mode === 'prototype');
@@ -97,16 +173,18 @@ export function NewProjectPanel({
         ?? null;
     }
     return null;
-  }, [tab, skills]);
+  }, [surface, tab, skills]);
 
-  const canCreate =
-    !loading && (tab !== 'template' || templateId != null);
+  const canCreate = !loading && (
+    surface !== 'web' || tab !== 'template' || templateId != null
+  );
 
   function handleCreate() {
     if (!canCreate) return;
     const primaryDs = selectedDsIds[0] ?? null;
     const inspirations = selectedDsIds.slice(1);
     const metadata = buildMetadata({
+      surface,
       tab,
       fidelity,
       speakerNotes,
@@ -114,32 +192,58 @@ export function NewProjectPanel({
       templateId,
       templates,
       inspirationIds: inspirations,
+      imageModel,
+      imageAspect,
+      imageStyle,
+      videoModel,
+      videoLength,
+      videoAspect,
+      audioKind,
+      audioModel,
+      audioDuration,
+      voice,
     });
+    const fallbackName = surface === 'web'
+      ? autoName(tab, t)
+      : autoNameForSurface(surface, t);
     onCreate({
-      name: name.trim() || autoName(tab, t),
+      name: name.trim() || fallbackName,
       skillId: skillIdForTab,
       designSystemId: primaryDs,
       metadata,
     });
   }
 
+  // Web surface needs a design-system picker; the media surfaces
+  // currently don't bind tokens to a system so we hide it to reduce
+  // noise. (When image/video DS surfaces ship, this will swap to a
+  // surface-filtered picker variant.)
+  const showDesignSystemPicker = surface === 'web';
+
+  // Web surface still uses the four sub-tabs; the media surfaces
+  // skip the row entirely because each has a single dedicated form.
+  const showWebTabs = surface === 'web';
+
   return (
     <div className="newproj">
-      <div className="newproj-tabs" role="tablist">
-        {(Object.keys(TAB_LABEL_KEYS) as CreateTab[]).map((entry) => (
-          <button
-            key={entry}
-            role="tab"
-            aria-selected={tab === entry}
-            className={`newproj-tab ${tab === entry ? 'active' : ''}`}
-            onClick={() => setTab(entry)}
-          >
-            {t(TAB_LABEL_KEYS[entry])}
-          </button>
-        ))}
-      </div>
+      <SurfacePicker value={surface} onChange={setSurface} />
+      {showWebTabs ? (
+        <div className="newproj-tabs" role="tablist">
+          {(Object.keys(TAB_LABEL_KEYS) as CreateTab[]).map((entry) => (
+            <button
+              key={entry}
+              role="tab"
+              aria-selected={tab === entry}
+              className={`newproj-tab ${tab === entry ? 'active' : ''}`}
+              onClick={() => setTab(entry)}
+            >
+              {t(TAB_LABEL_KEYS[entry])}
+            </button>
+          ))}
+        </div>
+      ) : null}
       <div className="newproj-body">
-        <h3 className="newproj-title">{titleForTab(tab, t)}</h3>
+        <h3 className="newproj-title">{titleForView(surface, tab, t)}</h3>
 
         <input
           className="newproj-name"
@@ -148,21 +252,23 @@ export function NewProjectPanel({
           onChange={(e) => setName(e.target.value)}
         />
 
-        <DesignSystemPicker
-          designSystems={designSystems}
-          defaultDesignSystemId={defaultDesignSystemId}
-          selectedIds={selectedDsIds}
-          multi={dsMulti}
-          onChangeMulti={setDsMulti}
-          onChange={setSelectedDsIds}
-          loading={loading}
-        />
+        {showDesignSystemPicker ? (
+          <DesignSystemPicker
+            designSystems={designSystems}
+            defaultDesignSystemId={defaultDesignSystemId}
+            selectedIds={selectedDsIds}
+            multi={dsMulti}
+            onChangeMulti={setDsMulti}
+            onChange={setSelectedDsIds}
+            loading={loading}
+          />
+        ) : null}
 
-        {tab === 'prototype' ? (
+        {surface === 'web' && tab === 'prototype' ? (
           <FidelityPicker value={fidelity} onChange={setFidelity} />
         ) : null}
 
-        {tab === 'deck' ? (
+        {surface === 'web' && tab === 'deck' ? (
           <ToggleRow
             label={t('newproj.toggleSpeakerNotes')}
             hint={t('newproj.toggleSpeakerNotesHint')}
@@ -171,7 +277,7 @@ export function NewProjectPanel({
           />
         ) : null}
 
-        {tab === 'template' ? (
+        {surface === 'web' && tab === 'template' ? (
           <>
             <TemplatePicker
               templates={templates}
@@ -187,19 +293,54 @@ export function NewProjectPanel({
           </>
         ) : null}
 
+        {surface === 'image' ? (
+          <ImageForm
+            model={imageModel}
+            onChangeModel={setImageModel}
+            aspect={imageAspect}
+            onChangeAspect={setImageAspect}
+            style={imageStyle}
+            onChangeStyle={setImageStyle}
+          />
+        ) : null}
+
+        {surface === 'video' ? (
+          <VideoForm
+            model={videoModel}
+            onChangeModel={setVideoModel}
+            length={videoLength}
+            onChangeLength={setVideoLength}
+            aspect={videoAspect}
+            onChangeAspect={setVideoAspect}
+          />
+        ) : null}
+
+        {surface === 'audio' ? (
+          <AudioForm
+            kind={audioKind}
+            onChangeKind={setAudioKind}
+            model={audioModel}
+            onChangeModel={setAudioModel}
+            duration={audioDuration}
+            onChangeDuration={setAudioDuration}
+            voice={voice}
+            onChangeVoice={setVoice}
+          />
+        ) : null}
+
         <button
           className="primary newproj-create"
           onClick={handleCreate}
           disabled={!canCreate}
           title={
-            tab === 'template' && templateId == null
+            surface === 'web' && tab === 'template' && templateId == null
               ? t('newproj.createDisabledTitle')
               : undefined
           }
         >
           <Icon name="plus" size={13} />
           <span>
-            {tab === 'template'
+            {surface === 'web' && tab === 'template'
               ? t('newproj.createFromTemplate')
               : t('newproj.create')}
           </span>
@@ -210,6 +351,290 @@ export function NewProjectPanel({
   );
 }
 
+// Id of the skill a media surface routes through by default, or null when no
+// skill targets it. The first skill whose `surface` AND `mode` both equal the
+// requested surface wins; failing that, the first skill whose `mode` alone
+// matches is used, so legacy skills authored without `surface` still get
+// picked up.
+function pickDefaultSkill(skills: SkillSummary[], surface: Surface): string | null {
+  let modeOnlyId: string | null = null;
+  for (const skill of skills) {
+    if (skill.mode !== surface) continue;
+    if (skill.surface === surface) return skill.id;
+    // Remember only the first mode-only hit; later ones never override it.
+    if (modeOnlyId === null) modeOnlyId = skill.id;
+  }
+  return modeOnlyId;
+}
+
+// Top-level surface switcher: one tab-style button per entry in SURFACES, showing that
+// surface's icon, label and hint. Clicking a button calls `onChange` with its surface.
+function SurfacePicker({ value, onChange }: { value: Surface; onChange: (s: Surface) => void }) {
+  const t = useT();
+  const tabs = SURFACES.map((surface) => {
+    const selected = value === surface;
+    return (
+      <button
+        key={surface}
+        type="button"
+        role="tab"
+        aria-selected={selected}
+        className={selected ? 'newproj-surface active' : 'newproj-surface'}
+        onClick={() => onChange(surface)}
+      >
+        <Icon name={SURFACE_ICON[surface]} size={15} />
+        <span className="newproj-surface-label">{t(SURFACE_LABEL_KEY[surface])}</span>
+        <span className="newproj-surface-hint">{t(SURFACE_HINT_KEY[surface])}</span>
+      </button>
+    );
+  });
+  return (
+    <div className="newproj-surfaces" role="tablist" aria-label={t('newproj.surfaceLabel')}>
+      {tabs}
+    </div>
+  );
+}
+
+// Form body for the Image surface: model picker, aspect-ratio picker and a
+// free-text style field. Fully controlled: every value and change handler
+// is supplied through props.
+function ImageForm({
+  model,
+  onChangeModel,
+  aspect,
+  onChangeAspect,
+  style,
+  onChangeStyle,
+}: {
+  model: string;
+  onChangeModel: (id: string) => void;
+  aspect: MediaAspect;
+  onChangeAspect: (a: MediaAspect) => void;
+  style: string;
+  onChangeStyle: (s: string) => void;
+}) {
+  const t = useT();
+  // Aspect ratios offered for images, in display order.
+  const aspectOptions: MediaAspect[] = ['1:1', '16:9', '9:16', '4:3', '3:4'];
+  const styleField = (
+    <div className="newproj-section">
+      <label className="newproj-label">{t('newproj.imageStyleLabel')}</label>
+      <textarea
+        className="newproj-textarea"
+        rows={3}
+        placeholder={t('newproj.imageStylePlaceholder')}
+        value={style}
+        onChange={(event) => onChangeStyle(event.target.value)}
+      />
+    </div>
+  );
+  return (
+    <>
+      <ModelPicker value={model} onChange={onChangeModel} options={IMAGE_MODELS} />
+      <AspectPicker value={aspect} onChange={onChangeAspect} options={aspectOptions} />
+      {styleField}
+    </>
+  );
+}
+
+// Form body for the Video surface: model picker, clip-length pills (3 / 5 / 10
+// seconds) and an aspect-ratio picker. Fully controlled: every value and
+// change handler is supplied through props.
+function VideoForm({
+  model,
+  onChangeModel,
+  length,
+  onChangeLength,
+  aspect,
+  onChangeAspect,
+}: {
+  model: string;
+  onChangeModel: (id: string) => void;
+  length: number;
+  onChangeLength: (n: number) => void;
+  aspect: MediaAspect;
+  onChangeAspect: (a: MediaAspect) => void;
+}) {
+  const t = useT();
+  const lengthPills = [3, 5, 10].map((seconds) => {
+    const selected = length === seconds;
+    return (
+      <button
+        key={seconds}
+        type="button"
+        className={selected ? 'pill-grid-btn active' : 'pill-grid-btn'}
+        onClick={() => onChangeLength(seconds)}
+        aria-pressed={selected}
+      >
+        {t('newproj.videoLengthSeconds', { n: seconds })}
+      </button>
+    );
+  });
+  return (
+    <>
+      <ModelPicker value={model} onChange={onChangeModel} options={VIDEO_MODELS} />
+      <div className="newproj-section">
+        <label className="newproj-label">{t('newproj.videoLengthLabel')}</label>
+        <div className="pill-grid">{lengthPills}</div>
+      </div>
+      <AspectPicker value={aspect} onChange={onChangeAspect} options={['16:9', '9:16', '1:1']} />
+    </>
+  );
+}
+
+// Form body for the Audio surface: kind pills, a model picker scoped to the
+// kind, duration pills and (speech only) a voice field. Duration buckets differ
+// per kind, so switching kind snaps a duration the new kind does not offer to
+// its nearest bucket instead of leaving a stale, un-highlighted value selected.
+function AudioForm({
+  kind, onChangeKind,
+  model, onChangeModel,
+  duration, onChangeDuration,
+  voice, onChangeVoice,
+}: {
+  kind: AudioKind; onChangeKind: (k: AudioKind) => void;
+  model: string; onChangeModel: (id: string) => void;
+  duration: number; onChangeDuration: (n: number) => void;
+  voice: string; onChangeVoice: (v: string) => void;
+}) {
+  const t = useT();
+  const kinds: { id: AudioKind; labelKey: keyof Dict }[] = [
+    { id: 'music', labelKey: 'newproj.audioKindMusic' },
+    { id: 'speech', labelKey: 'newproj.audioKindSpeech' },
+    { id: 'sfx', labelKey: 'newproj.audioKindSfx' },
+  ];
+  // Music tracks usually run 30s-2min; speech / sfx work in shorter chunks.
+  // Three fixed buckets per kind spare users a free-form number input.
+  const durationsFor = (k: AudioKind) => (k === 'music' ? [30, 60, 120] : [10, 30, 60]);
+  const durations = durationsFor(kind);
+  const selectKind = (next: AudioKind) => {
+    onChangeKind(next);
+    const offered = durationsFor(next);
+    if (offered.includes(duration)) return;
+    onChangeDuration(
+      offered.reduce((best, s) => (Math.abs(s - duration) < Math.abs(best - duration) ? s : best)),
+    );
+  };
+  return (
+    <>
+      <div className="newproj-section">
+        <label className="newproj-label">{t('newproj.audioKindLabel')}</label>
+        <div className="pill-grid">
+          {kinds.map((k) => (
+            <button
+              key={k.id}
+              type="button"
+              className={`pill-grid-btn${kind === k.id ? ' active' : ''}`}
+              onClick={() => selectKind(k.id)}
+              aria-pressed={kind === k.id}
+            >
+              {t(k.labelKey)}
+            </button>
+          ))}
+        </div>
+      </div>
+      <ModelPicker value={model} onChange={onChangeModel} options={AUDIO_MODELS_BY_KIND[kind]} />
+      <div className="newproj-section">
+        <label className="newproj-label">{t('newproj.audioDurationLabel')}</label>
+        <div className="pill-grid">
+          {durations.map((s) => (
+            <button
+              key={s}
+              type="button"
+              className={`pill-grid-btn${duration === s ? ' active' : ''}`}
+              onClick={() => onChangeDuration(s)}
+              aria-pressed={duration === s}
+            >
+              {t('newproj.audioDurationSeconds', { n: s })}
+            </button>
+          ))}
+        </div>
+      </div>
+      {kind === 'speech' ? (
+        <div className="newproj-section">
+          <label className="newproj-label">{t('newproj.voiceLabel')}</label>
+          <textarea
+            className="newproj-textarea"
+            rows={2}
+            placeholder={t('newproj.voicePlaceholder')}
+            value={voice}
+            onChange={(e) => onChangeVoice(e.target.value)}
+          />
+        </div>
+      ) : null}
+    </>
+  );
+}
+
+// Grid of selectable model cards under the "Model" label. `value` is the id
+// of the selected option; clicking a card calls `onChange` with that card's
+// id. Each card shows the option's `label` and `hint`, and the card matching
+// `value` gets the `active` class and aria-pressed.
+function ModelPicker({ value, onChange, options }: {
+  value: string;
+  onChange: (id: string) => void;
+  options: { id: string; label: string; hint: string }[];
+}) {
+  const t = useT();
+  // One card per option, in the order given.
+  const cards = options.map(({ id, label, hint }) => (
+    <button
+      key={id}
+      type="button"
+      className={value === id ? 'model-card active' : 'model-card'}
+      onClick={() => onChange(id)}
+      aria-pressed={value === id}
+    >
+      <span className="model-card-name">{label}</span>
+      <span className="model-card-hint">{hint}</span>
+    </button>
+  ));
+  return (
+    <div className="newproj-section">
+      <label className="newproj-label">{t('newproj.modelLabel')}</label>
+      <div className="model-grid">{cards}</div>
+    </div>
+  );
+}
+
+// Grid of aspect-ratio cards; `options` decides which ratios appear and in what order.
+// Each card pairs a thumbnail (class derived from the ratio, ':' -> 'x') with a translated label.
+function AspectPicker({ value, onChange, options }: {
+  value: MediaAspect;
+  onChange: (a: MediaAspect) => void;
+  options: MediaAspect[];
+}) {
+  const t = useT();
+  const labelKeyFor: Record<MediaAspect, keyof Dict> = {
+    '1:1': 'newproj.aspectSquare',
+    '16:9': 'newproj.aspectLandscape',
+    '9:16': 'newproj.aspectPortrait',
+    '4:3': 'newproj.aspect43',
+    '3:4': 'newproj.aspect34',
+  };
+  const cards = options.map((ratio) => {
+    const selected = value === ratio;
+    return (
+      <button
+        key={ratio}
+        type="button"
+        className={selected ? 'aspect-card active' : 'aspect-card'}
+        onClick={() => onChange(ratio)}
+        aria-pressed={selected}
+      >
+        <span className={`aspect-thumb aspect-thumb-${ratio.replace(':', 'x')}`} aria-hidden />
+        <span className="aspect-label">{t(labelKeyFor[ratio])}</span>
+      </button>
+    );
+  });
+  return (
+    <div className="newproj-section">
+      <label className="newproj-label">{t('newproj.aspectLabel')}</label>
+      <div className="aspect-grid">{cards}</div>
+    </div>
+  );
+}
+
 function FidelityPicker({
   value,
   onChange,
@@ -764,6 +1189,7 @@ function fallbackSwatches(seed: string): string[] {
 }
 
 function buildMetadata(input: {
+  surface: Surface;
   tab: CreateTab;
   fidelity: 'wireframe' | 'high-fidelity';
   speakerNotes: boolean;
@@ -771,11 +1197,54 @@ function buildMetadata(input: {
   templateId: string | null;
   templates: ProjectTemplate[];
   inspirationIds: string[];
+  imageModel: string;
+  imageAspect: MediaAspect;
+  imageStyle: string;
+  videoModel: string;
+  videoLength: number;
+  videoAspect: MediaAspect;
+  audioKind: AudioKind;
+  audioModel: string;
+  audioDuration: number;
+  voice: string;
 }): ProjectMetadata {
-  const kind: ProjectKind = input.tab;
   const inspirations = input.inspirationIds.length > 0
     ? { inspirationDesignSystemIds: input.inspirationIds }
     : {};
+
+  if (input.surface === 'image') {
+    return {
+      kind: 'image',
+      imageModel: input.imageModel,
+      imageAspect: input.imageAspect,
+      imageStyle: input.imageStyle.trim() || undefined,
+      ...inspirations,
+    };
+  }
+  if (input.surface === 'video') {
+    return {
+      kind: 'video',
+      videoModel: input.videoModel,
+      videoLength: input.videoLength,
+      videoAspect: input.videoAspect,
+      ...inspirations,
+    };
+  }
+  if (input.surface === 'audio') {
+    return {
+      kind: 'audio',
+      audioKind: input.audioKind,
+      audioModel: input.audioModel,
+      audioDuration: input.audioDuration,
+      voice:
+        input.audioKind === 'speech' && input.voice.trim()
+          ? input.voice.trim()
+          : undefined,
+      ...inspirations,
+    };
+  }
+
+  const kind: ProjectKind = input.tab;
   if (input.tab === 'prototype') {
     return { kind, fidelity: input.fidelity, ...inspirations };
   }
@@ -800,7 +1269,10 @@ function buildMetadata(input: {
   return { kind: 'other', ...inspirations };
 }
 
-function titleForTab(tab: CreateTab, t: TranslateFn): string {
+function titleForView(surface: Surface, tab: CreateTab, t: TranslateFn): string {
+  if (surface === 'image') return t('newproj.titleImage');
+  if (surface === 'video') return t('newproj.titleVideo');
+  if (surface === 'audio') return t('newproj.titleAudio');
   switch (tab) {
     case 'prototype':
       return t('newproj.titlePrototype');
@@ -817,3 +1289,8 @@ function autoName(tab: CreateTab, t: TranslateFn): string {
   const stamp = new Date().toLocaleDateString();
   return `${t(TAB_LABEL_KEYS[tab])} · ${stamp}`;
 }
+
+function autoNameForSurface(surface: Surface, t: TranslateFn): string {
+  const surfaceLabel = t(SURFACE_LABEL_KEY[surface]); // translated surface name
+  return `${surfaceLabel} · ${new Date().toLocaleDateString()}`;
+}
diff --git a/src/i18n/locales/en.ts b/src/i18n/locales/en.ts
index f37d29d..f74e6a1 100644
--- a/src/i18n/locales/en.ts
+++ b/src/i18n/locales/en.ts
@@ -92,6 +92,16 @@ export const en: Dict = {
   'entry.resizeAria': 'Resize sidebar',
   'entry.loadingWorkspace': 'Loading workspace…',
 
+  'newproj.surfaceLabel': 'Surface',
+  'newproj.surfaceWeb': 'Web',
+  'newproj.surfaceImage': 'Image',
+  'newproj.surfaceVideo': 'Video',
+  'newproj.surfaceAudio': 'Audio',
+  'newproj.surfaceWebHint': 'Prototypes, decks, docs',
+  'newproj.surfaceImageHint': 'Posters, illustrations, art',
+  'newproj.surfaceVideoHint': 'Short-form clips, motion',
+  'newproj.surfaceAudioHint': 'Music, voice, sfx',
+
   'newproj.tabPrototype': 'Prototype',
   'newproj.tabDeck': 'Slide deck',
   'newproj.tabTemplate': 'From template',
@@ -100,6 +110,32 @@ export const en: Dict = {
   'newproj.titleDeck': 'New slide deck',
   'newproj.titleTemplate': 'Start from a template',
   'newproj.titleOther': 'New project',
+  'newproj.titleImage': 'New image',
+  'newproj.titleVideo': 'New video',
+  'newproj.titleAudio': 'New audio',
+
+  'newproj.modelLabel': 'Model',
+  'newproj.modelHint': 'Pick the upstream provider the agent should call.',
+  'newproj.aspectLabel': 'Aspect ratio',
+  'newproj.aspectSquare': 'Square · 1:1',
+  'newproj.aspectLandscape': 'Landscape · 16:9',
+  'newproj.aspectPortrait': 'Portrait · 9:16',
+  'newproj.aspect43': 'Wide · 4:3',
+  'newproj.aspect34': 'Tall · 3:4',
+  'newproj.imageStyleLabel': 'Style notes (optional)',
+  'newproj.imageStylePlaceholder':
+    'e.g. editorial photography, muted earth tones, soft daylight',
+  'newproj.videoLengthLabel': 'Length',
+  'newproj.videoLengthSeconds': '{n}s',
+  'newproj.audioKindLabel': 'What are we making?',
+  'newproj.audioKindMusic': 'Music',
+  'newproj.audioKindSpeech': 'Voice / TTS',
+  'newproj.audioKindSfx': 'SFX / foley',
+  'newproj.audioDurationLabel': 'Duration',
+  'newproj.audioDurationSeconds': '{n}s',
+  'newproj.voiceLabel': 'Voice (TTS only)',
+  'newproj.voicePlaceholder':
+    'e.g. warm female narrator, British English, calm pacing',
   'newproj.namePlaceholder': 'Project name',
   'newproj.fidelityLabel': 'Fidelity',
   'newproj.fidelityWireframe': 'Wireframe',
@@ -156,6 +192,17 @@ export const en: Dict = {
   'examples.modePrototypeMobile': 'Prototypes · Mobile',
   'examples.modeDeck': 'Slides',
   'examples.modeDocument': 'Docs & templates',
+  'examples.modeImage': 'Images',
+  'examples.modeVideo': 'Videos',
+  'examples.modeAudio': 'Audio',
+  'examples.surfaceLabel': 'Surface',
+  'examples.surfaceWeb': 'Web',
+  'examples.surfaceImage': 'Image',
+  'examples.surfaceVideo': 'Video',
+  'examples.surfaceAudio': 'Audio',
+  'examples.tagImage': 'Image',
+  'examples.tagVideo': 'Video',
+  'examples.tagAudio': 'Audio',
   'examples.scenarioGeneral': 'General',
   'examples.scenarioEngineering': 'Engineering',
   'examples.scenarioProduct': 'Product',
@@ -197,6 +244,11 @@ export const en: Dict = {
   'ds.categoryUncategorized': 'Uncategorized',
   'ds.showcase': 'Showcase',
   'ds.tokens': 'Tokens',
+  'ds.surfaceLabel': 'Surface',
+  'ds.surfaceWeb': 'Web',
+  'ds.surfaceImage': 'Image',
+  'ds.surfaceVideo': 'Video',
+  'ds.surfaceAudio': 'Audio',
 
   'avatar.title': 'Account & settings',
   'avatar.localCli': 'Local CLI',
@@ -351,6 +403,8 @@ export const en: Dict = {
   'fileViewer.open': 'Open',
   'fileViewer.imageMeta': 'Image · {size}',
   'fileViewer.sketchMeta': 'Sketch · {size}',
+  'fileViewer.videoMeta': 'Video · {size}',
+  'fileViewer.audioMeta': 'Audio · {size}',
   'fileViewer.reload': 'Reload',
   'fileViewer.reloadDisk': 'Reload from disk',
   'fileViewer.copy': 'Copy',
diff --git a/src/i18n/locales/zh-CN.ts b/src/i18n/locales/zh-CN.ts
index 3be126f..976452a 100644
--- a/src/i18n/locales/zh-CN.ts
+++ b/src/i18n/locales/zh-CN.ts
@@ -91,6 +91,16 @@ export const zhCN: Dict = {
   'entry.resizeAria': '调整侧边栏宽度',
   'entry.loadingWorkspace': '正在加载工作区…',
 
+  'newproj.surfaceLabel': '类型',
+  'newproj.surfaceWeb': '网页',
+  'newproj.surfaceImage': '图片',
+  'newproj.surfaceVideo': '视频',
+  'newproj.surfaceAudio': '音频',
+  'newproj.surfaceWebHint': '原型 / 幻灯 / 文档',
+  'newproj.surfaceImageHint': '海报 / 插画 / 设计稿',
+  'newproj.surfaceVideoHint': '短视频 / 动效',
+  'newproj.surfaceAudioHint': '音乐 / 配音 / 音效',
+
   'newproj.tabPrototype': '原型',
   'newproj.tabDeck': '幻灯片',
   'newproj.tabTemplate': '从模板',
@@ -99,6 +109,30 @@ export const zhCN: Dict = {
   'newproj.titleDeck': '新建幻灯片',
   'newproj.titleTemplate': '从模板开始',
   'newproj.titleOther': '新建项目',
+  'newproj.titleImage': '新建图片',
+  'newproj.titleVideo': '新建视频',
+  'newproj.titleAudio': '新建音频',
+
+  'newproj.modelLabel': '模型',
+  'newproj.modelHint': '选择代理调用的上游模型。',
+  'newproj.aspectLabel': '画幅比例',
+  'newproj.aspectSquare': '方形 · 1:1',
+  'newproj.aspectLandscape': '横版 · 16:9',
+  'newproj.aspectPortrait': '竖版 · 9:16',
+  'newproj.aspect43': '宽屏 · 4:3',
+  'newproj.aspect34': '高屏 · 3:4',
+  'newproj.imageStyleLabel': '风格备注（可选）',
+  'newproj.imageStylePlaceholder': '例如：编辑摄影、低饱和大地色、柔光日光',
+  'newproj.videoLengthLabel': '时长',
+  'newproj.videoLengthSeconds': '{n}秒',
+  'newproj.audioKindLabel': '生成什么？',
+  'newproj.audioKindMusic': '音乐',
+  'newproj.audioKindSpeech': '配音 / TTS',
+  'newproj.audioKindSfx': '音效 / 拟音',
+  'newproj.audioDurationLabel': '时长',
+  'newproj.audioDurationSeconds': '{n}秒',
+  'newproj.voiceLabel': '声线（仅 TTS）',
+  'newproj.voicePlaceholder': '例如：温暖女声旁白，普通话，平稳语速',
   'newproj.namePlaceholder': '项目名称',
   'newproj.fidelityLabel': '精度',
   'newproj.fidelityWireframe': '线框图',
@@ -153,6 +187,17 @@ export const zhCN: Dict = {
   'examples.modePrototypeMobile': '原型 · 移动端',
   'examples.modeDeck': '幻灯片',
   'examples.modeDocument': '文档与模板',
+  'examples.modeImage': '图片',
+  'examples.modeVideo': '视频',
+  'examples.modeAudio': '音频',
+  'examples.surfaceLabel': '类型',
+  'examples.surfaceWeb': '网页',
+  'examples.surfaceImage': '图片',
+  'examples.surfaceVideo': '视频',
+  'examples.surfaceAudio': '音频',
+  'examples.tagImage': '图片',
+  'examples.tagVideo': '视频',
+  'examples.tagAudio': '音频',
   'examples.scenarioGeneral': '通用',
   'examples.scenarioEngineering': '工程',
   'examples.scenarioProduct': '产品',
@@ -194,6 +239,11 @@ export const zhCN: Dict = {
   'ds.categoryUncategorized': '未分类',
   'ds.showcase': '展示',
   'ds.tokens': 'Token',
+  'ds.surfaceLabel': '类型',
+  'ds.surfaceWeb': '网页',
+  'ds.surfaceImage': '图片',
+  'ds.surfaceVideo': '视频',
+  'ds.surfaceAudio': '音频',
 
   'avatar.title': '账户与设置',
   'avatar.localCli': '本机 CLI',
@@ -342,6 +392,8 @@ export const zhCN: Dict = {
   'fileViewer.open': '打开',
   'fileViewer.imageMeta': '图片 · {size}',
   'fileViewer.sketchMeta': '草图 · {size}',
+  'fileViewer.videoMeta': '视频 · {size}',
+  'fileViewer.audioMeta': '音频 · {size}',
   'fileViewer.reload': '重新加载',
   'fileViewer.reloadDisk': '从磁盘重新加载',
   'fileViewer.copy': '复制',
diff --git a/src/i18n/types.ts b/src/i18n/types.ts
index 0037dca..fc2a0f9 100644
--- a/src/i18n/types.ts
+++ b/src/i18n/types.ts
@@ -104,6 +104,19 @@ export interface Dict {
   'entry.loadingWorkspace': string;
 
   // New project panel
+  // Top-level surface picker — sits above the existing tabs and switches
+  // the form between Web (prototype/deck/template/other), Image, Video,
+  // and Audio surfaces.
+  'newproj.surfaceLabel': string;
+  'newproj.surfaceWeb': string;
+  'newproj.surfaceImage': string;
+  'newproj.surfaceVideo': string;
+  'newproj.surfaceAudio': string;
+  'newproj.surfaceWebHint': string;
+  'newproj.surfaceImageHint': string;
+  'newproj.surfaceVideoHint': string;
+  'newproj.surfaceAudioHint': string;
+
   'newproj.tabPrototype': string;
   'newproj.tabDeck': string;
   'newproj.tabTemplate': string;
@@ -112,6 +125,31 @@ export interface Dict {
   'newproj.titleDeck': string;
   'newproj.titleTemplate': string;
   'newproj.titleOther': string;
+  'newproj.titleImage': string;
+  'newproj.titleVideo': string;
+  'newproj.titleAudio': string;
+
+  // Media-specific labels for the Image / Video / Audio forms.
+  'newproj.modelLabel': string;
+  'newproj.modelHint': string;
+  'newproj.aspectLabel': string;
+  'newproj.aspectSquare': string;
+  'newproj.aspectLandscape': string;
+  'newproj.aspectPortrait': string;
+  'newproj.aspect43': string;
+  'newproj.aspect34': string;
+  'newproj.imageStyleLabel': string;
+  'newproj.imageStylePlaceholder': string;
+  'newproj.videoLengthLabel': string;
+  'newproj.videoLengthSeconds': string;
+  'newproj.audioKindLabel': string;
+  'newproj.audioKindMusic': string;
+  'newproj.audioKindSpeech': string;
+  'newproj.audioKindSfx': string;
+  'newproj.audioDurationLabel': string;
+  'newproj.audioDurationSeconds': string;
+  'newproj.voiceLabel': string;
+  'newproj.voicePlaceholder': string;
   'newproj.namePlaceholder': string;
   'newproj.fidelityLabel': string;
   'newproj.fidelityWireframe': string;
@@ -167,6 +205,17 @@ export interface Dict {
   'examples.modePrototypeMobile': string;
   'examples.modeDeck': string;
   'examples.modeDocument': string;
+  'examples.modeImage': string;
+  'examples.modeVideo': string;
+  'examples.modeAudio': string;
+  'examples.surfaceLabel': string;
+  'examples.surfaceWeb': string;
+  'examples.surfaceImage': string;
+  'examples.surfaceVideo': string;
+  'examples.surfaceAudio': string;
+  'examples.tagImage': string;
+  'examples.tagVideo': string;
+  'examples.tagAudio': string;
   'examples.scenarioGeneral': string;
   'examples.scenarioEngineering': string;
   'examples.scenarioProduct': string;
@@ -209,6 +258,12 @@ export interface Dict {
   'ds.categoryUncategorized': string;
   'ds.showcase': string;
   'ds.tokens': string;
+  // Surface filter row in the Design systems tab.
+  'ds.surfaceLabel': string;
+  'ds.surfaceWeb': string;
+  'ds.surfaceImage': string;
+  'ds.surfaceVideo': string;
+  'ds.surfaceAudio': string;
 
   // Avatar menu (project topbar)
   'avatar.title': string;
@@ -358,6 +413,8 @@ export interface Dict {
   'fileViewer.open': string;
   'fileViewer.imageMeta': string;
   'fileViewer.sketchMeta': string;
+  'fileViewer.videoMeta': string;
+  'fileViewer.audioMeta': string;
   'fileViewer.reload': string;
   'fileViewer.reloadDisk': string;
   'fileViewer.copy': string;
diff --git a/src/index.css b/src/index.css
index a196b59..43aaa60 100644
--- a/src/index.css
+++ b/src/index.css
@@ -1091,6 +1091,212 @@ code {
   text-align: center;
 }
 
+/* -------- Surface picker (top-level Web/Image/Video/Audio) ----------- */
+.newproj-surfaces {
+  display: grid;
+  grid-template-columns: repeat(2, 1fr);
+  gap: 6px;
+  padding: 10px 10px 8px;
+  border-bottom: 1px solid var(--border);
+}
+.newproj-surface {
+  display: flex;
+  flex-direction: column;
+  align-items: flex-start;
+  gap: 4px;
+  padding: 10px 10px 9px;
+  background: var(--bg-panel);
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  cursor: pointer;
+  text-align: left;
+  color: var(--text);
+  transition: border-color 120ms ease, background 120ms ease, box-shadow 120ms ease;
+  min-width: 0;
+}
+.newproj-surface:hover:not(:disabled) { border-color: var(--border-strong); }
+.newproj-surface.active {
+  border-color: var(--accent);
+  background: var(--accent-tint);
+  box-shadow: 0 0 0 1px var(--accent);
+}
+.newproj-surface > svg { color: var(--text-muted); }
+.newproj-surface.active > svg { color: var(--accent); }
+.newproj-surface-label {
+  font-size: 12.5px;
+  font-weight: 600;
+}
+.newproj-surface-hint {
+  font-size: 10.5px;
+  color: var(--text-muted);
+  line-height: 1.3;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  max-width: 100%;
+}
+
+/* -------- Textarea + model / aspect / pill grids (media forms) ------- */
+.newproj-textarea {
+  width: 100%;
+  resize: vertical;
+  min-height: 60px;
+  padding: 10px 12px;
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  background: var(--bg-panel);
+  font: inherit;
+  font-size: 13px;
+  color: var(--text);
+  line-height: 1.45;
+}
+.newproj-textarea:focus {
+  outline: none;
+  border-color: var(--accent);
+  box-shadow: 0 0 0 1px var(--accent);
+}
+
+.model-grid {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: 6px;
+}
+.model-card {
+  display: flex;
+  flex-direction: column;
+  align-items: flex-start;
+  gap: 2px;
+  padding: 8px 10px;
+  background: var(--bg-panel);
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  cursor: pointer;
+  text-align: left;
+  transition: border-color 120ms ease, background 120ms ease;
+}
+.model-card:hover:not(:disabled) { border-color: var(--border-strong); }
+.model-card.active {
+  border-color: var(--accent);
+  background: var(--accent-tint);
+}
+.model-card-name {
+  font-size: 12.5px;
+  font-weight: 600;
+  color: var(--text);
+  font-family: var(--font-mono, ui-monospace, SFMono-Regular, Menlo, monospace);
+}
+.model-card-hint {
+  font-size: 10.5px;
+  color: var(--text-muted);
+}
+
+.aspect-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(64px, 1fr));
+  gap: 6px;
+}
+.aspect-card {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 6px;
+  padding: 8px 6px 9px;
+  background: var(--bg-panel);
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  cursor: pointer;
+  transition: border-color 120ms ease, background 120ms ease;
+}
+.aspect-card:hover:not(:disabled) { border-color: var(--border-strong); }
+.aspect-card.active {
+  border-color: var(--accent);
+  background: var(--accent-tint);
+}
+.aspect-thumb {
+  background: var(--bg-subtle);
+  border: 1px solid var(--border-soft);
+  border-radius: 3px;
+  display: block;
+}
+.aspect-thumb-1x1 { width: 24px; height: 24px; }
+.aspect-thumb-16x9 { width: 32px; height: 18px; }
+.aspect-thumb-9x16 { width: 18px; height: 32px; }
+.aspect-thumb-4x3 { width: 28px; height: 21px; }
+.aspect-thumb-3x4 { width: 21px; height: 28px; }
+.aspect-label {
+  font-size: 10.5px;
+  color: var(--text-muted);
+  text-align: center;
+  white-space: nowrap;
+}
+.aspect-card.active .aspect-label { color: var(--text); }
+
+.pill-grid {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+}
+.pill-grid-btn {
+  padding: 6px 12px;
+  background: var(--bg-panel);
+  border: 1px solid var(--border);
+  border-radius: 999px;
+  font-size: 12px;
+  color: var(--text);
+  cursor: pointer;
+  transition: border-color 120ms ease, background 120ms ease;
+}
+.pill-grid-btn:hover:not(:disabled) { border-color: var(--border-strong); }
+.pill-grid-btn.active {
+  border-color: var(--accent);
+  background: var(--accent-tint);
+  color: var(--text);
+  font-weight: 500;
+}
+
+/* -------- Video / audio viewers -------------------------------------- */
+.video-body, .audio-body {
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  padding: 24px;
+  background: var(--bg-subtle);
+  min-height: 0;
+  flex: 1;
+}
+.video-body video {
+  max-width: 100%;
+  max-height: 100%;
+  border-radius: var(--radius-sm);
+  background: #000;
+  box-shadow: var(--shadow-md, 0 8px 28px rgba(0, 0, 0, 0.18));
+}
+.audio-card {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 10px;
+  padding: 28px 32px;
+  background: var(--bg-panel);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  min-width: 280px;
+  max-width: 480px;
+  width: 100%;
+  box-shadow: var(--shadow-xs);
+  color: var(--text-muted);
+}
+.audio-card-name {
+  font-size: 13px;
+  font-weight: 500;
+  color: var(--text);
+  word-break: break-all;
+  text-align: center;
+}
+.audio-card audio {
+  width: 100%;
+}
+
 /* -------- Fidelity cards (prototype tab) ---------------------------- */
 .fidelity-grid {
   display: grid;
diff --git a/src/media/models.ts b/src/media/models.ts
new file mode 100644
index 0000000..a291c09
--- /dev/null
+++ b/src/media/models.ts
@@ -0,0 +1,107 @@
+/**
+ * Single source of truth for the media-generation model registry.
+ *
+ * Both the frontend (NewProjectPanel model pickers) and the daemon
+ * (od media generate dispatcher) consume this list. When you add a new
+ * model entry here, the picker shows it AND the daemon can route to it —
+ * the unifying contract is "skills + metadata + prompt → code agent →
+ * od media generate", and this file pins down what `--model` IDs the
+ * agent is allowed to pass.
+ *
+ * The daemon imports the JSON view of this file via fs.readFile so we
+ * don't fork the registry between frontend and Node code paths.
+ */
+
+import type { AudioKind, MediaAspect } from '../types';
+
+export interface MediaModel { // one entry in the media model registry
+  /** Stable ID stored in metadata.imageModel / videoModel / audioModel and passed as `--model`. */
+  id: string;
+  /** Short label shown in pickers — equals id for every entry currently in this file. */
+  label: string;
+  /** Vendor / context hint shown under the label; default entries append "· default". */
+  hint: string;
+  /**
+   * Capability tags (this registry uses t2i/i2i/inpaint, t2v/i2v, music, tts,
+   * voice-clone, sfx); per the original note the dispatcher routes on them.
+   */
+  caps?: string[];
+}
+
+export const IMAGE_MODELS: MediaModel[] = [ // order matters: entry [0] is exported as DEFAULT_IMAGE_MODEL
+  { id: 'gpt-image-2', label: 'gpt-image-2', hint: 'OpenAI · default', caps: ['t2i', 'i2i', 'inpaint'] },
+  { id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'Black Forest Labs', caps: ['t2i', 'i2i'] },
+  { id: 'imagen-4', label: 'imagen-4', hint: 'Google', caps: ['t2i'] },
+  { id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney', caps: ['t2i'] },
+];
+
+export const VIDEO_MODELS: MediaModel[] = [ // order matters: entry [0] is exported as DEFAULT_VIDEO_MODEL
+  { id: 'seedance-2', label: 'seedance-2', hint: 'ByteDance · default', caps: ['t2v', 'i2v'] },
+  { id: 'kling-3', label: 'kling-3', hint: 'Kuaishou', caps: ['t2v', 'i2v'] },
+  { id: 'kling-4', label: 'kling-4', hint: 'Kuaishou · latest', caps: ['t2v', 'i2v'] },
+  { id: 'veo-3', label: 'veo-3', hint: 'Google', caps: ['t2v'] },
+  { id: 'sora-2', label: 'sora-2', hint: 'OpenAI', caps: ['t2v'] },
+];
+
+export const AUDIO_MODELS_BY_KIND: Record<AudioKind, MediaModel[]> = { // per kind, entry [0] is that kind's default
+  music: [
+    { id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', caps: ['music'] },
+    { id: 'udio-v2', label: 'udio-v2', hint: 'Udio', caps: ['music'] },
+    { id: 'lyria-2', label: 'lyria-2', hint: 'Google', caps: ['music'] },
+  ],
+  speech: [
+    { id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax · default', caps: ['tts'] },
+    { id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', caps: ['tts', 'voice-clone'] },
+    { id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', caps: ['tts', 'voice-clone'] },
+  ],
+  sfx: [ // first entry carries the "· default" marker like every other default in this file
+    { id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX · default', caps: ['sfx'] },
+    { id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', caps: ['sfx', 'music'] },
+  ],
+};
+
+export const MEDIA_ASPECTS: MediaAspect[] = ['1:1', '16:9', '9:16', '4:3', '3:4']; // same set as the hand-written --aspect list in prompts/media-contract.ts
+
+export const VIDEO_LENGTHS_SEC: number[] = [3, 5, 8, 10, 15, 30]; // video clip lengths, in seconds
+export const AUDIO_DURATIONS_SEC: number[] = [5, 10, 15, 30, 60, 120]; // audio durations, in seconds
+
+export const DEFAULT_IMAGE_MODEL = IMAGE_MODELS[0]!.id; // first registry entry; `!` asserts the list is non-empty
+export const DEFAULT_VIDEO_MODEL = VIDEO_MODELS[0]!.id; // likewise, first video entry
+export const DEFAULT_AUDIO_MODEL: Record<AudioKind, string> = { // first entry of each audio-kind list
+  music: AUDIO_MODELS_BY_KIND.music[0]!.id,
+  speech: AUDIO_MODELS_BY_KIND.speech[0]!.id,
+  sfx: AUDIO_MODELS_BY_KIND.sfx[0]!.id,
+};
+
+/**
+ * Resolve a model ID against every registry list: image, video, then the
+ * music / speech / sfx audio lists. Yields null for an unregistered ID so
+ * the caller can reject it explicitly instead of silently falling back.
+ */
+export function findMediaModel(id: string): MediaModel | null {
+  const { music, speech, sfx } = AUDIO_MODELS_BY_KIND;
+  const registries = [IMAGE_MODELS, VIDEO_MODELS, music, speech, sfx];
+  // Same search order as one concatenated list: the first match wins.
+  for (const registry of registries) {
+    const match = registry.find((model) => model.id === id);
+    if (match) return match;
+  }
+  return null;
+}
+
+/** Registered model IDs per surface; audio is further split by audio kind. */
+export function modelIdsBySurface(): {
+  image: string[];
+  video: string[];
+  audio: { music: string[]; speech: string[]; sfx: string[] };
+} {
+  // Reduce a registry list to its bare ID strings.
+  const idsOf = (models: MediaModel[]): string[] => models.map((m) => m.id);
+  const { music, speech, sfx } = AUDIO_MODELS_BY_KIND;
+
+  return {
+    image: idsOf(IMAGE_MODELS),
+    video: idsOf(VIDEO_MODELS),
+    audio: { music: idsOf(music), speech: idsOf(speech), sfx: idsOf(sfx) },
+  };
+}
diff --git a/src/prompts/media-contract.ts b/src/prompts/media-contract.ts
new file mode 100644
index 0000000..c36eac8
--- /dev/null
+++ b/src/prompts/media-contract.ts
@@ -0,0 +1,135 @@
+/**
+ * Media generation contract. Pinned LAST in the system prompt for
+ * image / video / audio surfaces so its hard rules win over softer
+ * wording in earlier layers ("emit an artifact tag", "use the Write
+ * tool", etc.).
+ *
+ * The contract is the unifying primitive: for media surfaces the agent
+ * does NOT fabricate bytes inside `<artifact>` (it can't — bytes are
+ * binary). Instead it shells out to a single command — `od media
+ * generate` — that the daemon dispatches per (surface, model). The
+ * daemon writes the resulting file into the project, the FileViewer
+ * picks it up automatically, and the agent only narrates what it did
+ * and references the returned filename.
+ *
+ * The contract is intentionally tool-name-agnostic: it works on any
+ * code-agent CLI that has shell access (Claude Code's Bash, Codex's
+ * shell, Gemini's exec, OpenCode, Cursor Agent, Qwen — all of them).
+ * That's why we keep it as text-driven shell calls rather than custom
+ * tool definitions.
+ */
+import {
+  AUDIO_MODELS_BY_KIND,
+  IMAGE_MODELS,
+  VIDEO_MODELS,
+} from '../media/models';
+
+function fmtList(ids: string[]): string {
+  return ids.map((modelId) => ['`', modelId, '`'].join('')).join(', '); // each ID in backticks, comma-separated
+}
+
+const IMAGE_IDS = fmtList(IMAGE_MODELS.map((m) => m.id)); // rendered once at module load for the "Allowed model IDs" list
+const VIDEO_IDS = fmtList(VIDEO_MODELS.map((m) => m.id));
+const AUDIO_MUSIC_IDS = fmtList(AUDIO_MODELS_BY_KIND.music.map((m) => m.id)); // audio IDs are listed per audio kind
+const AUDIO_SPEECH_IDS = fmtList(AUDIO_MODELS_BY_KIND.speech.map((m) => m.id));
+const AUDIO_SFX_IDS = fmtList(AUDIO_MODELS_BY_KIND.sfx.map((m) => m.id));
+
+export const MEDIA_GENERATION_CONTRACT = `
+---
+
+## Media generation contract (load-bearing — overrides softer wording above)
+
+This project is a **non-web** surface (image / video / audio). The unifying
+contract is: skill workflow + project metadata tell you WHAT to make; one
+shell command — \`od media generate\` — is HOW you actually produce bytes.
+Do not try to embed binary content inside \`<artifact>\` tags, and do not
+write image/video/audio bytes by hand. Always call out to the dispatcher.
+
+### Environment the daemon injected for you
+
+The daemon spawns you with these env vars set (verify with \`echo\`):
+
+- \`OD_BIN\`         — absolute path to the \`od\` CLI script. Run with \`node "$OD_BIN" …\`.
+- \`OD_PROJECT_ID\`  — the active project's id. Pass it as \`--project "$OD_PROJECT_ID"\`.
+- \`OD_PROJECT_DIR\` — the project's files folder (your cwd). Generated files land here.
+- \`OD_DAEMON_URL\`  — base URL of the local daemon, e.g. \`http://127.0.0.1:7456\`.
+
+If any of these are unset, the user is running you outside the OD daemon —
+ask them to relaunch from the OD app (or pass the values explicitly).
+
+### Invocation
+
+Run via your shell tool (Bash on Claude Code, exec on Codex/Gemini, etc.):
+
+\`\`\`bash
+node "$OD_BIN" media generate \\
+  --project "$OD_PROJECT_ID" \\
+  --surface <image|video|audio> \\
+  --model <model-id> \\
+  [--output <filename>] \\
+  --prompt "<full prompt>" \\
+  [--aspect 1:1|16:9|9:16|4:3|3:4] \\
+  [--length <seconds>]              # video only
+  [--duration <seconds>]            # audio only
+  [--audio-kind music|speech|sfx]   # audio only
+  [--voice <voice-id>]              # audio:speech only
+\`\`\`
+
+The command prints a single line of JSON describing the written file:
+
+\`\`\`json
+{ "file": { "name": "poster.png", "size": 12345, "kind": "image", "mime": "image/png", ... } }
+\`\`\`
+
+Save the \`file.name\` and reference it in your reply ("I generated
+\`poster.png\`."). The user's FileViewer renders it automatically.
+
+### Allowed model IDs (per surface)
+
+- **image**:   ${IMAGE_IDS}
+- **video**:   ${VIDEO_IDS}
+- **audio · music**:  ${AUDIO_MUSIC_IDS}
+- **audio · speech**: ${AUDIO_SPEECH_IDS}
+- **audio · sfx**:    ${AUDIO_SFX_IDS}
+
+If the user requests a model that is not in this list, surface a warning
+in your reply and either (a) ask them to pick a registered ID or (b)
+proceed with the project metadata's default model and explain the
+substitution. Do not silently fall back.
+
+### Workflow rules
+
+1. **Read project metadata first.** The "Project metadata" block above
+   tells you the user's pre-selected model, aspect, length, voice, audio
+   kind, etc. Treat those as authoritative defaults — only override if
+   the user's chat message explicitly contradicts them.
+2. **One discovery turn before generating.** Even with metadata defaults
+   present, restate what you're about to make and ask one targeted
+   question if anything is ambiguous (subject, mood, brand, voice). The
+   discovery rules from the philosophy layer still apply — emit a
+   question form on turn 1 unless the user's prompt already pins every
+   variable.
+3. **Generate by shell, narrate in chat.** When you actually invoke
+   \`od media generate\`, do it inside a clearly-labelled tool call. After
+   it returns, write a short reply: what was produced, the filename,
+   and any notes (model substitutions, retries, follow-up suggestions).
+4. **Iterate by re-running.** To revise, call \`od media generate\` again
+   with a new \`--output\` filename (or omit \`--output\` to auto-name).
+   Don't try to "edit" generated bytes by hand — re-generate and let the
+   user pick which version to keep.
+5. **Don't emit \`<artifact>\` blocks for media.** They're for HTML/text
+   artifacts. For media surfaces your "artifact" is the file written by
+   the dispatcher. The artifact lint and PDF-stitching layers don't
+   apply.
+6. **Filenames are slugged.** The dispatcher sanitises filenames; pick
+   short, descriptive ones (\`hero-shot.png\`, \`intro-jingle.mp3\`,
+   \`teaser-15s.mp4\`) so the user's file list stays readable.
+
+### Stub-provider note
+
+The provider integrations behind specific models (gpt-image-2,
+seedance-2, suno-v5, …) may still be stubs in this build — the
+dispatcher will return success and a placeholder file. That's fine: the
+contract you follow is the same; the bytes get sharper as real
+provider integrations land. The user has been told to expect this.
+`; // --output is bracketed in the synopsis because workflow rule 4 lets the agent omit it (auto-naming)
diff --git a/src/prompts/system.ts b/src/prompts/system.ts
index c571f9f..679833e 100644
--- a/src/prompts/system.ts
+++ b/src/prompts/system.ts
@@ -33,13 +33,22 @@ import type { ProjectMetadata, ProjectTemplate } from '../types';
 import { OFFICIAL_DESIGNER_PROMPT } from './official-system';
 import { DISCOVERY_AND_PHILOSOPHY } from './discovery';
 import { DECK_FRAMEWORK_DIRECTIVE } from './deck-framework';
+import { MEDIA_GENERATION_CONTRACT } from './media-contract';
 
 export const BASE_SYSTEM_PROMPT = OFFICIAL_DESIGNER_PROMPT;
 
 export interface ComposeInput {
   skillBody?: string | undefined;
   skillName?: string | undefined;
-  skillMode?: 'prototype' | 'deck' | 'template' | 'design-system' | undefined;
+  skillMode?:
+    | 'prototype'
+    | 'deck'
+    | 'template'
+    | 'design-system'
+    | 'image'
+    | 'video'
+    | 'audio'
+    | undefined;
   designSystemBody?: string | undefined;
   designSystemTitle?: string | undefined;
   // Project-level metadata captured by the new-project panel. Drives the
@@ -111,6 +120,24 @@ export function composeSystemPrompt({
     parts.push(`\n\n---\n\n${DECK_FRAMEWORK_DIRECTIVE}`);
   }
 
+  // Image / video / audio surfaces share one invocation contract:
+  // `od media generate`. We pin it LAST (and only when the project is
+  // actually a media surface) so its rules ("don't fabricate bytes",
+  // "shell out to OD_BIN", "reference the returned filename") override
+  // any softer wording earlier in the stack about emitting <artifact>
+  // tags. We fire on either skillMode OR metadata.kind so a media
+  // project without a bound skill still gets the contract.
+  const isMediaSurface =
+    skillMode === 'image' ||
+    skillMode === 'video' ||
+    skillMode === 'audio' ||
+    metadata?.kind === 'image' ||
+    metadata?.kind === 'video' ||
+    metadata?.kind === 'audio';
+  if (isMediaSurface) {
+    parts.push(MEDIA_GENERATION_CONTRACT);
+  }
+
   return parts.join('');
 }
 
@@ -145,6 +172,56 @@ function renderMetadataBlock(
       lines.push(`- **template**: ${metadata.templateLabel}`);
     }
   }
+  if (metadata.kind === 'image') {
+    lines.push(
+      `- **imageModel**: ${metadata.imageModel ?? '(unknown — ask: which image model to use)'}`,
+    );
+    lines.push(
+      `- **aspectRatio**: ${metadata.imageAspect ?? '(unknown — ask: 1:1, 16:9, 9:16, 4:3, 3:4)'}`,
+    );
+    if (metadata.imageStyle) {
+      lines.push(`- **styleNotes**: ${metadata.imageStyle}`);
+    }
+    lines.push('');
+    lines.push(
+      'This is an **image** project. Plan the prompt carefully — describe subject, composition, lighting, palette, and references — then dispatch via the **media generation contract** (see the contract block at the end of this prompt) using `od media generate --surface image --model <imageModel>`. Reference the returned filename in your reply. Do NOT emit `<artifact>` HTML for media surfaces.',
+    );
+  }
+  if (metadata.kind === 'video') {
+    lines.push(
+      `- **videoModel**: ${metadata.videoModel ?? '(unknown — ask: which video model to use)'}`,
+    );
+    lines.push(
+      `- **lengthSeconds**: ${typeof metadata.videoLength === 'number' ? metadata.videoLength : '(unknown — ask: 3s / 5s / 10s)'}`,
+    );
+    lines.push(
+      `- **aspectRatio**: ${metadata.videoAspect ?? '(unknown — ask: 16:9, 9:16, 1:1)'}`,
+    );
+    lines.push('');
+    lines.push(
+      'This is a **video** project. Plan the shotlist (1-3 shots for short clips), describe motion + camera, then dispatch via the **media generation contract** using `od media generate --surface video --model <videoModel> --length <seconds> --aspect <ratio>`. If the active workspace also ships a hyperframes-style interactive-video skill, prefer composing several shorter clips into a timeline rather than one monolithic generation. Do NOT emit `<artifact>` HTML.',
+    );
+  }
+  if (metadata.kind === 'audio') {
+    lines.push(
+      `- **audioKind**: ${metadata.audioKind ?? '(unknown — ask: music / speech / sfx)'}`,
+    );
+    lines.push(
+      `- **audioModel**: ${metadata.audioModel ?? '(unknown — ask: which audio model to use)'}`,
+    );
+    lines.push(
+      `- **durationSeconds**: ${typeof metadata.audioDuration === 'number' ? metadata.audioDuration : '(unknown — ask: target duration)'}`,
+    );
+    if (metadata.voice) {
+      lines.push(`- **voice**: ${metadata.voice}`);
+    } else if (metadata.audioKind === 'speech') {
+      lines.push('- **voice**: (unknown — ask: voice / accent / pacing)');
+    }
+    lines.push('');
+    lines.push(
+      'This is an **audio** project. Music: lock genre + tempo + instrumentation. Speech: confirm script + voice + pacing. SFX: be precise about texture (impact, ambience, foley layer). Then dispatch via the **media generation contract** using `od media generate --surface audio --audio-kind <kind> --model <audioModel> --duration <seconds>` (add `--voice <voice-id>` for speech). Do NOT emit `<artifact>` HTML.',
+    );
+  }
 
   if (metadata.inspirationDesignSystemIds && metadata.inspirationDesignSystemIds.length > 0) {
     lines.push(
diff --git a/src/types.ts b/src/types.ts
index 0f8aa92..a213fa8 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -72,12 +72,34 @@ export interface AgentInfo {
   version?: string | null;
 }
 
+// The four "surfaces" Open Design now produces. Web covers HTML
+// prototypes, decks, docs and templates; Image / Video / Audio cover
+// generated visual / motion / sound artifacts respectively. Every skill
+// and every design system targets one surface ('web' when undeclared);
+// the Examples and Design-systems tabs filter by it so users can navigate
+// the multi-modal catalog without scrolling past surfaces they don't want.
+export type Surface = 'web' | 'image' | 'video' | 'audio';
+
 export interface SkillSummary {
   id: string;
   name: string;
   description: string;
   triggers: string[];
-  mode: 'prototype' | 'deck' | 'template' | 'design-system';
+  // 'design-system' is a meta-mode used by the design-systems registry,
+  // not by user-facing skills; every other mode names the ProjectKind it
+  // creates ('other' has no skill mode). Image / video / audio modes let
+  // the 'Use this prompt' fast-create produce a coherent media project.
+  mode:
+    | 'prototype'
+    | 'deck'
+    | 'template'
+    | 'design-system'
+    | 'image'
+    | 'video'
+    | 'audio';
+  /** Which output surface the skill targets — defaults to 'web' for
+   *  backward compatibility when SKILL.md doesn't declare `od.surface`. */
+  surface?: Surface;
   platform?: 'desktop' | 'mobile' | null;
   scenario?: string | null;
   previewType: string;
@@ -113,6 +135,10 @@ export interface DesignSystemSummary {
   /** 4 representative hex strings extracted from DESIGN.md: [bg, support, fg, accent].
    *  Empty when DESIGN.md doesn't expose its tokens in the bold-and-hex format. */
   swatches?: string[];
+  /** Which surface the system targets. Web is the default — most ship
+   *  HTML/CSS tokens. Image / video / audio systems carry palettes,
+   *  shotlists, voice presets etc. that drive non-web generations. */
+  surface?: Surface;
 }
 
 export interface DesignSystemDetail extends DesignSystemSummary {
@@ -122,6 +148,8 @@ export interface DesignSystemDetail extends DesignSystemSummary {
 export type ProjectFileKind =
   | 'html'
   | 'image'
+  | 'video'
+  | 'audio'
   | 'sketch'
   | 'text'
   | 'code'
@@ -147,7 +175,28 @@ export interface ProjectFile {
 // Per-project metadata captured at creation time. The agent reads this
 // during chat (via the system prompt) and the question-form re-asks for
 // any field that's missing. Each `kind` carries a different shape.
-export type ProjectKind = 'prototype' | 'deck' | 'template' | 'other';
+//
+// 'prototype' / 'deck' / 'template' / 'other' all live on the Web
+// surface; 'image' / 'video' / 'audio' are the new media surfaces.
+export type ProjectKind =
+  | 'prototype'
+  | 'deck'
+  | 'template'
+  | 'other'
+  | 'image'
+  | 'video'
+  | 'audio';
+
+// Aspect ratios offered to image / video projects. Kept as a small fixed
+// vocabulary (vs free-form WxH) so the system prompt can describe them
+// to the agent in concrete terms, and so we can render fixed thumbnails
+// in the picker without a custom-input branch.
+export type MediaAspect = '1:1' | '16:9' | '9:16' | '4:3' | '3:4';
+
+// Audio kind — what *kind* of sound the user wants. The model + prompt
+// pattern differ noticeably between music (Suno-style), TTS (MiniMax,
+// Fish), and SFX/foley, so we capture the intent at create time.
+export type AudioKind = 'music' | 'speech' | 'sfx';
 
 export interface ProjectMetadata {
   kind: ProjectKind;
@@ -172,6 +221,35 @@ export interface ProjectMetadata {
   // generated artifact should *also* draw from. Empty / undefined when the
   // user stayed in single-select mode.
   inspirationDesignSystemIds?: string[];
+
+  // -- Image projects ------------------------------------------------
+  // The model the user wants generations to flow through. We keep this
+  // as a free-form string (rather than a strict enum) so new providers
+  // can be wired up by editing skills alone, without a frontend change.
+  imageModel?: string;
+  // Aspect ratio. Defaults to 1:1 if unset. Drives the canvas the agent
+  // requests from the underlying image API.
+  imageAspect?: MediaAspect;
+  // Free-form palette / mood hint. Carried into the system prompt so the
+  // agent can echo the user's style intent into the upstream prompt.
+  imageStyle?: string;
+
+  // -- Video projects ------------------------------------------------
+  videoModel?: string;
+  // Length in seconds. Most providers cap at 10s today; we don't enforce
+  // here — the skill body is the right place to clamp by model.
+  videoLength?: number;
+  videoAspect?: MediaAspect;
+
+  // -- Audio projects ------------------------------------------------
+  audioKind?: AudioKind;
+  audioModel?: string;
+  // Duration in seconds. Music generators interpret this as song length;
+  // TTS uses it as an upper bound on the spoken passage.
+  audioDuration?: number;
+  // Free-form voice description for TTS (e.g. "warm female narrator,
+  // British English"). Asked for on speech only; shown to the agent if set.
+  voice?: string;
 }
 
 export interface Project {