feat(media): add image/video/audio project kinds via od media generate

Introduce non-web media surfaces (image, video, audio) as first-class project kinds. The unifying contract is: the skill workflow and project metadata tell the agent WHAT to make; one shell command, `od media generate`, is HOW the bytes are produced. Any code-agent CLI with shell access can therefore drive it without bespoke tools.

- Frontend: the New Project panel gains Image/Video/Audio tabs with a model picker, aspect/length/duration controls, and audio kind/voice selection. The Examples and Design Systems tabs gain layered sections. FileViewer renders the generated image/video/audio files.
- Shared registry: src/media/models.ts is the single source of truth for image/video/audio model IDs, aspects, and defaults — consumed by both the picker and the daemon dispatcher.
- Prompts: media-contract.ts is pinned LAST in the system prompt for media surfaces so that its hard rules (call `od media generate`, don't emit binary in `<artifact>`, use only allowed model IDs) win over softer earlier wording.
- Daemon: a new media.js dispatcher plus media-models.js, a JSON view of the registry; cli.js gets the `od media generate` subcommand, wired up via server.js / projects.js so the daemon writes files back into the project dir.
- Skills: audio-jingle, image-poster, and video-shortform seed examples for the three surfaces.

Made-with: Cursor
2026-04-28 22:41:14 +08:00
parent 0b61be5d96
commit 976a6eadf2
28 changed files with 2902 additions and 78 deletions
@@ -25,12 +25,16 @@ export async function listSkills(skillsRoot) {
      const { data, body } = parseFrontmatter(raw);
      const hasAttachments = await dirHasAttachments(dir);
      const mode = data.od?.mode || inferMode(body, data.description);
+      const surface = normalizeSurface(data.od?.surface, mode);
      out.push({
        id: data.name || entry.name,
        name: data.name || entry.name,
        description: data.description || "",
        triggers: Array.isArray(data.triggers) ? data.triggers : [],
        mode,
+        // Surface defaults to inferring from `mode` so legacy SKILL.md
+        // files (no `od.surface` declared) keep classifying correctly.
+        surface,
        platform: normalizePlatform(
          data.od?.platform,
          mode,
@@ -159,6 +163,20 @@ function inferMode(body, description) {
  return "prototype";
 }

+// Surface is the high-level output bucket: "web", "image", "video" or "audio".
+// A recognized `od.surface` (trimmed, lowercased) wins; other values are ignored.
+// Otherwise a `mode` of exactly "image", "video" or "audio" becomes the surface.
+// Everything else gets the safe default "web", so existing skills are unchanged.
+const KNOWN_SURFACES = new Set(["web", "image", "video", "audio"]);
+function normalizeSurface(value, mode) {
+  if (typeof value === "string") {
+    const v = value.trim().toLowerCase();
+    if (KNOWN_SURFACES.has(v)) return v;
+  }
+  if (mode === "image" || mode === "video" || mode === "audio") return mode;
+  return "web";
+}
+
 // Validate platform tag — only desktop / mobile are meaningful for the
 // Examples gallery. Falls back to autodetecting "mobile" from descriptions
 // so legacy skills sort under the right pill without authoring changes.