MAESTRO: fix(db): prevent FK constraint failures on worker restart

Cherry-picked source changes from PR #889 by @Et9797. Fixes #846.

Key changes:
- Add ensureMemorySessionIdRegistered() guard in SessionStore.ts
- Add ON UPDATE CASCADE migration (schema v21) for observations and session_summaries FK constraints
- Change message queue from claim-and-delete to claim-confirm pattern (PendingMessageStore.ts)
- Add spawn deduplication and unrecoverable error detection in SessionRoutes.ts and worker-service.ts
- Add forceInit flag to SDKAgent for stale session recovery

Build artifacts skipped (pre-existing dompurify dep issue). Path fixes (HealthMonitor.ts, worker-utils.ts)
already merged via PR #634.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-06 03:16:17 -05:00
parent 7ed1e576b2
commit da1d2cd36a
20 changed files with 1136 additions and 150 deletions
+45 -6
View File
@@ -77,12 +77,13 @@ export class PendingMessageStore {
}
/**
* Atomically claim and DELETE the next pending message.
* Finds oldest pending -> returns it -> deletes from queue.
* The queue is a pure buffer: claim it, delete it, process in memory.
* Atomically claim the next pending message by marking it as 'processing'.
* CRITICAL FIX: Does NOT delete - message stays in DB until confirmProcessed() is called.
* This prevents message loss if the generator crashes mid-processing.
* Uses a transaction to prevent race conditions.
*/
claimAndDelete(sessionDbId: number): PersistentPendingMessage | null {
const now = Date.now();
const claimTx = this.db.transaction((sessionId: number) => {
const peekStmt = this.db.prepare(`
SELECT * FROM pending_messages
@@ -93,9 +94,14 @@ export class PendingMessageStore {
const msg = peekStmt.get(sessionId) as PersistentPendingMessage | null;
if (msg) {
// Delete immediately - no "processing" state needed
const deleteStmt = this.db.prepare('DELETE FROM pending_messages WHERE id = ?');
deleteStmt.run(msg.id);
// CRITICAL FIX: Mark as 'processing' instead of deleting
// Message will be deleted by confirmProcessed() after successful store
const updateStmt = this.db.prepare(`
UPDATE pending_messages
SET status = 'processing', started_processing_at_epoch = ?
WHERE id = ?
`);
updateStmt.run(now, msg.id);
// Log claim with minimal info (avoid logging full payload)
logger.info('QUEUE', `CLAIMED | sessionDbId=${sessionId} | messageId=${msg.id} | type=${msg.message_type}`, {
@@ -108,6 +114,39 @@ export class PendingMessageStore {
return claimTx(sessionDbId) as PersistentPendingMessage | null;
}
/**
* Confirm a message was successfully processed - DELETE it from the queue.
* CRITICAL: Only call this AFTER the observation/summary has been stored to DB.
* This prevents message loss on generator crash.
*/
confirmProcessed(messageId: number): void {
const stmt = this.db.prepare('DELETE FROM pending_messages WHERE id = ?');
const result = stmt.run(messageId);
if (result.changes > 0) {
logger.debug('QUEUE', `CONFIRMED | messageId=${messageId} | deleted from queue`);
}
}
/**
* Reset stale 'processing' messages back to 'pending' for retry.
* Called on worker startup and periodically to recover from crashes.
* @param thresholdMs Messages processing longer than this are considered stale (default: 5 minutes)
* @returns Number of messages reset
*/
resetStaleProcessingMessages(thresholdMs: number = 5 * 60 * 1000): number {
const cutoff = Date.now() - thresholdMs;
const stmt = this.db.prepare(`
UPDATE pending_messages
SET status = 'pending', started_processing_at_epoch = NULL
WHERE status = 'processing' AND started_processing_at_epoch < ?
`);
const result = stmt.run(cutoff);
if (result.changes > 0) {
logger.info('QUEUE', `RESET_STALE | count=${result.changes} | thresholdMs=${thresholdMs}`);
}
return result.changes;
}
/**
* Get all pending messages for session (ordered by creation time)
*/
+234 -25
View File
@@ -47,6 +47,7 @@ export class SessionStore {
this.renameSessionIdColumns();
this.repairSessionIdColumnRename();
this.addFailedAtEpochColumn();
this.addOnUpdateCascadeToForeignKeys();
}
/**
@@ -101,7 +102,7 @@ export class SessionStore {
type TEXT NOT NULL CHECK(type IN ('decision', 'bugfix', 'feature', 'refactor', 'discovery')),
created_at TEXT NOT NULL,
created_at_epoch INTEGER NOT NULL,
FOREIGN KEY(memory_session_id) REFERENCES sdk_sessions(memory_session_id) ON DELETE CASCADE
FOREIGN KEY(memory_session_id) REFERENCES sdk_sessions(memory_session_id) ON DELETE CASCADE ON UPDATE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_observations_sdk_session ON observations(memory_session_id);
@@ -123,7 +124,7 @@ export class SessionStore {
notes TEXT,
created_at TEXT NOT NULL,
created_at_epoch INTEGER NOT NULL,
FOREIGN KEY(memory_session_id) REFERENCES sdk_sessions(memory_session_id) ON DELETE CASCADE
FOREIGN KEY(memory_session_id) REFERENCES sdk_sessions(memory_session_id) ON DELETE CASCADE ON UPDATE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_session_summaries_sdk_session ON session_summaries(memory_session_id);
@@ -645,11 +646,187 @@ export class SessionStore {
this.db.prepare('INSERT OR IGNORE INTO schema_versions (version, applied_at) VALUES (?, ?)').run(20, new Date().toISOString());
}
/**
* Add ON UPDATE CASCADE to FK constraints on observations and session_summaries (migration 21)
*
* Both tables have FK(memory_session_id) -> sdk_sessions(memory_session_id) with ON DELETE CASCADE
* but missing ON UPDATE CASCADE. This causes FK constraint violations when code updates
* sdk_sessions.memory_session_id while child rows still reference the old value.
*
* SQLite doesn't support ALTER TABLE for FK changes, so we recreate both tables.
*/
private addOnUpdateCascadeToForeignKeys(): void {
const applied = this.db.prepare('SELECT version FROM schema_versions WHERE version = ?').get(21) as SchemaVersion | undefined;
if (applied) return;
logger.debug('DB', 'Adding ON UPDATE CASCADE to FK constraints on observations and session_summaries');
this.db.run('BEGIN TRANSACTION');
try {
// ==========================================
// 1. Recreate observations table
// ==========================================
// Drop FTS triggers first (they reference the observations table)
this.db.run('DROP TRIGGER IF EXISTS observations_ai');
this.db.run('DROP TRIGGER IF EXISTS observations_ad');
this.db.run('DROP TRIGGER IF EXISTS observations_au');
this.db.run(`
CREATE TABLE observations_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
memory_session_id TEXT NOT NULL,
project TEXT NOT NULL,
text TEXT,
type TEXT NOT NULL CHECK(type IN ('decision', 'bugfix', 'feature', 'refactor', 'discovery', 'change')),
title TEXT,
subtitle TEXT,
facts TEXT,
narrative TEXT,
concepts TEXT,
files_read TEXT,
files_modified TEXT,
prompt_number INTEGER,
discovery_tokens INTEGER DEFAULT 0,
created_at TEXT NOT NULL,
created_at_epoch INTEGER NOT NULL,
FOREIGN KEY(memory_session_id) REFERENCES sdk_sessions(memory_session_id) ON DELETE CASCADE ON UPDATE CASCADE
)
`);
this.db.run(`
INSERT INTO observations_new
SELECT id, memory_session_id, project, text, type, title, subtitle, facts,
narrative, concepts, files_read, files_modified, prompt_number,
discovery_tokens, created_at, created_at_epoch
FROM observations
`);
this.db.run('DROP TABLE observations');
this.db.run('ALTER TABLE observations_new RENAME TO observations');
// Recreate indexes
this.db.run(`
CREATE INDEX idx_observations_sdk_session ON observations(memory_session_id);
CREATE INDEX idx_observations_project ON observations(project);
CREATE INDEX idx_observations_type ON observations(type);
CREATE INDEX idx_observations_created ON observations(created_at_epoch DESC);
`);
// Recreate FTS triggers only if observations_fts exists
// (SessionSearch.ensureFTSTables creates it on first use with IF NOT EXISTS)
const hasFTS = (this.db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='observations_fts'").all() as { name: string }[]).length > 0;
if (hasFTS) {
this.db.run(`
CREATE TRIGGER IF NOT EXISTS observations_ai AFTER INSERT ON observations BEGIN
INSERT INTO observations_fts(rowid, title, subtitle, narrative, text, facts, concepts)
VALUES (new.id, new.title, new.subtitle, new.narrative, new.text, new.facts, new.concepts);
END;
CREATE TRIGGER IF NOT EXISTS observations_ad AFTER DELETE ON observations BEGIN
INSERT INTO observations_fts(observations_fts, rowid, title, subtitle, narrative, text, facts, concepts)
VALUES('delete', old.id, old.title, old.subtitle, old.narrative, old.text, old.facts, old.concepts);
END;
CREATE TRIGGER IF NOT EXISTS observations_au AFTER UPDATE ON observations BEGIN
INSERT INTO observations_fts(observations_fts, rowid, title, subtitle, narrative, text, facts, concepts)
VALUES('delete', old.id, old.title, old.subtitle, old.narrative, old.text, old.facts, old.concepts);
INSERT INTO observations_fts(rowid, title, subtitle, narrative, text, facts, concepts)
VALUES (new.id, new.title, new.subtitle, new.narrative, new.text, new.facts, new.concepts);
END;
`);
}
// ==========================================
// 2. Recreate session_summaries table
// ==========================================
this.db.run(`
CREATE TABLE session_summaries_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
memory_session_id TEXT NOT NULL,
project TEXT NOT NULL,
request TEXT,
investigated TEXT,
learned TEXT,
completed TEXT,
next_steps TEXT,
files_read TEXT,
files_edited TEXT,
notes TEXT,
prompt_number INTEGER,
discovery_tokens INTEGER DEFAULT 0,
created_at TEXT NOT NULL,
created_at_epoch INTEGER NOT NULL,
FOREIGN KEY(memory_session_id) REFERENCES sdk_sessions(memory_session_id) ON DELETE CASCADE ON UPDATE CASCADE
)
`);
this.db.run(`
INSERT INTO session_summaries_new
SELECT id, memory_session_id, project, request, investigated, learned,
completed, next_steps, files_read, files_edited, notes,
prompt_number, discovery_tokens, created_at, created_at_epoch
FROM session_summaries
`);
// Drop session_summaries FTS triggers before dropping the table
this.db.run('DROP TRIGGER IF EXISTS session_summaries_ai');
this.db.run('DROP TRIGGER IF EXISTS session_summaries_ad');
this.db.run('DROP TRIGGER IF EXISTS session_summaries_au');
this.db.run('DROP TABLE session_summaries');
this.db.run('ALTER TABLE session_summaries_new RENAME TO session_summaries');
// Recreate indexes
this.db.run(`
CREATE INDEX idx_session_summaries_sdk_session ON session_summaries(memory_session_id);
CREATE INDEX idx_session_summaries_project ON session_summaries(project);
CREATE INDEX idx_session_summaries_created ON session_summaries(created_at_epoch DESC);
`);
// Recreate session_summaries FTS triggers if FTS table exists
const hasSummariesFTS = (this.db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='session_summaries_fts'").all() as { name: string }[]).length > 0;
if (hasSummariesFTS) {
this.db.run(`
CREATE TRIGGER IF NOT EXISTS session_summaries_ai AFTER INSERT ON session_summaries BEGIN
INSERT INTO session_summaries_fts(rowid, request, investigated, learned, completed, next_steps, notes)
VALUES (new.id, new.request, new.investigated, new.learned, new.completed, new.next_steps, new.notes);
END;
CREATE TRIGGER IF NOT EXISTS session_summaries_ad AFTER DELETE ON session_summaries BEGIN
INSERT INTO session_summaries_fts(session_summaries_fts, rowid, request, investigated, learned, completed, next_steps, notes)
VALUES('delete', old.id, old.request, old.investigated, old.learned, old.completed, old.next_steps, old.notes);
END;
CREATE TRIGGER IF NOT EXISTS session_summaries_au AFTER UPDATE ON session_summaries BEGIN
INSERT INTO session_summaries_fts(session_summaries_fts, rowid, request, investigated, learned, completed, next_steps, notes)
VALUES('delete', old.id, old.request, old.investigated, old.learned, old.completed, old.next_steps, old.notes);
INSERT INTO session_summaries_fts(rowid, request, investigated, learned, completed, next_steps, notes)
VALUES (new.id, new.request, new.investigated, new.learned, new.completed, new.next_steps, new.notes);
END;
`);
}
// Record migration
this.db.prepare('INSERT OR IGNORE INTO schema_versions (version, applied_at) VALUES (?, ?)').run(21, new Date().toISOString());
this.db.run('COMMIT');
logger.debug('DB', 'Successfully added ON UPDATE CASCADE to FK constraints');
} catch (error) {
this.db.run('ROLLBACK');
throw error;
}
}
/**
* Update the memory session ID for a session
* Called by SDKAgent when it captures the session ID from the first SDK message
* Also used to RESET to null on stale resume failures (worker-service.ts)
*/
updateMemorySessionId(sessionDbId: number, memorySessionId: string): void {
updateMemorySessionId(sessionDbId: number, memorySessionId: string | null): void {
this.db.prepare(`
UPDATE sdk_sessions
SET memory_session_id = ?
@@ -657,6 +834,37 @@ export class SessionStore {
`).run(memorySessionId, sessionDbId);
}
/**
* Ensures memory_session_id is registered in sdk_sessions before FK-constrained INSERT.
* This fixes Issue #846 where observations fail after worker restart because the
* SDK generates a new memory_session_id but it's not registered in the parent table
* before child records try to reference it.
*
* @param sessionDbId - The database ID of the session
* @param memorySessionId - The memory session ID to ensure is registered
*/
ensureMemorySessionIdRegistered(sessionDbId: number, memorySessionId: string): void {
const session = this.db.prepare(`
SELECT id, memory_session_id FROM sdk_sessions WHERE id = ?
`).get(sessionDbId) as { id: number; memory_session_id: string | null } | undefined;
if (!session) {
throw new Error(`Session ${sessionDbId} not found in sdk_sessions`);
}
if (session.memory_session_id !== memorySessionId) {
this.db.prepare(`
UPDATE sdk_sessions SET memory_session_id = ? WHERE id = ?
`).run(memorySessionId, sessionDbId);
logger.info('DB', 'Registered memory_session_id before storage (FK fix)', {
sessionDbId,
oldId: session.memory_session_id,
newId: memorySessionId
});
}
}
/**
* Get recent session summaries for a project
*/
@@ -1151,39 +1359,40 @@ export class SessionStore {
* - Prompt #2+: session_id exists → INSERT ignored, fetch existing ID
* - Result: Same database ID returned for all prompts in conversation
*
* WHY THIS MATTERS:
* - NO "does session exist?" checks needed anywhere
* - NO risk of creating duplicate sessions
* - ALL hooks automatically connected via session_id
* - SAVE hook observations go to correct session (same session_id)
* - SDKAgent continuation prompt has correct context (same session_id)
*
* This is KISS in action: Trust the database UNIQUE constraint and
* INSERT OR IGNORE to handle both creation and lookup elegantly.
* Pure get-or-create: never modifies memory_session_id.
* Multi-terminal isolation is handled by ON UPDATE CASCADE at the schema level.
*/
createSDKSession(contentSessionId: string, project: string, userPrompt: string): number {
const now = new Date();
const nowEpoch = now.getTime();
// INSERT OR IGNORE to create session, then backfill project if it was created empty
// Session reuse: Return existing session ID if already created for this contentSessionId.
const existing = this.db.prepare(`
SELECT id FROM sdk_sessions WHERE content_session_id = ?
`).get(contentSessionId) as { id: number } | undefined;
if (existing) {
// Backfill project if session was created by another hook with empty project
if (project) {
this.db.prepare(`
UPDATE sdk_sessions SET project = ?
WHERE content_session_id = ? AND (project IS NULL OR project = '')
`).run(project, contentSessionId);
}
return existing.id;
}
// New session - insert fresh row
// NOTE: memory_session_id starts as NULL. It is captured by SDKAgent from the first SDK
// response and stored via updateMemorySessionId(). CRITICAL: memory_session_id must NEVER
// equal contentSessionId - that would inject memory messages into the user's transcript!
// response and stored via ensureMemorySessionIdRegistered(). CRITICAL: memory_session_id
// must NEVER equal contentSessionId - that would inject memory messages into the user's transcript!
this.db.prepare(`
INSERT OR IGNORE INTO sdk_sessions
INSERT INTO sdk_sessions
(content_session_id, memory_session_id, project, user_prompt, started_at, started_at_epoch, status)
VALUES (?, NULL, ?, ?, ?, ?, 'active')
`).run(contentSessionId, project, userPrompt, now.toISOString(), nowEpoch);
// Backfill project if session was created by another hook with empty project
if (project) {
this.db.prepare(`
UPDATE sdk_sessions SET project = ?
WHERE content_session_id = ? AND (project IS NULL OR project = '')
`).run(project, contentSessionId);
}
// Return existing or new ID
// Return new ID
const row = this.db.prepare('SELECT id FROM sdk_sessions WHERE content_session_id = ?')
.get(contentSessionId) as { id: number };
return row.id;
+25 -20
View File
@@ -14,12 +14,8 @@ import { logger } from '../../../utils/logger.js';
* - Prompt #2+: session_id exists -> INSERT ignored, fetch existing ID
* - Result: Same database ID returned for all prompts in conversation
*
* WHY THIS MATTERS:
* - NO "does session exist?" checks needed anywhere
* - NO risk of creating duplicate sessions
* - ALL hooks automatically connected via session_id
* - SAVE hook observations go to correct session (same session_id)
* - SDKAgent continuation prompt has correct context (same session_id)
* Pure get-or-create: never modifies memory_session_id.
* Multi-terminal isolation is handled by ON UPDATE CASCADE at the schema level.
*/
export function createSDKSession(
db: Database,
@@ -30,25 +26,33 @@ export function createSDKSession(
const now = new Date();
const nowEpoch = now.getTime();
// INSERT OR IGNORE to create session, then backfill project if it was created empty
// Check for existing session
const existing = db.prepare(`
SELECT id FROM sdk_sessions WHERE content_session_id = ?
`).get(contentSessionId) as { id: number } | undefined;
if (existing) {
// Backfill project if session was created by another hook with empty project
if (project) {
db.prepare(`
UPDATE sdk_sessions SET project = ?
WHERE content_session_id = ? AND (project IS NULL OR project = '')
`).run(project, contentSessionId);
}
return existing.id;
}
// New session - insert fresh row
// NOTE: memory_session_id starts as NULL. It is captured by SDKAgent from the first SDK
// response and stored via updateMemorySessionId(). CRITICAL: memory_session_id must NEVER
// equal contentSessionId - that would inject memory messages into the user's transcript!
// response and stored via ensureMemorySessionIdRegistered(). CRITICAL: memory_session_id
// must NEVER equal contentSessionId - that would inject memory messages into the user's transcript!
db.prepare(`
INSERT OR IGNORE INTO sdk_sessions
INSERT INTO sdk_sessions
(content_session_id, memory_session_id, project, user_prompt, started_at, started_at_epoch, status)
VALUES (?, NULL, ?, ?, ?, ?, 'active')
`).run(contentSessionId, project, userPrompt, now.toISOString(), nowEpoch);
// Backfill project if session was created by another hook with empty project
if (project) {
db.prepare(`
UPDATE sdk_sessions SET project = ?
WHERE content_session_id = ? AND (project IS NULL OR project = '')
`).run(project, contentSessionId);
}
// Return existing or new ID
// Return new ID
const row = db.prepare('SELECT id FROM sdk_sessions WHERE content_session_id = ?')
.get(contentSessionId) as { id: number };
return row.id;
@@ -57,11 +61,12 @@ export function createSDKSession(
/**
* Update the memory session ID for a session
* Called by SDKAgent when it captures the session ID from the first SDK message
* Also used to RESET to null on stale resume failures (worker-service.ts)
*/
export function updateMemorySessionId(
db: Database,
sessionDbId: number,
memorySessionId: string
memorySessionId: string | null
): void {
db.prepare(`
UPDATE sdk_sessions