MAESTRO: fix(db): prevent FK constraint failures on worker restart

Cherry-picked source changes from PR #889 by @Et9797. Fixes #846.

Key changes:
- Add ensureMemorySessionIdRegistered() guard in SessionStore.ts
- Add ON UPDATE CASCADE migration (schema v21) for observations and session_summaries FK constraints
- Change message queue from claim-and-delete to claim-confirm pattern (PendingMessageStore.ts)
- Add spawn deduplication and unrecoverable error detection in SessionRoutes.ts and worker-service.ts
- Add forceInit flag to SDKAgent for stale session recovery

Build artifacts skipped (pre-existing dompurify dep issue). Path fixes (HealthMonitor.ts, worker-utils.ts)
already merged via PR #634.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-06 03:16:17 -05:00
parent 7ed1e576b2
commit da1d2cd36a
20 changed files with 1136 additions and 150 deletions
+45 -6
View File
@@ -77,12 +77,13 @@ export class PendingMessageStore {
}
/**
* Atomically claim and DELETE the next pending message.
* Finds oldest pending -> returns it -> deletes from queue.
* The queue is a pure buffer: claim it, delete it, process in memory.
* Atomically claim the next pending message by marking it as 'processing'.
* CRITICAL FIX: Does NOT delete - message stays in DB until confirmProcessed() is called.
* This prevents message loss if the generator crashes mid-processing.
* Uses a transaction to prevent race conditions.
*/
claimAndDelete(sessionDbId: number): PersistentPendingMessage | null {
const now = Date.now();
const claimTx = this.db.transaction((sessionId: number) => {
const peekStmt = this.db.prepare(`
SELECT * FROM pending_messages
@@ -93,9 +94,14 @@ export class PendingMessageStore {
const msg = peekStmt.get(sessionId) as PersistentPendingMessage | null;
if (msg) {
// Delete immediately - no "processing" state needed
const deleteStmt = this.db.prepare('DELETE FROM pending_messages WHERE id = ?');
deleteStmt.run(msg.id);
// CRITICAL FIX: Mark as 'processing' instead of deleting
// Message will be deleted by confirmProcessed() after successful store
const updateStmt = this.db.prepare(`
UPDATE pending_messages
SET status = 'processing', started_processing_at_epoch = ?
WHERE id = ?
`);
updateStmt.run(now, msg.id);
// Log claim with minimal info (avoid logging full payload)
logger.info('QUEUE', `CLAIMED | sessionDbId=${sessionId} | messageId=${msg.id} | type=${msg.message_type}`, {
@@ -108,6 +114,39 @@ export class PendingMessageStore {
return claimTx(sessionDbId) as PersistentPendingMessage | null;
}
/**
 * Acknowledge a claimed message by removing its row from pending_messages.
 *
 * This is the "confirm" half of the claim-confirm queue pattern: callers are
 * expected to invoke it only AFTER the resulting observation/summary has been
 * persisted, so a crash between claim and confirm leaves the message in the
 * queue instead of losing it.
 *
 * If no row has the given id, nothing is deleted and nothing is logged.
 *
 * @param messageId Primary key (id) of the pending_messages row to delete.
 */
confirmProcessed(messageId: number): void {
  const { changes: deletedRows } = this.db
    .prepare('DELETE FROM pending_messages WHERE id = ?')
    .run(messageId);

  // Stay quiet when the DELETE matched no row.
  if (deletedRows <= 0) {
    return;
  }

  logger.debug('QUEUE', `CONFIRMED | messageId=${messageId} | deleted from queue`);
}
/**
 * Put stale 'processing' messages back into the 'pending' state so they can be
 * claimed again. Intended to run on worker startup and periodically afterwards
 * as crash recovery.
 *
 * A row is stale when its status is 'processing' and its
 * started_processing_at_epoch is earlier than (now - thresholdMs). Resetting a
 * row also clears started_processing_at_epoch.
 *
 * NOTE(review): a 'processing' row whose started_processing_at_epoch is NULL
 * would presumably never satisfy the `<` comparison (SQL NULL semantics) and so
 * would not be reset here - confirm such rows cannot exist.
 *
 * @param thresholdMs Age in milliseconds beyond which a 'processing' message counts as stale (default: 5 minutes).
 * @returns Number of messages that were reset to 'pending'.
 */
resetStaleProcessingMessages(thresholdMs: number = 5 * 60 * 1000): number {
  const resetSql = `
UPDATE pending_messages
SET status = 'pending', started_processing_at_epoch = NULL
WHERE status = 'processing' AND started_processing_at_epoch < ?
`;

  // Anything that started processing before this instant is treated as abandoned.
  const staleBefore = Date.now() - thresholdMs;
  const { changes: resetCount } = this.db.prepare(resetSql).run(staleBefore);

  // Only log when the sweep actually recovered something.
  if (resetCount > 0) {
    logger.info('QUEUE', `RESET_STALE | count=${resetCount} | thresholdMs=${thresholdMs}`);
  }

  return resetCount;
}
/**
* Get all pending messages for session (ordered by creation time)
*/