MAESTRO: fix(db): prevent FK constraint failures on worker restart

Cherry-picked source changes from PR #889 by @Et9797. Fixes #846.

Key changes:
- Add ensureMemorySessionIdRegistered() guard in SessionStore.ts
- Add ON UPDATE CASCADE migration (schema v21) for observations and session_summaries FK constraints
- Change message queue from claim-and-delete to claim-confirm pattern (PendingMessageStore.ts)
- Add spawn deduplication and unrecoverable error detection in SessionRoutes.ts and worker-service.ts
- Add forceInit flag to SDKAgent for stale session recovery

Build artifacts skipped (pre-existing dompurify dep issue). Path fixes (HealthMonitor.ts, worker-utils.ts)
already merged via PR #634.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-06 03:16:17 -05:00
parent 7ed1e576b2
commit da1d2cd36a
20 changed files with 1136 additions and 150 deletions
+77 -4
View File
@@ -190,6 +190,7 @@ export class WorkerService {
this.broadcastProcessingStatus();
});
// Initialize MCP client
// Empty capabilities object: this client only calls tools, doesn't expose any
this.mcpClient = new Client({
@@ -319,13 +320,12 @@ export class WorkerService {
await this.dbManager.initialize();
// Recover stuck messages from previous crashes
// Reset any messages that were processing when worker died
const { PendingMessageStore } = await import('./sqlite/PendingMessageStore.js');
const pendingStore = new PendingMessageStore(this.dbManager.getSessionStore().db, 3);
const STUCK_THRESHOLD_MS = 5 * 60 * 1000;
const resetCount = pendingStore.resetStuckMessages(STUCK_THRESHOLD_MS);
const resetCount = pendingStore.resetStaleProcessingMessages(0); // 0 = reset ALL processing
if (resetCount > 0) {
logger.info('SYSTEM', `Recovered ${resetCount} stuck messages from previous session`, { thresholdMinutes: 5 });
logger.info('SYSTEM', `Reset ${resetCount} stale processing messages to pending`);
}
// Initialize search services
@@ -421,10 +421,43 @@ export class WorkerService {
const agent = this.getActiveAgent();
const providerName = agent.constructor.name;
// Before starting generator, check if AbortController is already aborted
// This can happen after a previous generator was aborted but the session still has pending work
if (session.abortController.signal.aborted) {
logger.debug('SYSTEM', 'Replacing aborted AbortController before starting generator', {
sessionId: session.sessionDbId
});
session.abortController = new AbortController();
}
// Track whether generator failed with an unrecoverable error to prevent infinite restart loops
let hadUnrecoverableError = false;
logger.info('SYSTEM', `Starting generator (${source}) using ${providerName}`, { sessionId: sid });
session.generatorPromise = agent.startSession(session, this)
.catch(async (error: unknown) => {
const errorMessage = (error as Error)?.message || '';
// Detect unrecoverable errors that should NOT trigger restart
// These errors will fail immediately on retry, causing infinite loops
const unrecoverablePatterns = [
'Claude executable not found',
'CLAUDE_CODE_PATH',
'ENOENT',
'spawn',
];
if (unrecoverablePatterns.some(pattern => errorMessage.includes(pattern))) {
hadUnrecoverableError = true;
logger.error('SDK', 'Unrecoverable generator error - will NOT restart', {
sessionId: session.sessionDbId,
project: session.project,
errorMessage
});
return;
}
// Fallback for terminated SDK sessions (provider abstraction)
if (this.isSessionTerminatedError(error)) {
logger.warn('SDK', 'SDK resume failed, falling back to standalone processing', {
sessionId: session.sessionDbId,
@@ -433,6 +466,20 @@ export class WorkerService {
});
return this.runFallbackForTerminatedSession(session, error);
}
// Detect stale resume failures - SDK session context was lost
if ((errorMessage.includes('aborted by user') || errorMessage.includes('No conversation found'))
&& session.memorySessionId) {
logger.warn('SDK', 'Detected stale resume failure, clearing memorySessionId for fresh start', {
sessionId: session.sessionDbId,
memorySessionId: session.memorySessionId,
errorMessage
});
// Clear stale memorySessionId and force fresh init on next attempt
this.dbManager.getSessionStore().updateMemorySessionId(session.sessionDbId, null);
session.memorySessionId = null;
session.forceInit = true;
}
logger.error('SDK', 'Session generator failed', {
sessionId: session.sessionDbId,
project: session.project,
@@ -442,6 +489,32 @@ export class WorkerService {
})
.finally(() => {
session.generatorPromise = null;
// Do NOT restart after unrecoverable errors - prevents infinite loops
if (hadUnrecoverableError) {
logger.warn('SYSTEM', 'Skipping restart due to unrecoverable error', {
sessionId: session.sessionDbId
});
this.broadcastProcessingStatus();
return;
}
// Check if there's pending work that needs processing with a fresh AbortController
const { PendingMessageStore } = require('./sqlite/PendingMessageStore.js');
const pendingStore = new PendingMessageStore(this.dbManager.getSessionStore().db, 3);
const pendingCount = pendingStore.getPendingCount(session.sessionDbId);
if (pendingCount > 0) {
logger.info('SYSTEM', 'Pending work remains after generator exit, restarting with fresh AbortController', {
sessionId: session.sessionDbId,
pendingCount
});
// Reset AbortController for restart
session.abortController = new AbortController();
// Restart processor
this.startSessionProcessor(session, 'pending-work-restart');
}
this.broadcastProcessingStatus();
});
}