MAESTRO: fix(db): prevent FK constraint failures on worker restart

Cherry-picked source changes from PR #889 by @Et9797. Fixes #846.

Key changes:
- Add ensureMemorySessionIdRegistered() guard in SessionStore.ts
- Add ON UPDATE CASCADE migration (schema v21) for observations and session_summaries FK constraints
- Change message queue from claim-and-delete to claim-confirm pattern (PendingMessageStore.ts)
- Add spawn deduplication and unrecoverable error detection in SessionRoutes.ts and worker-service.ts
- Add forceInit flag to SDKAgent for stale session recovery

Build artifacts skipped (pre-existing dompurify dep issue). Path fixes (HealthMonitor.ts, worker-utils.ts)
already merged via PR #634.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-06 03:16:17 -05:00
parent 7ed1e576b2
commit da1d2cd36a
20 changed files with 1136 additions and 150 deletions
+4
View File
@@ -186,6 +186,10 @@ export class GeminiAgent {
let lastCwd: string | undefined;
for await (const message of this.sessionManager.getMessageIterator(session.sessionDbId)) {
// CLAIM-CONFIRM: Track message ID for confirmProcessed() after successful storage
// The message is now in 'processing' status in DB until ResponseProcessor calls confirmProcessed()
session.processingMessageIds.push(message._persistentId);
// Capture cwd from each message for worktree support
if (message.cwd) {
lastCwd = message.cwd;
+4
View File
@@ -145,6 +145,10 @@ export class OpenRouterAgent {
// Process pending messages
for await (const message of this.sessionManager.getMessageIterator(session.sessionDbId)) {
// CLAIM-CONFIRM: Track message ID for confirmProcessed() after successful storage
// The message is now in 'processing' status in DB until ResponseProcessor calls confirmProcessed()
session.processingMessageIds.push(message._persistentId);
// Capture cwd from messages for proper worktree support
if (message.cwd) {
lastCwd = message.cwd;
+50 -15
View File
@@ -72,10 +72,21 @@ export class SDKAgent {
// CRITICAL: Only resume if:
// 1. memorySessionId exists (was captured from a previous SDK response)
// 2. lastPromptNumber > 1 (this is a continuation within the same SDK session)
// 3. forceInit is NOT set (stale session recovery clears this)
// On worker restart or crash recovery, memorySessionId may exist from a previous
// SDK session but we must NOT resume because the SDK context was lost.
// NEVER use contentSessionId for resume - that would inject messages into the user's transcript!
const hasRealMemorySessionId = !!session.memorySessionId;
const shouldResume = hasRealMemorySessionId && session.lastPromptNumber > 1 && !session.forceInit;
// Clear forceInit after using it
if (session.forceInit) {
logger.info('SDK', 'forceInit flag set, starting fresh SDK session', {
sessionDbId: session.sessionDbId,
previousMemorySessionId: session.memorySessionId
});
session.forceInit = false;
}
// Build isolated environment from ~/.claude-mem/.env
// This prevents Issue #733: random ANTHROPIC_API_KEY from project .env files
@@ -88,15 +99,15 @@ export class SDKAgent {
contentSessionId: session.contentSessionId,
memorySessionId: session.memorySessionId,
hasRealMemorySessionId,
resume_parameter: hasRealMemorySessionId ? session.memorySessionId : '(none - fresh start)',
shouldResume,
resume_parameter: shouldResume ? session.memorySessionId : '(none - fresh start)',
lastPromptNumber: session.lastPromptNumber,
authMethod
});
// Debug-level alignment logs for detailed tracing
if (session.lastPromptNumber > 1) {
const willResume = hasRealMemorySessionId;
logger.debug('SDK', `[ALIGNMENT] Resume Decision | contentSessionId=${session.contentSessionId} | memorySessionId=${session.memorySessionId} | prompt#=${session.lastPromptNumber} | hasRealMemorySessionId=${hasRealMemorySessionId} | willResume=${willResume} | resumeWith=${willResume ? session.memorySessionId : 'NONE'}`);
logger.debug('SDK', `[ALIGNMENT] Resume Decision | contentSessionId=${session.contentSessionId} | memorySessionId=${session.memorySessionId} | prompt#=${session.lastPromptNumber} | hasRealMemorySessionId=${hasRealMemorySessionId} | shouldResume=${shouldResume} | resumeWith=${shouldResume ? session.memorySessionId : 'NONE'}`);
} else {
// INIT prompt - never resume even if memorySessionId exists (stale from previous session)
const hasStaleMemoryId = hasRealMemorySessionId;
@@ -119,10 +130,8 @@ export class SDKAgent {
// Isolate observer sessions - they'll appear under project "observer-sessions"
// instead of polluting user's actual project resume lists
cwd: OBSERVER_SESSIONS_DIR,
// Only resume if BOTH: (1) we have a memorySessionId AND (2) this isn't the first prompt
// On worker restart, memorySessionId may exist from a previous SDK session but we
// need to start fresh since the SDK context was lost
...(hasRealMemorySessionId && session.lastPromptNumber > 1 && { resume: session.memorySessionId }),
// Only resume if shouldResume is true (memorySessionId exists, not first prompt, not forceInit)
...(shouldResume && { resume: session.memorySessionId }),
disallowedTools,
abortController: session.abortController,
pathToClaudeCodeExecutable: claudePath,
@@ -134,21 +143,35 @@ export class SDKAgent {
// Process SDK messages
for await (const message of queryResult) {
// Capture memory session ID from first SDK message (any type has session_id)
// This enables resume for subsequent generator starts within the same user session
if (!session.memorySessionId && message.session_id) {
// Capture or update memory session ID from SDK message
// IMPORTANT: The SDK may return a DIFFERENT session_id on resume than what we sent!
// We must always sync the DB to match what the SDK actually uses.
//
// MULTI-TERMINAL COLLISION FIX (FK constraint bug):
// Use ensureMemorySessionIdRegistered() instead of updateMemorySessionId() because:
// 1. It's idempotent - safe to call multiple times
// 2. It verifies the update happened (SELECT before UPDATE)
// 3. Consistent with ResponseProcessor's usage pattern
// This ensures FK constraint compliance BEFORE any observations are stored.
if (message.session_id && message.session_id !== session.memorySessionId) {
const previousId = session.memorySessionId;
session.memorySessionId = message.session_id;
// Persist to database for cross-restart recovery
this.dbManager.getSessionStore().updateMemorySessionId(
// Persist to database IMMEDIATELY for FK constraint compliance
// This must happen BEFORE any observations referencing this ID are stored
this.dbManager.getSessionStore().ensureMemorySessionIdRegistered(
session.sessionDbId,
message.session_id
);
// Verify the update by reading back from DB
const verification = this.dbManager.getSessionStore().getSessionById(session.sessionDbId);
const dbVerified = verification?.memory_session_id === message.session_id;
logger.info('SESSION', `MEMORY_ID_CAPTURED | sessionDbId=${session.sessionDbId} | memorySessionId=${message.session_id} | dbVerified=${dbVerified}`, {
const logMessage = previousId
? `MEMORY_ID_CHANGED | sessionDbId=${session.sessionDbId} | from=${previousId} | to=${message.session_id} | dbVerified=${dbVerified}`
: `MEMORY_ID_CAPTURED | sessionDbId=${session.sessionDbId} | memorySessionId=${message.session_id} | dbVerified=${dbVerified}`;
logger.info('SESSION', logMessage, {
sessionId: session.sessionDbId,
memorySessionId: message.session_id
memorySessionId: message.session_id,
previousId
});
if (!dbVerified) {
logger.error('SESSION', `MEMORY_ID_MISMATCH | sessionDbId=${session.sessionDbId} | expected=${message.session_id} | got=${verification?.memory_session_id}`, {
@@ -156,7 +179,7 @@ export class SDKAgent {
});
}
// Debug-level alignment log for detailed tracing
logger.debug('SDK', `[ALIGNMENT] Captured | contentSessionId=${session.contentSessionId} → memorySessionId=${message.session_id} | Future prompts will resume with this ID`);
logger.debug('SDK', `[ALIGNMENT] ${previousId ? 'Updated' : 'Captured'} | contentSessionId=${session.contentSessionId} → memorySessionId=${message.session_id} | Future prompts will resume with this ID`);
}
// Handle assistant messages
@@ -166,6 +189,14 @@ export class SDKAgent {
? content.filter((c: any) => c.type === 'text').map((c: any) => c.text).join('\n')
: typeof content === 'string' ? content : '';
// Check for context overflow - prevents infinite retry loops
if (textContent.includes('prompt is too long') ||
textContent.includes('context window')) {
logger.error('SDK', 'Context overflow detected - terminating session');
session.abortController.abort();
return;
}
const responseSize = textContent.length;
// Capture token state BEFORE updating (for delta calculation)
@@ -317,6 +348,10 @@ export class SDKAgent {
// Consume pending messages from SessionManager (event-driven, no polling)
for await (const message of this.sessionManager.getMessageIterator(session.sessionDbId)) {
// CLAIM-CONFIRM: Track message ID for confirmProcessed() after successful storage
// The message is now in 'processing' status in DB until ResponseProcessor calls confirmProcessed()
session.processingMessageIds.push(message._persistentId);
// Capture cwd from each message for worktree support
if (message.cwd) {
cwdTracker.lastCwd = message.cwd;
+2 -1
View File
@@ -154,7 +154,8 @@ export class SessionManager {
earliestPendingTimestamp: null,
conversationHistory: [], // Initialize empty - will be populated by agents
currentProvider: null, // Will be set when generator starts
consecutiveRestarts: 0 // Track consecutive restart attempts to prevent infinite loops
consecutiveRestarts: 0, // Track consecutive restart attempts to prevent infinite loops
processingMessageIds: [] // CLAIM-CONFIRM: Track message IDs for confirmProcessed()
};
logger.debug('SESSION', 'Creating new session object (memorySessionId cleared to prevent stale resume)', {
@@ -76,6 +76,14 @@ export async function processAgentResponse(
throw new Error('Cannot store observations: memorySessionId not yet captured');
}
// SAFETY NET (Issue #846 / Multi-terminal FK fix):
// The PRIMARY fix is in SDKAgent.ts where ensureMemorySessionIdRegistered() is called
// immediately when the SDK returns a memory_session_id. This call is a defensive safety net
// in case the DB was somehow not updated (race condition, crash, etc.).
// In multi-terminal scenarios, createSDKSession() now resets memory_session_id to NULL
// for each new generator, ensuring clean isolation.
sessionStore.ensureMemorySessionIdRegistered(session.sessionDbId, session.memorySessionId);
// Log pre-storage with session ID chain for verification
logger.info('DB', `STORING | sessionDbId=${session.sessionDbId} | memorySessionId=${session.memorySessionId} | obsCount=${observations.length} | hasSummary=${!!summaryForStore}`, {
sessionId: session.sessionDbId,
@@ -100,6 +108,18 @@ export async function processAgentResponse(
memorySessionId: session.memorySessionId
});
// CLAIM-CONFIRM: Now that storage succeeded, confirm all processing messages (delete from queue)
// This is the critical step that prevents message loss on generator crash
const pendingStore = sessionManager.getPendingMessageStore();
for (const messageId of session.processingMessageIds) {
pendingStore.confirmProcessed(messageId);
}
if (session.processingMessageIds.length > 0) {
logger.debug('QUEUE', `CONFIRMED_BATCH | sessionDbId=${session.sessionDbId} | count=${session.processingMessageIds.length} | ids=[${session.processingMessageIds.join(',')}]`);
}
// Clear the tracking array after confirmation
session.processingMessageIds = [];
// AFTER transaction commits - async operations (can fail safely without data loss)
await syncAndBroadcastObservations(
observations,
@@ -24,6 +24,8 @@ import { USER_SETTINGS_PATH } from '../../../../shared/paths.js';
export class SessionRoutes extends BaseRouteHandler {
private completionHandler: SessionCompletionHandler;
private spawnInProgress = new Map<number, boolean>();
private crashRecoveryScheduled = new Set<number>();
constructor(
private sessionManager: SessionManager,
@@ -91,10 +93,17 @@ export class SessionRoutes extends BaseRouteHandler {
const session = this.sessionManager.getSession(sessionDbId);
if (!session) return;
// GUARD: Prevent duplicate spawns
if (this.spawnInProgress.get(sessionDbId)) {
logger.debug('SESSION', 'Spawn already in progress, skipping', { sessionDbId, source });
return;
}
const selectedProvider = this.getSelectedProvider();
// Start generator if not running
if (!session.generatorPromise) {
this.spawnInProgress.set(sessionDbId, true);
this.startGeneratorWithProvider(session, selectedProvider, source);
return;
}
@@ -135,9 +144,13 @@ export class SessionRoutes extends BaseRouteHandler {
const agent = provider === 'openrouter' ? this.openRouterAgent : (provider === 'gemini' ? this.geminiAgent : this.sdkAgent);
const agentName = provider === 'openrouter' ? 'OpenRouter' : (provider === 'gemini' ? 'Gemini' : 'Claude SDK');
// Use database count for accurate telemetry (in-memory array is always empty due to FK constraint fix)
const pendingStore = this.sessionManager.getPendingMessageStore();
const actualQueueDepth = pendingStore.getPendingCount(session.sessionDbId);
logger.info('SESSION', `Generator auto-starting (${source}) using ${agentName}`, {
sessionId: session.sessionDbId,
queueDepth: session.pendingMessages.length,
queueDepth: actualQueueDepth,
historyLength: session.conversationHistory.length
});
@@ -173,6 +186,7 @@ export class SessionRoutes extends BaseRouteHandler {
})
.finally(() => {
const sessionDbId = session.sessionDbId;
this.spawnInProgress.delete(sessionDbId);
const wasAborted = session.abortController.signal.aborted;
if (wasAborted) {
@@ -196,6 +210,12 @@ export class SessionRoutes extends BaseRouteHandler {
const MAX_CONSECUTIVE_RESTARTS = 3;
if (pendingCount > 0) {
// GUARD: Prevent duplicate crash recovery spawns
if (this.crashRecoveryScheduled.has(sessionDbId)) {
logger.debug('SESSION', 'Crash recovery already scheduled', { sessionDbId });
return;
}
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1;
if (session.consecutiveRestarts > MAX_CONSECUTIVE_RESTARTS) {
@@ -223,11 +243,14 @@ export class SessionRoutes extends BaseRouteHandler {
session.abortController = new AbortController();
oldController.abort();
this.crashRecoveryScheduled.add(sessionDbId);
// Exponential backoff: 1s, 2s, 4s for subsequent restarts
const backoffMs = Math.min(1000 * Math.pow(2, session.consecutiveRestarts - 1), 8000);
// Delay before restart with exponential backoff
setTimeout(() => {
this.crashRecoveryScheduled.delete(sessionDbId);
const stillExists = this.sessionManager.getSession(sessionDbId);
if (stillExists && !stillExists.generatorPromise) {
this.startGeneratorWithProvider(stillExists, this.getSelectedProvider(), 'crash-recovery');
@@ -398,11 +421,15 @@ export class SessionRoutes extends BaseRouteHandler {
return;
}
// Use database count for accurate queue length (in-memory array is always empty due to FK constraint fix)
const pendingStore = this.sessionManager.getPendingMessageStore();
const queueLength = pendingStore.getPendingCount(sessionDbId);
res.json({
status: 'active',
sessionDbId,
project: session.project,
queueLength: session.pendingMessages.length,
queueLength,
uptime: Date.now() - session.startTime
});
});