fix: remove agent pool timeout data loss
This commit is contained in:
+476
-430
File diff suppressed because one or more lines are too long
@@ -150,7 +150,7 @@ export class ClaudeProvider {
|
|||||||
|
|
||||||
const settings = SettingsDefaultsManager.loadFromFile(USER_SETTINGS_PATH);
|
const settings = SettingsDefaultsManager.loadFromFile(USER_SETTINGS_PATH);
|
||||||
const maxConcurrent = parseInt(settings.CLAUDE_MEM_MAX_CONCURRENT_AGENTS, 10) || 2;
|
const maxConcurrent = parseInt(settings.CLAUDE_MEM_MAX_CONCURRENT_AGENTS, 10) || 2;
|
||||||
await waitForSlot(maxConcurrent, 60_000);
|
await waitForSlot(maxConcurrent);
|
||||||
|
|
||||||
const isolatedEnv = sanitizeEnv(await buildIsolatedEnvWithFreshOAuth());
|
const isolatedEnv = sanitizeEnv(await buildIsolatedEnvWithFreshOAuth());
|
||||||
const authMethod = getAuthMethodDescription();
|
const authMethod = getAuthMethodDescription();
|
||||||
|
|||||||
@@ -147,16 +147,16 @@ export class SessionRoutes extends BaseRouteHandler {
|
|||||||
|
|
||||||
const pendingStore = this.sessionManager.getPendingMessageStore();
|
const pendingStore = this.sessionManager.getPendingMessageStore();
|
||||||
try {
|
try {
|
||||||
const cleared = pendingStore.clearPendingForSession(session.sessionDbId);
|
const reset = pendingStore.resetProcessingToPending(session.sessionDbId);
|
||||||
if (cleared > 0) {
|
if (reset > 0) {
|
||||||
logger.error('SESSION', `Cleared pending messages after generator error`, {
|
logger.warn('SESSION', `Reset processing messages after generator error`, {
|
||||||
sessionId: session.sessionDbId,
|
sessionId: session.sessionDbId,
|
||||||
cleared
|
reset
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} catch (dbError) {
|
} catch (dbError) {
|
||||||
const normalizedDbError = dbError instanceof Error ? dbError : new Error(String(dbError));
|
const normalizedDbError = dbError instanceof Error ? dbError : new Error(String(dbError));
|
||||||
logger.error('HTTP', 'Failed to clear pending messages', {
|
logger.error('HTTP', 'Failed to reset processing messages after generator error', {
|
||||||
sessionId: session.sessionDbId
|
sessionId: session.sessionDbId
|
||||||
}, normalizedDbError);
|
}, normalizedDbError);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -174,9 +174,11 @@ export class ProcessRegistry {
|
|||||||
|
|
||||||
unregister(id: string): void {
|
unregister(id: string): void {
|
||||||
this.initialize();
|
this.initialize();
|
||||||
|
const existing = this.entries.get(id);
|
||||||
this.entries.delete(id);
|
this.entries.delete(id);
|
||||||
this.runtimeProcesses.delete(id);
|
this.runtimeProcesses.delete(id);
|
||||||
this.persist();
|
this.persist();
|
||||||
|
if (existing?.type === 'sdk') notifySlotAvailable();
|
||||||
}
|
}
|
||||||
|
|
||||||
clear(): void {
|
clear(): void {
|
||||||
@@ -213,16 +215,19 @@ export class ProcessRegistry {
|
|||||||
this.initialize();
|
this.initialize();
|
||||||
|
|
||||||
let removed = 0;
|
let removed = 0;
|
||||||
|
let removedSdk = 0;
|
||||||
for (const [id, info] of this.entries) {
|
for (const [id, info] of this.entries) {
|
||||||
if (isPidAlive(info.pid)) continue;
|
if (isPidAlive(info.pid)) continue;
|
||||||
this.entries.delete(id);
|
this.entries.delete(id);
|
||||||
this.runtimeProcesses.delete(id);
|
this.runtimeProcesses.delete(id);
|
||||||
removed += 1;
|
removed += 1;
|
||||||
|
if (info.type === 'sdk') removedSdk += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (removed > 0) {
|
if (removed > 0) {
|
||||||
this.persist();
|
this.persist();
|
||||||
}
|
}
|
||||||
|
for (let i = 0; i < removedSdk; i += 1) notifySlotAvailable();
|
||||||
|
|
||||||
return removed;
|
return removed;
|
||||||
}
|
}
|
||||||
@@ -321,6 +326,9 @@ export class ProcessRegistry {
|
|||||||
this.runtimeProcesses.delete(record.id);
|
this.runtimeProcesses.delete(record.id);
|
||||||
}
|
}
|
||||||
this.persist();
|
this.persist();
|
||||||
|
for (const record of sessionRecords) {
|
||||||
|
if (record.type === 'sdk') notifySlotAvailable();
|
||||||
|
}
|
||||||
|
|
||||||
logger.info('SYSTEM', `Reaped ${sessionRecords.length} process(es) for session ${sessionId}`, {
|
logger.info('SYSTEM', `Reaped ${sessionRecords.length} process(es) for session ${sessionId}`, {
|
||||||
sessionId: sessionIdNum,
|
sessionId: sessionIdNum,
|
||||||
@@ -428,6 +436,7 @@ export async function ensureSdkProcessExit(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const TOTAL_PROCESS_HARD_CAP = 10;
|
const TOTAL_PROCESS_HARD_CAP = 10;
|
||||||
|
const SLOT_RECHECK_INTERVAL_MS = 5_000;
|
||||||
const slotWaiters: Array<() => void> = [];
|
const slotWaiters: Array<() => void> = [];
|
||||||
|
|
||||||
function getActiveSdkCount(): number {
|
function getActiveSdkCount(): number {
|
||||||
@@ -439,7 +448,8 @@ function notifySlotAvailable(): void {
|
|||||||
if (waiter) waiter();
|
if (waiter) waiter();
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_000): Promise<void> {
|
export async function waitForSlot(maxConcurrent: number): Promise<void> {
|
||||||
|
getProcessRegistry().pruneDeadEntries();
|
||||||
const activeCount = getActiveSdkCount();
|
const activeCount = getActiveSdkCount();
|
||||||
if (activeCount >= TOTAL_PROCESS_HARD_CAP) {
|
if (activeCount >= TOTAL_PROCESS_HARD_CAP) {
|
||||||
throw new Error(`Hard cap exceeded: ${activeCount} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
|
throw new Error(`Hard cap exceeded: ${activeCount} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
|
||||||
@@ -450,15 +460,17 @@ export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_
|
|||||||
logger.info('PROCESS', `Pool limit reached (${activeCount}/${maxConcurrent}), waiting for slot...`);
|
logger.info('PROCESS', `Pool limit reached (${activeCount}/${maxConcurrent}), waiting for slot...`);
|
||||||
|
|
||||||
return new Promise<void>((resolve, reject) => {
|
return new Promise<void>((resolve, reject) => {
|
||||||
const timeout = setTimeout(() => {
|
let recheckTimer: ReturnType<typeof setInterval> | null = null;
|
||||||
const idx = slotWaiters.indexOf(onSlot);
|
|
||||||
if (idx >= 0) slotWaiters.splice(idx, 1);
|
|
||||||
reject(new Error(`Timed out waiting for agent pool slot after ${timeoutMs}ms`));
|
|
||||||
}, timeoutMs);
|
|
||||||
|
|
||||||
const onSlot = () => {
|
const onSlot = () => {
|
||||||
clearTimeout(timeout);
|
const count = getActiveSdkCount();
|
||||||
if (getActiveSdkCount() < maxConcurrent) {
|
if (count >= TOTAL_PROCESS_HARD_CAP) {
|
||||||
|
if (recheckTimer) clearInterval(recheckTimer);
|
||||||
|
reject(new Error(`Hard cap exceeded: ${count} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count < maxConcurrent) {
|
||||||
|
if (recheckTimer) clearInterval(recheckTimer);
|
||||||
resolve();
|
resolve();
|
||||||
} else {
|
} else {
|
||||||
slotWaiters.push(onSlot);
|
slotWaiters.push(onSlot);
|
||||||
@@ -466,6 +478,14 @@ export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_
|
|||||||
};
|
};
|
||||||
|
|
||||||
slotWaiters.push(onSlot);
|
slotWaiters.push(onSlot);
|
||||||
|
recheckTimer = setInterval(() => {
|
||||||
|
const removed = getProcessRegistry().pruneDeadEntries();
|
||||||
|
if (removed > 0) {
|
||||||
|
logger.info('PROCESS', 'Pruned stale process registry entries while waiting for agent slot', { removed });
|
||||||
|
}
|
||||||
|
notifySlotAvailable();
|
||||||
|
}, SLOT_RECHECK_INTERVAL_MS);
|
||||||
|
recheckTimer.unref?.();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user