fix: remove agent pool timeout data loss
This commit is contained in:
@@ -174,9 +174,11 @@ export class ProcessRegistry {
|
||||
|
||||
unregister(id: string): void {
|
||||
this.initialize();
|
||||
const existing = this.entries.get(id);
|
||||
this.entries.delete(id);
|
||||
this.runtimeProcesses.delete(id);
|
||||
this.persist();
|
||||
if (existing?.type === 'sdk') notifySlotAvailable();
|
||||
}
|
||||
|
||||
clear(): void {
|
||||
@@ -213,16 +215,19 @@ export class ProcessRegistry {
|
||||
this.initialize();
|
||||
|
||||
let removed = 0;
|
||||
let removedSdk = 0;
|
||||
for (const [id, info] of this.entries) {
|
||||
if (isPidAlive(info.pid)) continue;
|
||||
this.entries.delete(id);
|
||||
this.runtimeProcesses.delete(id);
|
||||
removed += 1;
|
||||
if (info.type === 'sdk') removedSdk += 1;
|
||||
}
|
||||
|
||||
if (removed > 0) {
|
||||
this.persist();
|
||||
}
|
||||
for (let i = 0; i < removedSdk; i += 1) notifySlotAvailable();
|
||||
|
||||
return removed;
|
||||
}
|
||||
@@ -321,6 +326,9 @@ export class ProcessRegistry {
|
||||
this.runtimeProcesses.delete(record.id);
|
||||
}
|
||||
this.persist();
|
||||
for (const record of sessionRecords) {
|
||||
if (record.type === 'sdk') notifySlotAvailable();
|
||||
}
|
||||
|
||||
logger.info('SYSTEM', `Reaped ${sessionRecords.length} process(es) for session ${sessionId}`, {
|
||||
sessionId: sessionIdNum,
|
||||
@@ -428,6 +436,7 @@ export async function ensureSdkProcessExit(
|
||||
}
|
||||
|
||||
const TOTAL_PROCESS_HARD_CAP = 10;
|
||||
const SLOT_RECHECK_INTERVAL_MS = 5_000;
|
||||
const slotWaiters: Array<() => void> = [];
|
||||
|
||||
function getActiveSdkCount(): number {
|
||||
@@ -439,7 +448,8 @@ function notifySlotAvailable(): void {
|
||||
if (waiter) waiter();
|
||||
}
|
||||
|
||||
export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_000): Promise<void> {
|
||||
export async function waitForSlot(maxConcurrent: number): Promise<void> {
|
||||
getProcessRegistry().pruneDeadEntries();
|
||||
const activeCount = getActiveSdkCount();
|
||||
if (activeCount >= TOTAL_PROCESS_HARD_CAP) {
|
||||
throw new Error(`Hard cap exceeded: ${activeCount} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
|
||||
@@ -450,15 +460,17 @@ export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_
|
||||
logger.info('PROCESS', `Pool limit reached (${activeCount}/${maxConcurrent}), waiting for slot...`);
|
||||
|
||||
return new Promise<void>((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
const idx = slotWaiters.indexOf(onSlot);
|
||||
if (idx >= 0) slotWaiters.splice(idx, 1);
|
||||
reject(new Error(`Timed out waiting for agent pool slot after ${timeoutMs}ms`));
|
||||
}, timeoutMs);
|
||||
|
||||
let recheckTimer: ReturnType<typeof setInterval> | null = null;
|
||||
const onSlot = () => {
|
||||
clearTimeout(timeout);
|
||||
if (getActiveSdkCount() < maxConcurrent) {
|
||||
const count = getActiveSdkCount();
|
||||
if (count >= TOTAL_PROCESS_HARD_CAP) {
|
||||
if (recheckTimer) clearInterval(recheckTimer);
|
||||
reject(new Error(`Hard cap exceeded: ${count} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`));
|
||||
return;
|
||||
}
|
||||
|
||||
if (count < maxConcurrent) {
|
||||
if (recheckTimer) clearInterval(recheckTimer);
|
||||
resolve();
|
||||
} else {
|
||||
slotWaiters.push(onSlot);
|
||||
@@ -466,6 +478,14 @@ export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_
|
||||
};
|
||||
|
||||
slotWaiters.push(onSlot);
|
||||
recheckTimer = setInterval(() => {
|
||||
const removed = getProcessRegistry().pruneDeadEntries();
|
||||
if (removed > 0) {
|
||||
logger.info('PROCESS', 'Pruned stale process registry entries while waiting for agent slot', { removed });
|
||||
}
|
||||
notifySlotAvailable();
|
||||
}, SLOT_RECHECK_INTERVAL_MS);
|
||||
recheckTimer.unref?.();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user