fix: prevent zombie process accumulation via PID registry and signal propagation (Issue #737) (#806)
* Fix zombie process accumulation (Issue #737)

  Problem: Claude haiku subprocesses spawned by the SDK weren't terminating properly, causing zombie process accumulation (user reported 155 processes consuming 51GB RAM).

  Root causes:
  1. SDK's SpawnedProcess interface hides subprocess PIDs
  2. deleteSession() didn't verify subprocess exit
  3. abort() was fire-and-forget with no confirmation
  4. No mechanism to track or clean up orphaned processes

  Solution:
  - Add ProcessRegistry module to track spawned Claude subprocesses
  - Use SDK's spawnClaudeCodeProcess option to capture PIDs via custom spawn
  - Pass signal parameter to enable AbortController integration
  - Wait for subprocess exit in deleteSession() with 5s timeout
  - Escalate to SIGKILL if graceful exit fails
  - Add orphan reaper running every 5 minutes as safety net

  Files changed:
  - src/services/worker/ProcessRegistry.ts (new): PID registry and reaper
  - src/services/worker/SDKAgent.ts: Use custom spawn to capture PIDs
  - src/services/worker/SessionManager.ts: Verify subprocess exit on delete
  - src/services/worker-service.ts: Start/stop orphan reaper

  Fixes #737

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: address code review feedback

  - Replace busy-wait polling with event-based proc.once('exit')
  - Detect and warn about multiple processes per session (race condition)

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: bigphoot <bigphoot@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -69,6 +69,9 @@ import { SearchRoutes } from './worker/http/routes/SearchRoutes.js';
|
||||
import { SettingsRoutes } from './worker/http/routes/SettingsRoutes.js';
|
||||
import { LogsRoutes } from './worker/http/routes/LogsRoutes.js';
|
||||
|
||||
// Process management for zombie cleanup (Issue #737)
|
||||
import { startOrphanReaper, reapOrphanedProcesses } from './worker/ProcessRegistry.js';
|
||||
|
||||
/**
|
||||
* Build JSON status output for hook framework communication.
|
||||
* This is a pure function extracted for testability.
|
||||
@@ -121,6 +124,9 @@ export class WorkerService {
|
||||
private initializationComplete: Promise<void>;
|
||||
private resolveInitialization!: () => void;
|
||||
|
||||
// Orphan reaper cleanup function (Issue #737)
|
||||
private stopOrphanReaper: (() => void) | null = null;
|
||||
|
||||
constructor() {
|
||||
// Initialize the promise that will resolve when background initialization completes
|
||||
this.initializationComplete = new Promise((resolve) => {
|
||||
@@ -303,6 +309,16 @@ export class WorkerService {
|
||||
this.resolveInitialization();
|
||||
logger.info('SYSTEM', 'Background initialization complete');
|
||||
|
||||
// Start orphan reaper to clean up zombie processes (Issue #737)
|
||||
this.stopOrphanReaper = startOrphanReaper(() => {
|
||||
const activeIds = new Set<number>();
|
||||
for (const [id] of this.sessionManager['sessions']) {
|
||||
activeIds.add(id);
|
||||
}
|
||||
return activeIds;
|
||||
});
|
||||
logger.info('SYSTEM', 'Started orphan reaper (runs every 5 minutes)');
|
||||
|
||||
// Auto-recover orphaned queues (fire-and-forget with error logging)
|
||||
this.processPendingQueues(50).then(result => {
|
||||
if (result.sessionsStarted > 0) {
|
||||
@@ -404,6 +420,12 @@ export class WorkerService {
|
||||
* Shutdown the worker service
|
||||
*/
|
||||
async shutdown(): Promise<void> {
|
||||
// Stop orphan reaper before shutdown (Issue #737)
|
||||
if (this.stopOrphanReaper) {
|
||||
this.stopOrphanReaper();
|
||||
this.stopOrphanReaper = null;
|
||||
}
|
||||
|
||||
await performGracefulShutdown({
|
||||
server: this.server.getHttpServer(),
|
||||
sessionManager: this.sessionManager,
|
||||
|
||||
Reference in New Issue
Block a user