fix: prevent worker daemon from being killed by its own hooks (#1490)

Three independent fixes for worker daemon instability:

1. Remove version mismatch auto-restart from ensureWorkerStarted() (#1435).
   The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked,
   causing BUILT_IN_VERSION to fall back to "development". This creates a
   100% reproducible mismatch on every hook call, killing a healthy worker
   and often failing to restart. Same pattern across #566, #665, #667,
   #669, #689, #1124, #1145 (8+ releases).

2. Add process.ppid and PID-file PID to aggressiveStartupCleanup()
   exclusions (#1426). Without this, a newly spawned daemon SIGKILLs
   the hook process that spawned it and any already-running worker
   the PID file points to.

3. Increase POST_SPAWN_WAIT from 5s to 15s (#1423). The 5s timeout was
   sized for Linux (<1s startup) but macOS ARM64 cold starts take 6-8s
   with Chroma enabled.
This commit is contained in:
Ryan Malia
2026-03-25 14:11:32 -07:00
parent d06882126f
commit 88b47f9e9c
4 changed files with 191 additions and 210 deletions
File diff suppressed because one or more lines are too long
+14 -2
View File
@@ -453,6 +453,18 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const pidsToKill: number[] = [];
const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS];
// Protect parent process (the hook that spawned us) and the PID-file-registered
// worker from being killed. Without this, a new daemon kills its own parent hook
// process (#1426) and any already-running worker the PID file points to.
const protectedPids = new Set<number>([currentPid]);
if (process.ppid && process.ppid > 0) {
protectedPids.add(process.ppid);
}
const pidFileInfo = readPidFile();
if (pidFileInfo?.pid && pidFileInfo.pid > 0) {
protectedPids.add(pidFileInfo.pid);
}
try {
if (isWindows) {
// Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
@@ -475,7 +487,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
for (const proc of processList) {
const pid = proc.ProcessId;
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const commandLine = proc.CommandLine || '';
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p));
@@ -518,7 +530,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const etime = match[2];
const command = match[3];
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p));
+12 -41
View File
@@ -80,7 +80,6 @@ import {
cleanStalePidFile,
isProcessAlive,
spawnDaemon,
isPidFileRecent,
touchPidFile
} from './infrastructure/ProcessManager.js';
import {
@@ -88,8 +87,7 @@ import {
waitForHealth,
waitForReadiness,
waitForPortFree,
httpShutdown,
checkVersionMatch
httpShutdown
} from './infrastructure/HealthMonitor.js';
import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js';
@@ -978,43 +976,17 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
return false;
}
// Check if worker is already running and healthy
// Check if worker is already running and healthy.
// NOTE: Version mismatch auto-restart intentionally removed (#1435).
// The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked, causing
// BUILT_IN_VERSION to fall back to "development". This creates a 100% reproducible
// mismatch on every hook call, killing a healthy worker and often failing to restart
// (cold start exceeds POST_SPAWN_WAIT). A working-but-old worker is strictly better
// than a dead worker. Users must manually restart after genuine plugin updates.
// See also: #566, #665, #667, #669, #689, #1124, #1145 (same pattern across 8+ releases).
if (await waitForHealth(port, 1000)) {
const versionCheck = await checkVersionMatch(port);
if (!versionCheck.matches) {
// Guard: If PID file was written recently, another session is likely already
// restarting the worker. Poll health instead of starting a concurrent restart.
// This prevents the "100 sessions all restart simultaneously" storm (#1145).
const RESTART_COORDINATION_THRESHOLD_MS = 15000;
if (isPidFileRecent(RESTART_COORDINATION_THRESHOLD_MS)) {
logger.info('SYSTEM', 'Version mismatch detected but PID file is recent — another restart likely in progress, polling health', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
const healthy = await waitForHealth(port, RESTART_COORDINATION_THRESHOLD_MS);
if (healthy) {
logger.info('SYSTEM', 'Worker became healthy after waiting for concurrent restart');
return true;
}
logger.warn('SYSTEM', 'Worker did not become healthy after waiting — proceeding with own restart');
}
logger.info('SYSTEM', 'Worker version mismatch detected - auto-restarting', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
await httpShutdown(port);
const freed = await waitForPortFree(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
if (!freed) {
logger.error('SYSTEM', 'Port did not free up after shutdown for version mismatch restart', { port });
return false;
}
removePidFile();
} else {
logger.info('SYSTEM', 'Worker already running and healthy');
return true;
}
logger.info('SYSTEM', 'Worker already running and healthy');
return true;
}
// Check if port is in use by something else
@@ -1063,8 +1035,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
}
clearWorkerSpawnAttempted();
// Touch PID file to signal other sessions that a restart just completed.
// Other sessions checking isPidFileRecent() will see this and skip their own restart.
// Touch PID file to signal other sessions that a spawn just completed.
touchPidFile();
logger.info('SYSTEM', 'Worker started successfully');
return true;
+1 -1
View File
@@ -1,7 +1,7 @@
export const HOOK_TIMEOUTS = {
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux)
POST_SPAWN_WAIT: 15000, // Wait for daemon to start after spawn (starts in <1s on Linux, 6-8s on macOS with Chroma)
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
WORKER_STARTUP_WAIT: 1000,