fix: prevent worker daemon from being killed by its own hooks (#1490)

Three independent fixes for worker daemon instability:

1. Remove version mismatch auto-restart from ensureWorkerStarted() (#1435).
   The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked,
   causing BUILT_IN_VERSION to fall back to "development". This creates a
   100% reproducible mismatch on every hook call, killing a healthy worker
   and often failing to restart. The same pattern recurred across #566,
   #665, #667, #669, #689, #1124, and #1145 (8+ releases).

2. Add process.ppid and PID-file PID to aggressiveStartupCleanup()
   exclusions (#1426). Without this, a newly spawned daemon SIGKILLs
   the hook process that spawned it and any already-running worker
   the PID file points to.

3. Increase POST_SPAWN_WAIT from 5s to 15s (#1423). The 5s timeout was
   sized for Linux (<1s startup), but macOS ARM64 cold starts take 6-8s
   with Chroma enabled, which exceeds the old wait.
This commit is contained in:
Ryan Malia
2026-03-25 14:11:32 -07:00
parent d06882126f
commit 88b47f9e9c
4 changed files with 191 additions and 210 deletions
File diff suppressed because one or more lines are too long
+14 -2
View File
@@ -453,6 +453,18 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const pidsToKill: number[] = []; const pidsToKill: number[] = [];
const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS]; const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS];
// Protect parent process (the hook that spawned us) and the PID-file-registered
// worker from being killed. Without this, a new daemon kills its own parent hook
// process (#1426) and any already-running worker the PID file points to.
const protectedPids = new Set<number>([currentPid]);
if (process.ppid && process.ppid > 0) {
protectedPids.add(process.ppid);
}
const pidFileInfo = readPidFile();
if (pidFileInfo?.pid && pidFileInfo.pid > 0) {
protectedPids.add(pidFileInfo.pid);
}
try { try {
if (isWindows) { if (isWindows) {
// Use WQL -Filter for server-side filtering (no $_ pipeline syntax). // Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
@@ -475,7 +487,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
for (const proc of processList) { for (const proc of processList) {
const pid = proc.ProcessId; const pid = proc.ProcessId;
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue; if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const commandLine = proc.CommandLine || ''; const commandLine = proc.CommandLine || '';
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p)); const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p));
@@ -518,7 +530,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const etime = match[2]; const etime = match[2];
const command = match[3]; const command = match[3];
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue; if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p)); const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p));
+10 -39
View File
@@ -80,7 +80,6 @@ import {
cleanStalePidFile, cleanStalePidFile,
isProcessAlive, isProcessAlive,
spawnDaemon, spawnDaemon,
isPidFileRecent,
touchPidFile touchPidFile
} from './infrastructure/ProcessManager.js'; } from './infrastructure/ProcessManager.js';
import { import {
@@ -88,8 +87,7 @@ import {
waitForHealth, waitForHealth,
waitForReadiness, waitForReadiness,
waitForPortFree, waitForPortFree,
httpShutdown, httpShutdown
checkVersionMatch
} from './infrastructure/HealthMonitor.js'; } from './infrastructure/HealthMonitor.js';
import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js'; import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js';
@@ -978,44 +976,18 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
return false; return false;
} }
// Check if worker is already running and healthy // Check if worker is already running and healthy.
// NOTE: Version mismatch auto-restart intentionally removed (#1435).
// The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked, causing
// BUILT_IN_VERSION to fall back to "development". This creates a 100% reproducible
// mismatch on every hook call, killing a healthy worker and often failing to restart
// (cold start exceeds POST_SPAWN_WAIT). A working-but-old worker is strictly better
// than a dead worker. Users must manually restart after genuine plugin updates.
// See also: #566, #665, #667, #669, #689, #1124, #1145 (same pattern across 8+ releases).
if (await waitForHealth(port, 1000)) { if (await waitForHealth(port, 1000)) {
const versionCheck = await checkVersionMatch(port);
if (!versionCheck.matches) {
// Guard: If PID file was written recently, another session is likely already
// restarting the worker. Poll health instead of starting a concurrent restart.
// This prevents the "100 sessions all restart simultaneously" storm (#1145).
const RESTART_COORDINATION_THRESHOLD_MS = 15000;
if (isPidFileRecent(RESTART_COORDINATION_THRESHOLD_MS)) {
logger.info('SYSTEM', 'Version mismatch detected but PID file is recent — another restart likely in progress, polling health', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
const healthy = await waitForHealth(port, RESTART_COORDINATION_THRESHOLD_MS);
if (healthy) {
logger.info('SYSTEM', 'Worker became healthy after waiting for concurrent restart');
return true;
}
logger.warn('SYSTEM', 'Worker did not become healthy after waiting — proceeding with own restart');
}
logger.info('SYSTEM', 'Worker version mismatch detected - auto-restarting', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
await httpShutdown(port);
const freed = await waitForPortFree(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
if (!freed) {
logger.error('SYSTEM', 'Port did not free up after shutdown for version mismatch restart', { port });
return false;
}
removePidFile();
} else {
logger.info('SYSTEM', 'Worker already running and healthy'); logger.info('SYSTEM', 'Worker already running and healthy');
return true; return true;
} }
}
// Check if port is in use by something else // Check if port is in use by something else
const portInUse = await isPortInUse(port); const portInUse = await isPortInUse(port);
@@ -1063,8 +1035,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
} }
clearWorkerSpawnAttempted(); clearWorkerSpawnAttempted();
// Touch PID file to signal other sessions that a restart just completed. // Touch PID file to signal other sessions that a spawn just completed.
// Other sessions checking isPidFileRecent() will see this and skip their own restart.
touchPidFile(); touchPidFile();
logger.info('SYSTEM', 'Worker started successfully'); logger.info('SYSTEM', 'Worker started successfully');
return true; return true;
+1 -1
View File
@@ -1,7 +1,7 @@
export const HOOK_TIMEOUTS = { export const HOOK_TIMEOUTS = {
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems) DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms) HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux) POST_SPAWN_WAIT: 15000, // Wait for daemon to start after spawn (starts in <1s on Linux, 6-8s on macOS with Chroma)
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s) READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
WORKER_STARTUP_WAIT: 1000, WORKER_STARTUP_WAIT: 1000,