fix: prevent worker daemon from being killed by its own hooks (#1490)
Three independent fixes for worker daemon instability: 1. Remove version mismatch auto-restart from ensureWorkerStarted() (#1435). The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked, causing BUILT_IN_VERSION to fall back to "development". This creates a 100% reproducible mismatch on every hook call, killing a healthy worker and often failing to restart. Same pattern across #566, #665, #667, #669, #689, #1124, #1145 (8+ releases). 2. Add process.ppid and PID-file PID to aggressiveStartupCleanup() exclusions (#1426). Without this, a newly spawned daemon SIGKILLs the hook process that spawned it and any already-running worker the PID file points to. 3. Increase POST_SPAWN_WAIT from 5s to 15s (#1423). The 5s timeout was sized for Linux (<1s startup) but macOS ARM64 cold starts take 6-8s with Chroma enabled.
This commit is contained in:
+164
-166
File diff suppressed because one or more lines are too long
@@ -453,6 +453,18 @@ export async function aggressiveStartupCleanup(): Promise<void> {
|
||||
const pidsToKill: number[] = [];
|
||||
const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS];
|
||||
|
||||
// Protect parent process (the hook that spawned us) and the PID-file-registered
|
||||
// worker from being killed. Without this, a new daemon kills its own parent hook
|
||||
// process (#1426) and any already-running worker the PID file points to.
|
||||
const protectedPids = new Set<number>([currentPid]);
|
||||
if (process.ppid && process.ppid > 0) {
|
||||
protectedPids.add(process.ppid);
|
||||
}
|
||||
const pidFileInfo = readPidFile();
|
||||
if (pidFileInfo?.pid && pidFileInfo.pid > 0) {
|
||||
protectedPids.add(pidFileInfo.pid);
|
||||
}
|
||||
|
||||
try {
|
||||
if (isWindows) {
|
||||
// Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
|
||||
@@ -475,7 +487,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
|
||||
|
||||
for (const proc of processList) {
|
||||
const pid = proc.ProcessId;
|
||||
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
|
||||
if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
|
||||
|
||||
const commandLine = proc.CommandLine || '';
|
||||
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p));
|
||||
@@ -518,7 +530,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
|
||||
const etime = match[2];
|
||||
const command = match[3];
|
||||
|
||||
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
|
||||
if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
|
||||
|
||||
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p));
|
||||
|
||||
|
||||
@@ -80,7 +80,6 @@ import {
|
||||
cleanStalePidFile,
|
||||
isProcessAlive,
|
||||
spawnDaemon,
|
||||
isPidFileRecent,
|
||||
touchPidFile
|
||||
} from './infrastructure/ProcessManager.js';
|
||||
import {
|
||||
@@ -88,8 +87,7 @@ import {
|
||||
waitForHealth,
|
||||
waitForReadiness,
|
||||
waitForPortFree,
|
||||
httpShutdown,
|
||||
checkVersionMatch
|
||||
httpShutdown
|
||||
} from './infrastructure/HealthMonitor.js';
|
||||
import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js';
|
||||
|
||||
@@ -978,43 +976,17 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if worker is already running and healthy
|
||||
// Check if worker is already running and healthy.
|
||||
// NOTE: Version mismatch auto-restart intentionally removed (#1435).
|
||||
// The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked, causing
|
||||
// BUILT_IN_VERSION to fall back to "development". This creates a 100% reproducible
|
||||
// mismatch on every hook call, killing a healthy worker and often failing to restart
|
||||
// (cold start exceeds POST_SPAWN_WAIT). A working-but-old worker is strictly better
|
||||
// than a dead worker. Users must manually restart after genuine plugin updates.
|
||||
// See also: #566, #665, #667, #669, #689, #1124, #1145 (same pattern across 8+ releases).
|
||||
if (await waitForHealth(port, 1000)) {
|
||||
const versionCheck = await checkVersionMatch(port);
|
||||
if (!versionCheck.matches) {
|
||||
// Guard: If PID file was written recently, another session is likely already
|
||||
// restarting the worker. Poll health instead of starting a concurrent restart.
|
||||
// This prevents the "100 sessions all restart simultaneously" storm (#1145).
|
||||
const RESTART_COORDINATION_THRESHOLD_MS = 15000;
|
||||
if (isPidFileRecent(RESTART_COORDINATION_THRESHOLD_MS)) {
|
||||
logger.info('SYSTEM', 'Version mismatch detected but PID file is recent — another restart likely in progress, polling health', {
|
||||
pluginVersion: versionCheck.pluginVersion,
|
||||
workerVersion: versionCheck.workerVersion
|
||||
});
|
||||
const healthy = await waitForHealth(port, RESTART_COORDINATION_THRESHOLD_MS);
|
||||
if (healthy) {
|
||||
logger.info('SYSTEM', 'Worker became healthy after waiting for concurrent restart');
|
||||
return true;
|
||||
}
|
||||
logger.warn('SYSTEM', 'Worker did not become healthy after waiting — proceeding with own restart');
|
||||
}
|
||||
|
||||
logger.info('SYSTEM', 'Worker version mismatch detected - auto-restarting', {
|
||||
pluginVersion: versionCheck.pluginVersion,
|
||||
workerVersion: versionCheck.workerVersion
|
||||
});
|
||||
|
||||
await httpShutdown(port);
|
||||
const freed = await waitForPortFree(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
|
||||
if (!freed) {
|
||||
logger.error('SYSTEM', 'Port did not free up after shutdown for version mismatch restart', { port });
|
||||
return false;
|
||||
}
|
||||
removePidFile();
|
||||
} else {
|
||||
logger.info('SYSTEM', 'Worker already running and healthy');
|
||||
return true;
|
||||
}
|
||||
logger.info('SYSTEM', 'Worker already running and healthy');
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if port is in use by something else
|
||||
@@ -1063,8 +1035,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
|
||||
}
|
||||
|
||||
clearWorkerSpawnAttempted();
|
||||
// Touch PID file to signal other sessions that a restart just completed.
|
||||
// Other sessions checking isPidFileRecent() will see this and skip their own restart.
|
||||
// Touch PID file to signal other sessions that a spawn just completed.
|
||||
touchPidFile();
|
||||
logger.info('SYSTEM', 'Worker started successfully');
|
||||
return true;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
export const HOOK_TIMEOUTS = {
|
||||
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
|
||||
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
|
||||
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux)
|
||||
POST_SPAWN_WAIT: 15000, // Wait for daemon to start after spawn (starts in <1s on Linux, 6-8s on macOS with Chroma)
|
||||
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
|
||||
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
|
||||
WORKER_STARTUP_WAIT: 1000,
|
||||
|
||||
Reference in New Issue
Block a user