Merge branch 'pr-1491' into integration/validation-batch

# Conflicts:
#	plugin/scripts/mcp-server.cjs
#	plugin/scripts/worker-service.cjs
#	src/shared/hook-constants.ts
This commit is contained in:
Alex Newman
2026-04-06 14:20:05 -07:00
6 changed files with 317 additions and 328 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+15 -2
View File
@@ -453,6 +453,19 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const pidsToKill: number[] = []; const pidsToKill: number[] = [];
const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS]; const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS];
// Protect parent process (the hook that spawned us) from being killed.
// Without this, a new daemon kills its own parent hook process (#1426).
//
// Note: readPidFile() is not used here because start() writes the new PID
// before initializeBackground() calls this function, so readPidFile() would
// just return process.pid (already protected). If a pre-existing worker needs
// protection, ensureWorkerStarted() handles that by returning early when a
// healthy worker is detected — we never reach this code in that case.
const protectedPids = new Set<number>([currentPid]);
if (process.ppid && process.ppid > 0) {
protectedPids.add(process.ppid);
}
try { try {
if (isWindows) { if (isWindows) {
// Use WQL -Filter for server-side filtering (no $_ pipeline syntax). // Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
@@ -475,7 +488,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
for (const proc of processList) { for (const proc of processList) {
const pid = proc.ProcessId; const pid = proc.ProcessId;
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue; if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const commandLine = proc.CommandLine || ''; const commandLine = proc.CommandLine || '';
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p)); const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p));
@@ -518,7 +531,7 @@ export async function aggressiveStartupCleanup(): Promise<void> {
const etime = match[2]; const etime = match[2];
const command = match[3]; const command = match[3];
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue; if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;
const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p)); const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p));
+19 -40
View File
@@ -80,7 +80,6 @@ import {
cleanStalePidFile, cleanStalePidFile,
isProcessAlive, isProcessAlive,
spawnDaemon, spawnDaemon,
isPidFileRecent,
touchPidFile touchPidFile
} from './infrastructure/ProcessManager.js'; } from './infrastructure/ProcessManager.js';
import { import {
@@ -88,8 +87,7 @@ import {
waitForHealth, waitForHealth,
waitForReadiness, waitForReadiness,
waitForPortFree, waitForPortFree,
httpShutdown, httpShutdown
checkVersionMatch
} from './infrastructure/HealthMonitor.js'; } from './infrastructure/HealthMonitor.js';
import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js'; import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js';
@@ -1011,43 +1009,25 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
return false; return false;
} }
// Check if worker is already running and healthy // Check if worker is already running and healthy.
// NOTE: Version mismatch auto-restart intentionally removed (#1435).
// The marketplace bundle ships with __DEFAULT_PACKAGE_VERSION__ unbaked, causing
// BUILT_IN_VERSION to fall back to "development". This creates a 100% reproducible
// mismatch on every hook call, killing a healthy worker and often failing to restart
// (cold start exceeds POST_SPAWN_WAIT). A working-but-old worker is strictly better
// than a dead worker. Users must manually restart after genuine plugin updates.
// See also: #566, #665, #667, #669, #689, #1124, #1145 (same pattern across 8+ releases).
if (await waitForHealth(port, 1000)) { if (await waitForHealth(port, 1000)) {
const versionCheck = await checkVersionMatch(port); // Health passed — worker is listening. Also wait for readiness in case
if (!versionCheck.matches) { // another hook just spawned it and background init is still running.
// Guard: If PID file was written recently, another session is likely already // This mirrors the fresh-spawn path (line ~1025) so concurrent hooks
// restarting the worker. Poll health instead of starting a concurrent restart. // don't race past a cold-starting worker's initialization guard.
// This prevents the "100 sessions all restart simultaneously" storm (#1145). const ready = await waitForReadiness(port, getPlatformTimeout(HOOK_TIMEOUTS.READINESS_WAIT));
const RESTART_COORDINATION_THRESHOLD_MS = 15000; if (!ready) {
if (isPidFileRecent(RESTART_COORDINATION_THRESHOLD_MS)) { logger.warn('SYSTEM', 'Worker is alive but readiness timed out — proceeding anyway');
logger.info('SYSTEM', 'Version mismatch detected but PID file is recent — another restart likely in progress, polling health', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
const healthy = await waitForHealth(port, RESTART_COORDINATION_THRESHOLD_MS);
if (healthy) {
logger.info('SYSTEM', 'Worker became healthy after waiting for concurrent restart');
return true;
}
logger.warn('SYSTEM', 'Worker did not become healthy after waiting — proceeding with own restart');
}
logger.info('SYSTEM', 'Worker version mismatch detected - auto-restarting', {
pluginVersion: versionCheck.pluginVersion,
workerVersion: versionCheck.workerVersion
});
await httpShutdown(port);
const freed = await waitForPortFree(port, getPlatformTimeout(HOOK_TIMEOUTS.PORT_IN_USE_WAIT));
if (!freed) {
logger.error('SYSTEM', 'Port did not free up after shutdown for version mismatch restart', { port });
return false;
}
removePidFile();
} else {
logger.info('SYSTEM', 'Worker already running and healthy');
return true;
} }
logger.info('SYSTEM', 'Worker already running and healthy');
return true;
} }
// Check if port is in use by something else // Check if port is in use by something else
@@ -1096,8 +1076,7 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
} }
clearWorkerSpawnAttempted(); clearWorkerSpawnAttempted();
// Touch PID file to signal other sessions that a restart just completed. // Touch PID file to signal other sessions that a spawn just completed.
// Other sessions checking isPidFileRecent() will see this and skip their own restart.
touchPidFile(); touchPidFile();
logger.info('SYSTEM', 'Worker started successfully'); logger.info('SYSTEM', 'Worker started successfully');
return true; return true;
+12 -14
View File
@@ -1,13 +1,13 @@
export const HOOK_TIMEOUTS = { export const HOOK_TIMEOUTS = {
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems) DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms) HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux) POST_SPAWN_WAIT: 15000, // Wait for daemon to start after spawn (starts in <1s on Linux, 6-8s on macOS with Chroma)
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s) READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
WORKER_STARTUP_WAIT: 1000, WORKER_STARTUP_WAIT: 1000,
PRE_RESTART_SETTLE_DELAY: 2000, // Give files time to sync before restart PRE_RESTART_SETTLE_DELAY: 2000, // Give files time to sync before restart
POWERSHELL_COMMAND: 10000, // PowerShell process enumeration (10s - typically completes in <1s) POWERSHELL_COMMAND: 10000, // PowerShell process enumeration (10s - typically completes in <1s)
WINDOWS_MULTIPLIER: 1.5, // Platform-specific adjustment for hook-side operations WINDOWS_MULTIPLIER: 1.5 // Platform-specific adjustment for hook-side operations
} as const; } as const;
/** /**
@@ -16,21 +16,19 @@ export const HOOK_TIMEOUTS = {
* Exit code behavior per Claude Code docs: * Exit code behavior per Claude Code docs:
* - 0: Success. For SessionStart/UserPromptSubmit, stdout added to context. * - 0: Success. For SessionStart/UserPromptSubmit, stdout added to context.
* - 2: Blocking error. For SessionStart, stderr shown to user only. * - 2: Blocking error. For SessionStart, stderr shown to user only.
* - Other non-zero: treated as hook error (stderr shown in verbose mode only). * - Other non-zero: stderr shown in verbose mode only.
*
* IMPORTANT: Only exit codes 0 and 2 are recognized by Claude Code.
* Any other non-zero exit code (including 1, 3, etc.) is treated as a
* hook failure and triggers the "hook error" UI message on every session start.
*/ */
export const HOOK_EXIT_CODES = { export const HOOK_EXIT_CODES = {
SUCCESS: 0, SUCCESS: 0,
FAILURE: 1, FAILURE: 1,
/** Blocking error - for SessionStart, shows stderr to user only */ /** Blocking error - for SessionStart, shows stderr to user only */
BLOCKING_ERROR: 2, BLOCKING_ERROR: 2,
/** Show stderr to user only, don't inject into context. Used by user-message handler (Cursor). */
USER_MESSAGE_ONLY: 3,
} as const; } as const;
export function getTimeout(baseTimeout: number): number { export function getTimeout(baseTimeout: number): number {
return process.platform === "win32" return process.platform === 'win32'
? Math.round(baseTimeout * HOOK_TIMEOUTS.WINDOWS_MULTIPLIER) ? Math.round(baseTimeout * HOOK_TIMEOUTS.WINDOWS_MULTIPLIER)
: baseTimeout; : baseTimeout;
} }
+2 -2
View File
@@ -32,8 +32,8 @@ describe('hook-constants', () => {
expect(HOOK_TIMEOUTS.HEALTH_CHECK).toBe(3000); expect(HOOK_TIMEOUTS.HEALTH_CHECK).toBe(3000);
}); });
it('should define POST_SPAWN_WAIT as 5s', () => { it('should define POST_SPAWN_WAIT as 15s', () => {
expect(HOOK_TIMEOUTS.POST_SPAWN_WAIT).toBe(5000); expect(HOOK_TIMEOUTS.POST_SPAWN_WAIT).toBe(15000);
}); });
it('should define PORT_IN_USE_WAIT as 3s', () => { it('should define PORT_IN_USE_WAIT as 3s', () => {