d24f3a7019
- Update allProjects test expectation to match [parent, composite] (matches JSDoc + callers in ContextBuilder/context handlers). - Replace string-matched __DRY_RUN_ROLLBACK__ sentinel with dedicated DryRunRollback class to avoid swallowing unrelated errors. - Add 5000ms timeout to spawnSync git calls in WorktreeAdoption and ProcessManager so worker startup can't hang on a stuck git process. - Drop unreachable break after process.exit(0) in adopt case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1044 lines
38 KiB
TypeScript
1044 lines
38 KiB
TypeScript
/**
 * ProcessManager - PID files, signal handlers, and child process lifecycle management
 *
 * Extracted from the worker-service.ts monolith to provide centralized process management.
 * Handles:
 * - PID file management for daemon coordination
 * - Signal handler registration for graceful shutdown
 * - Child process enumeration and cleanup (especially for the Windows zombie port fix)
 */
|
|
|
|
import path from 'path';
|
|
import { homedir } from 'os';
|
|
import { existsSync, writeFileSync, readFileSync, unlinkSync, mkdirSync, rmSync, statSync, utimesSync, copyFileSync } from 'fs';
|
|
import { exec, execSync, spawn, spawnSync } from 'child_process';
|
|
import { promisify } from 'util';
|
|
import { logger } from '../../utils/logger.js';
|
|
import { HOOK_TIMEOUTS } from '../../shared/hook-constants.js';
|
|
import { sanitizeEnv } from '../../supervisor/env-sanitizer.js';
|
|
import { getSupervisor, validateWorkerPidFile, type ValidateWorkerPidStatus } from '../../supervisor/index.js';
|
|
|
|
// Promise-returning flavour of child_process.exec, so shell commands can be awaited.
const execAsync = promisify(exec);

// On-disk locations used for PID file management: the data directory lives
// under the user's home directory and the worker PID file sits inside it.
const DATA_DIR = path.join(homedir(), '.claude-mem');
const PID_FILE = path.join(DATA_DIR, 'worker.pid');

// Command-line fragments that identify claude-mem processes. Such processes
// can accumulate when they are not terminated properly.
const ORPHAN_PROCESS_PATTERNS = [
  'mcp-server.cjs',     // Main MCP server process
  'worker-service.cjs', // Background worker daemon
  'chroma-mcp',         // ChromaDB MCP subprocess
];

// Age threshold, in minutes. Only processes at least this old are killed, so
// that processes belonging to the current session are left alone.
const ORPHAN_MAX_AGE_MINUTES = 30;
|
|
|
|
/**
 * Injectable overrides for the worker runtime resolver. Every member is
 * optional; the resolver falls back to the real process/OS value for any
 * member that is left out.
 */
interface RuntimeResolverOptions {
  /** Platform to resolve for; the resolver defaults to `process.platform`. */
  platform?: NodeJS.Platform;
  /** Executable path of the current runtime; the resolver defaults to `process.execPath`. */
  execPath?: string;
  /** Environment variables to consult; the resolver defaults to `process.env`. */
  env?: NodeJS.ProcessEnv;
  /** Home directory used to build install-location candidates; defaults to `homedir()`. */
  homeDirectory?: string;
  /** Existence check applied to candidate paths; defaults to `existsSync`. */
  pathExists?: (candidatePath: string) => boolean;
  /** PATH lookup used as the last resort; returns `null` when nothing is found. */
  lookupInPath?: (binaryName: string, platform: NodeJS.Platform) => string | null;
}
|
|
|
|
/**
 * Tell whether a path or bare command name refers to the Bun executable.
 *
 * After trimming surrounding whitespace, the value matches when its final
 * path segment (after `/` or `\`, or the whole string) is `bun` or `bun.exe`,
 * compared case-insensitively.
 *
 * @param executablePath - Path or command name to inspect; may be absent.
 * @returns `true` only for a non-empty value whose last segment is Bun.
 */
function isBunExecutablePath(executablePath: string | undefined | null): boolean {
  if (executablePath == null || executablePath === '') {
    return false;
  }

  const bunBinaryAtEnd = /(^|[\\/])bun(\.exe)?$/i;
  const trimmed = executablePath.trim();
  return bunBinaryAtEnd.test(trimmed);
}
|
|
|
|
/**
 * Locate an executable on the system PATH using the platform's lookup tool
 * (`where` on Windows, `which` everywhere else).
 *
 * @param binaryName - Bare command name to look up (e.g. `bun`). Must start
 *   with a letter or digit and contain only letters, digits, `.`, `_`, `+`
 *   or `-`; anything else is rejected because the name is interpolated into
 *   a shell command line.
 * @param platform - Platform whose lookup tool should be used.
 * @returns The first non-empty line printed by the lookup tool, or `null`
 *   when the name is rejected, the tool fails or times out, or it prints
 *   nothing.
 */
function lookupBinaryInPath(binaryName: string, platform: NodeJS.Platform): string | null {
  // SECURITY: the name is spliced into a shell command, so refuse anything
  // that is not a plain command name (no separators, quotes, or leading `-`).
  if (!/^[A-Za-z0-9][A-Za-z0-9._+-]*$/.test(binaryName)) {
    return null;
  }

  const command = platform === 'win32' ? `where ${binaryName}` : `which ${binaryName}`;

  try {
    const output = execSync(command, {
      stdio: ['ignore', 'pipe', 'ignore'],
      encoding: 'utf-8',
      windowsHide: true,
      // Bound the lookup so a wedged `which`/`where` cannot block the caller
      // indefinitely; a timeout throws and is reported as "not found".
      timeout: 5000
    });

    const firstMatch = output
      .split(/\r?\n/)
      .map(line => line.trim())
      .find(line => line.length > 0);

    return firstMatch || null;
  } catch {
    // Non-zero exit (binary not on PATH), missing lookup tool, or timeout.
    return null;
  }
}
|
|
|
|
// Memoized runtime path for the no-options call site (the one spawnDaemon
// uses). Remembering a successful resolution means repeated spawn attempts
// (crash loops, health thrashing) do not hit `statSync` on the candidate
// paths again and again.
//
// IMPORTANT: only success is cached. A `null` result (Bun not found) is never
// stored, so a long-running MCP server can recover when the user installs Bun
// in another terminal between a failed lookup and a later retry. Caching
// `null` would permanently break the process until restart.
// Per PR #1645 round-10 review.
//
// `undefined` means "not yet resolved"; tests that pass options bypass the
// cache entirely.
let cachedWorkerRuntimePath: string | undefined;
|
|
|
|
/**
 * Forget the memoized worker runtime path so the next no-options resolution
 * runs from scratch.
 *
 * Exported for test isolation only; production code never needs to call this.
 */
export function resetWorkerRuntimePathCache(): void {
  // `undefined` is the "not yet resolved" state of the cache.
  cachedWorkerRuntimePath = undefined;
}
|
|
|
|
/**
 * Resolve the runtime executable used to spawn the worker daemon.
 *
 * worker-service.cjs imports `bun:sqlite`, so it MUST run under Bun on every
 * platform, not just Windows. When the caller already runs under Bun (e.g.
 * the worker self-spawning from a hook), process.execPath is reused to avoid
 * an extra PATH lookup. Otherwise (notably when the MCP server running under
 * Node spawns the worker for the first time) the Bun binary is located via
 * env vars, well-known install locations, and finally the system PATH.
 *
 * @param options - Optional overrides. Passing any key bypasses the
 *   memoization cache entirely (neither read nor written), which keeps tests
 *   that inject options deterministic.
 * @returns The resolved Bun executable (path or bare command name), or
 *   `null` when Bun could not be found. Only non-null results are cached.
 */
export function resolveWorkerRuntimePath(options: RuntimeResolverOptions = {}): string | null {
  const hasInjectedOptions = Object.keys(options).length > 0;

  // Injected options: always run the full resolution and leave the cache alone.
  if (hasInjectedOptions) {
    return resolveWorkerRuntimePathUncached(options);
  }

  // Fast path: a previous no-options call already succeeded.
  if (cachedWorkerRuntimePath !== undefined) {
    return cachedWorkerRuntimePath;
  }

  const resolved = resolveWorkerRuntimePathUncached(options);

  // Remember successes only; a `null` must stay retryable on the next call.
  if (resolved !== null) {
    cachedWorkerRuntimePath = resolved;
  }
  return resolved;
}
|
|
|
|
/**
 * Perform the actual Bun lookup, without touching the memoization cache.
 *
 * Resolution order:
 *  1. the current executable, when it already is Bun;
 *  2. the `BUN` and `BUN_PATH` environment variables;
 *  3. well-known install locations for the platform;
 *  4. the system PATH, via the lookup function.
 *
 * @param options - Overrides for platform, exec path, environment, home
 *   directory, the existence check and the PATH lookup.
 * @returns The first usable Bun executable, or `null` when none is found.
 */
function resolveWorkerRuntimePathUncached(options: RuntimeResolverOptions): string | null {
  const platform = options.platform ?? process.platform;
  const currentExecutable = options.execPath ?? process.execPath;

  // Already running under Bun: reuse that binary directly.
  if (isBunExecutablePath(currentExecutable)) {
    return currentExecutable;
  }

  const env = options.env ?? process.env;
  const homeDirectory = options.homeDirectory ?? homedir();
  const pathExists = options.pathExists ?? existsSync;
  const lookupInPath = options.lookupInPath ?? lookupBinaryInPath;

  // Explicit overrides from the environment are always tried first.
  const envOverrides: (string | undefined)[] = [env.BUN, env.BUN_PATH];

  let installLocations: (string | undefined)[];
  if (platform === 'win32') {
    installLocations = [
      path.join(homeDirectory, '.bun', 'bin', 'bun.exe'),
      path.join(homeDirectory, '.bun', 'bin', 'bun'),
      env.USERPROFILE ? path.join(env.USERPROFILE, '.bun', 'bin', 'bun.exe') : undefined,
      env.LOCALAPPDATA ? path.join(env.LOCALAPPDATA, 'bun', 'bun.exe') : undefined,
      env.LOCALAPPDATA ? path.join(env.LOCALAPPDATA, 'bun', 'bin', 'bun.exe') : undefined,
    ];
  } else {
    installLocations = [
      path.join(homeDirectory, '.bun', 'bin', 'bun'),
      '/usr/local/bin/bun',
      '/opt/homebrew/bin/bun',
      '/home/linuxbrew/.linuxbrew/bin/bun',
      '/usr/bin/bun',  // Debian/Ubuntu apt install path
      '/snap/bin/bun', // Ubuntu Snap install path
    ];
  }

  for (const rawCandidate of [...envOverrides, ...installLocations]) {
    const candidate = rawCandidate?.trim();
    if (!candidate) {
      continue;
    }

    // A Bun-looking path that really exists wins immediately.
    if (isBunExecutablePath(candidate) && pathExists(candidate)) {
      return candidate;
    }

    // Command-style env values (e.g. BUN=bun) also look like Bun, but the
    // existence check fails for a bare relative name, so they land here.
    // The bare name is returned unchanged so child_process.spawn() resolves
    // it via PATH.
    if (candidate.toLowerCase() === 'bun') {
      return candidate;
    }
  }

  // Nothing matched: fall back to a PATH lookup.
  return lookupInPath('bun', platform);
}
|
|
|
|
/**
 * Contents of the worker PID file, as serialized to and parsed from JSON.
 */
export interface PidInfo {
  /** Process ID recorded in the PID file. */
  pid: number;
  /** Port number recorded alongside the PID (presumably the worker's listen port; confirm with callers). */
  port: number;
  /** Start time as a string (presumably an ISO-8601 timestamp; confirm with the writer). */
  startedAt: string;
}
|
|
|
|
/**
 * Persist PID info as pretty-printed JSON at the standard PID file location,
 * creating the data directory first when it does not exist yet.
 *
 * @param info - PID info to serialize.
 */
export function writePidFile(info: PidInfo): void {
  mkdirSync(DATA_DIR, { recursive: true });

  const serialized = JSON.stringify(info, null, 2);
  writeFileSync(PID_FILE, serialized);
}
|
|
|
|
/**
 * Read PID info from the standard PID file location.
 *
 * The parsed JSON is validated before it is handed back, so callers never
 * receive a value that merely claims to be a `PidInfo`: the file must hold an
 * object whose `pid` is a positive integer, whose `port` is an integer and
 * whose `startedAt` is a string.
 *
 * @returns The PID info, or `null` when the file does not exist, cannot be
 *   read or parsed, or does not have the expected shape.
 */
export function readPidFile(): PidInfo | null {
  if (!existsSync(PID_FILE)) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(PID_FILE, 'utf-8'));
  } catch (error) {
    // Covers both unreadable files (e.g. removed after the existence check)
    // and invalid JSON.
    logger.warn('SYSTEM', 'Failed to parse PID file', { path: PID_FILE }, error as Error);
    return null;
  }

  // Valid JSON is not necessarily a valid PID record (`null`, `42`, `[]`,
  // or an object with missing/mistyped fields all parse fine).
  if (typeof parsed !== 'object' || parsed === null) {
    logger.warn('SYSTEM', 'PID file has unexpected contents, ignoring', { path: PID_FILE });
    return null;
  }

  const { pid, port, startedAt } = parsed as Record<string, unknown>;
  const hasExpectedShape =
    typeof pid === 'number' && Number.isInteger(pid) && pid > 0 &&
    typeof port === 'number' && Number.isInteger(port) &&
    typeof startedAt === 'string';

  if (!hasExpectedShape) {
    logger.warn('SYSTEM', 'PID file has unexpected contents, ignoring', { path: PID_FILE });
    return null;
  }

  // Return the parsed object itself so any additional fields are preserved.
  return parsed as PidInfo;
}
|
|
|
|
/**
 * Delete the PID file if it exists (called during shutdown).
 *
 * Failure to delete is logged as a warning and otherwise ignored.
 */
export function removePidFile(): void {
  if (existsSync(PID_FILE)) {
    try {
      unlinkSync(PID_FILE);
    } catch (error) {
      // [ANTI-PATTERN IGNORED]: Cleanup function - PID file removal failure is non-critical
      logger.warn('SYSTEM', 'Failed to remove PID file', { path: PID_FILE }, error as Error);
    }
  }
}
|
|
|
|
/**
 * Scale a timeout for worker-side socket operations: doubled (and rounded to
 * the nearest integer) on Windows, returned unchanged on every other platform.
 *
 * Note: two platform multiplier functions exist intentionally:
 * - getTimeout() in hook-constants.ts uses 1.5x for hook-side operations (fast path)
 * - getPlatformTimeout() here uses 2.0x for worker-side socket operations (slower path)
 *
 * @param baseMs - Base timeout in milliseconds.
 * @returns The platform-adjusted timeout in milliseconds.
 */
export function getPlatformTimeout(baseMs: number): number {
  if (process.platform !== 'win32') {
    return baseMs;
  }

  const windowsMultiplier = 2.0;
  return Math.round(baseMs * windowsMultiplier);
}
|
|
|
|
/**
 * List the PIDs of a process's direct children (Windows only).
 *
 * Used during cleanup to prevent zombie ports when the parent exits. On any
 * other platform this resolves to an empty array without running anything.
 *
 * @param parentPid - PID whose children should be enumerated.
 * @returns Child PIDs; an empty array when not on Windows, when `parentPid`
 *   is not a positive integer, or when enumeration fails.
 */
export async function getChildProcesses(parentPid: number): Promise<number[]> {
  if (process.platform !== 'win32') {
    return [];
  }

  // SECURITY: Validate PID is a positive integer to prevent command injection
  const parentPidIsValid = Number.isInteger(parentPid) && parentPid > 0;
  if (!parentPidIsValid) {
    logger.warn('SYSTEM', 'Invalid parent PID for child process enumeration', { parentPid });
    return [];
  }

  try {
    // Use WQL -Filter to avoid $_ pipeline syntax that breaks in Git Bash (#1062, #1024).
    // Get-CimInstance with server-side filtering is also more efficient than piping through Where-Object.
    const cmd = `powershell -NoProfile -NonInteractive -Command "Get-CimInstance Win32_Process -Filter 'ParentProcessId=${parentPid}' | Select-Object -ExpandProperty ProcessId"`;
    const { stdout } = await execAsync(cmd, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND, windowsHide: true });

    // Keep only lines that consist purely of digits and denote a positive PID.
    const childPids: number[] = [];
    for (const rawLine of stdout.split('\n')) {
      const text = rawLine.trim();
      if (!/^\d+$/.test(text)) continue;

      const childPid = parseInt(text, 10);
      if (childPid > 0) {
        childPids.push(childPid);
      }
    }
    return childPids;
  } catch (error) {
    // Shutdown cleanup - failure is non-critical, continue without child process cleanup
    logger.error('SYSTEM', 'Failed to enumerate child processes', { parentPid }, error as Error);
    return [];
  }
}
|
|
|
|
/**
 * Forcefully terminate a process by PID.
 *
 * - Windows: runs `taskkill /PID <pid> /T /F`, which kills the whole process tree.
 * - Unix: sends SIGKILL to the process.
 *
 * Invalid PIDs are logged and ignored. Any error raised while killing is
 * logged at debug level and swallowed.
 *
 * @param pid - PID of the process to kill; must be a positive integer.
 */
export async function forceKillProcess(pid: number): Promise<void> {
  // SECURITY: Validate PID is a positive integer to prevent command injection
  const pidIsValid = Number.isInteger(pid) && pid > 0;
  if (!pidIsValid) {
    logger.warn('SYSTEM', 'Invalid PID for force kill', { pid });
    return;
  }

  try {
    if (process.platform !== 'win32') {
      process.kill(pid, 'SIGKILL');
    } else {
      // /T kills entire process tree, /F forces termination
      await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND, windowsHide: true });
    }
    logger.info('SYSTEM', 'Killed process', { pid });
  } catch (error) {
    // [ANTI-PATTERN IGNORED]: Shutdown cleanup - process already exited, continue
    logger.debug('SYSTEM', 'Process already exited during force kill', { pid }, error as Error);
  }
}
|
|
|
|
/**
 * Wait until every given process has exited, polling every 100ms.
 *
 * Liveness is probed with signal 0, which performs the existence and
 * permission checks without delivering a signal. A probe that fails with
 * `EPERM` means the process exists but may not be signalled by us, so it is
 * counted as still alive; any other failure is treated as "exited".
 *
 * @param pids - PIDs to wait for. Entries that are not positive integers are
 *   ignored: `0` and negative values address process groups rather than a
 *   single process, so probing them says nothing about an individual process.
 * @param timeoutMs - Maximum time to wait, in milliseconds. When it elapses a
 *   warning is logged and the function returns normally.
 */
export async function waitForProcessesExit(pids: number[], timeoutMs: number): Promise<void> {
  const POLL_INTERVAL_MS = 100;
  const start = Date.now();

  const isStillRunning = (pid: number): boolean => {
    try {
      process.kill(pid, 0);
      return true;
    } catch (error) {
      // [ANTI-PATTERN IGNORED]: Tight loop checking 100s of PIDs every 100ms during cleanup.
      // EPERM = the process exists but we lack permission to signal it.
      return (error as NodeJS.ErrnoException)?.code === 'EPERM';
    }
  };

  // Only real, single-process PIDs can be waited on.
  let stillAlive = pids.filter(pid => Number.isInteger(pid) && pid > 0);

  while (Date.now() - start < timeoutMs) {
    // Narrow the working set each round: a PID seen dead once is not probed
    // again, which also avoids being fooled if the OS reuses that PID.
    stillAlive = stillAlive.filter(isStillRunning);

    if (stillAlive.length === 0) {
      logger.info('SYSTEM', 'All child processes exited');
      return;
    }

    logger.debug('SYSTEM', 'Waiting for processes to exit', { stillAlive });
    await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));
  }

  logger.warn('SYSTEM', 'Timeout waiting for child processes to exit');
}
|
|
|
|
/**
 * Convert a `ps` elapsed-time string (`[[DD-]HH:]MM:SS`) into whole minutes.
 *
 * The seconds field must be present for the input to be recognised, but it
 * does not contribute to the result.
 *
 * @param etime - Elapsed time as printed by `ps -o etime`; surrounding
 *   whitespace is ignored.
 * @returns Age in minutes, or -1 when the input is empty or not in one of
 *   the supported formats.
 */
export function parseElapsedTime(etime: string): number {
  const text = etime ? etime.trim() : '';
  if (text === '') {
    return -1;
  }

  // One pattern for all three layouts: days may only appear together with
  // hours, and hours are optional in front of the mandatory MM:SS tail.
  const fields = /^(?:(?:(\d+)-)?(\d+):)?(\d+):\d+$/.exec(text);
  if (fields === null) {
    return -1;
  }

  // Groups that did not participate in the match are undefined, so they
  // fall back to zero.
  const [, days = '0', hours = '0', minutes] = fields;
  return parseInt(days, 10) * 24 * 60 +
    parseInt(hours, 10) * 60 +
    parseInt(minutes, 10);
}
|
|
|
|
/**
 * Clean up orphaned claude-mem processes from previous worker sessions.
 *
 * Targets mcp-server.cjs, worker-service.cjs, and chroma-mcp processes
 * that survived a previous daemon crash. Only kills processes that are at
 * least ORPHAN_MAX_AGE_MINUTES old to avoid killing the current session,
 * and never kills the current process.
 *
 * The periodic ProcessRegistry reaper handles in-session orphans;
 * this function handles cross-session orphans at startup.
 *
 * Enumeration is bounded by a timeout on every platform. Any enumeration
 * failure (including a timeout) is logged and ends the cleanup without
 * killing anything, because orphan cleanup is non-critical.
 */
export async function cleanupOrphanedProcesses(): Promise<void> {
  const isWindows = process.platform === 'win32';
  const currentPid = process.pid;
  const pidsToKill: number[] = [];

  try {
    if (isWindows) {
      // Windows: Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
      // Avoids Git Bash $_ interpretation (#1062) and PowerShell syntax errors (#1024).
      const wqlPatternConditions = ORPHAN_PROCESS_PATTERNS
        .map(p => `CommandLine LIKE '%${p}%'`)
        .join(' OR ');

      const cmd = `powershell -NoProfile -NonInteractive -Command "Get-CimInstance Win32_Process -Filter '(${wqlPatternConditions}) AND ProcessId != ${currentPid}' | Select-Object ProcessId, CreationDate | ConvertTo-Json"`;
      const { stdout } = await execAsync(cmd, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND, windowsHide: true });

      if (!stdout.trim() || stdout.trim() === 'null') {
        logger.debug('SYSTEM', 'No orphaned claude-mem processes found (Windows)');
        return;
      }

      // ConvertTo-Json prints a bare object for a single result and an array otherwise.
      const processes: unknown = JSON.parse(stdout);
      const processList = (Array.isArray(processes) ? processes : [processes]) as
        Array<{ ProcessId?: unknown; CreationDate?: unknown } | null>;
      const now = Date.now();

      for (const proc of processList) {
        const pid = proc?.ProcessId;
        // SECURITY: Validate PID is positive integer and not current process
        if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;

        // Parse Windows WMI date format: /Date(1234567890123)/
        // A row whose date is not a string in that format is skipped rather
        // than allowed to throw and abort the whole enumeration.
        const creationDate = proc?.CreationDate;
        const creationMatch = typeof creationDate === 'string'
          ? creationDate.match(/\/Date\((\d+)\)\//)
          : null;
        if (!creationMatch) continue;

        const creationTime = parseInt(creationMatch[1], 10);
        const ageMinutes = (now - creationTime) / (1000 * 60);

        if (ageMinutes >= ORPHAN_MAX_AGE_MINUTES) {
          pidsToKill.push(pid);
          logger.debug('SYSTEM', 'Found orphaned process', { pid, ageMinutes: Math.round(ageMinutes) });
        }
      }
    } else {
      // Unix: Use ps with elapsed time for age-based filtering.
      // The timeout matches the budget given to the Windows enumeration, so
      // a wedged `ps` cannot stall startup; a timeout rejects and is handled
      // by the catch below.
      const patternRegex = ORPHAN_PROCESS_PATTERNS.join('|');
      const { stdout } = await execAsync(
        `ps -eo pid,etime,command | grep -E "${patternRegex}" | grep -v grep || true`,
        { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND }
      );

      if (!stdout.trim()) {
        logger.debug('SYSTEM', 'No orphaned claude-mem processes found (Unix)');
        return;
      }

      const lines = stdout.trim().split('\n');
      for (const line of lines) {
        // Parse: "  1234  01:23:45  /path/to/process"
        const match = line.trim().match(/^(\d+)\s+(\S+)\s+(.*)$/);
        if (!match) continue;

        const pid = parseInt(match[1], 10);
        const etime = match[2];

        // SECURITY: Validate PID is positive integer and not current process
        if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;

        // parseElapsedTime returns -1 for unparseable values, which never
        // reaches the threshold, so such rows are left alone.
        const ageMinutes = parseElapsedTime(etime);
        if (ageMinutes >= ORPHAN_MAX_AGE_MINUTES) {
          pidsToKill.push(pid);
          logger.debug('SYSTEM', 'Found orphaned process', { pid, ageMinutes, command: match[3].substring(0, 80) });
        }
      }
    }
  } catch (error) {
    // Orphan cleanup is non-critical - log and continue
    logger.error('SYSTEM', 'Failed to enumerate orphaned processes', {}, error as Error);
    return;
  }

  if (pidsToKill.length === 0) {
    return;
  }

  logger.info('SYSTEM', 'Cleaning up orphaned claude-mem processes', {
    platform: isWindows ? 'Windows' : 'Unix',
    count: pidsToKill.length,
    pids: pidsToKill,
    maxAgeMinutes: ORPHAN_MAX_AGE_MINUTES
  });

  // Kill all found processes
  if (isWindows) {
    for (const pid of pidsToKill) {
      // SECURITY: Double-check PID validation before using in taskkill command
      if (!Number.isInteger(pid) || pid <= 0) {
        logger.warn('SYSTEM', 'Skipping invalid PID', { pid });
        continue;
      }
      try {
        execSync(`taskkill /PID ${pid} /T /F`, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND, stdio: 'ignore', windowsHide: true });
      } catch (error) {
        // [ANTI-PATTERN IGNORED]: Cleanup loop - process may have exited, continue to next PID
        logger.debug('SYSTEM', 'Failed to kill process, may have already exited', { pid }, error as Error);
      }
    }
  } else {
    for (const pid of pidsToKill) {
      try {
        process.kill(pid, 'SIGKILL');
      } catch (error) {
        // [ANTI-PATTERN IGNORED]: Cleanup loop - process may have exited, continue to next PID
        logger.debug('SYSTEM', 'Process already exited', { pid }, error as Error);
      }
    }
  }

  logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pidsToKill.length });
}
|
|
|
|
// Killed immediately at startup, with no age gate: these are child processes
// that should not outlive their parent worker.
const AGGRESSIVE_CLEANUP_PATTERNS = [
  'worker-service.cjs',
  'chroma-mcp',
];

// Still subject to the age threshold, because a matching process may be
// legitimately running.
const AGE_GATED_CLEANUP_PATTERNS = [
  'mcp-server.cjs',
];
|
|
|
|
/**
 * Aggressive startup cleanup for orphaned claude-mem processes.
 *
 * Unlike cleanupOrphanedProcesses() which age-gates everything at 30 minutes,
 * this function kills worker-service.cjs and chroma-mcp processes immediately
 * (they should not outlive their parent worker). Only mcp-server.cjs keeps
 * the age threshold since it may be legitimately running.
 *
 * The current process and its parent are never killed. Enumeration is
 * bounded by a timeout on every platform; any enumeration failure (including
 * a timeout) is logged and ends the cleanup without killing anything.
 *
 * Called once at daemon startup.
 */
export async function aggressiveStartupCleanup(): Promise<void> {
  const isWindows = process.platform === 'win32';
  const currentPid = process.pid;
  const pidsToKill: number[] = [];
  const allPatterns = [...AGGRESSIVE_CLEANUP_PATTERNS, ...AGE_GATED_CLEANUP_PATTERNS];

  // Protect parent process (the hook that spawned us) from being killed.
  // Without this, a new daemon kills its own parent hook process (#1426).
  //
  // Note: readPidFile() is not used here because start() writes the new PID
  // before initializeBackground() calls this function, so readPidFile() would
  // just return process.pid (already protected). If a pre-existing worker needs
  // protection, ensureWorkerStarted() handles that by returning early when a
  // healthy worker is detected — we never reach this code in that case.
  const protectedPids = new Set<number>([currentPid]);
  if (process.ppid && process.ppid > 0) {
    protectedPids.add(process.ppid);
  }

  try {
    if (isWindows) {
      // Use WQL -Filter for server-side filtering (no $_ pipeline syntax).
      // Avoids Git Bash $_ interpretation (#1062) and PowerShell syntax errors (#1024).
      const wqlPatternConditions = allPatterns
        .map(p => `CommandLine LIKE '%${p}%'`)
        .join(' OR ');

      const cmd = `powershell -NoProfile -NonInteractive -Command "Get-CimInstance Win32_Process -Filter '(${wqlPatternConditions}) AND ProcessId != ${currentPid}' | Select-Object ProcessId, CommandLine, CreationDate | ConvertTo-Json"`;
      const { stdout } = await execAsync(cmd, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND, windowsHide: true });

      if (!stdout.trim() || stdout.trim() === 'null') {
        logger.debug('SYSTEM', 'No orphaned claude-mem processes found (Windows)');
        return;
      }

      // ConvertTo-Json prints a bare object for a single result and an array otherwise.
      const processes: unknown = JSON.parse(stdout);
      const processList = (Array.isArray(processes) ? processes : [processes]) as
        Array<{ ProcessId?: unknown; CommandLine?: unknown; CreationDate?: unknown } | null>;
      const now = Date.now();

      for (const proc of processList) {
        const pid = proc?.ProcessId;
        if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;

        // Non-string fields are treated as absent instead of being allowed to
        // throw and abort the whole enumeration.
        const commandLine = typeof proc?.CommandLine === 'string' ? proc.CommandLine : '';
        const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => commandLine.includes(p));

        if (isAggressive) {
          // Kill immediately — no age check
          pidsToKill.push(pid);
          logger.debug('SYSTEM', 'Found orphaned process (aggressive)', { pid, commandLine: commandLine.substring(0, 80) });
          continue;
        }

        // Age-gated: only kill if older than threshold.
        // Windows WMI date format: /Date(1234567890123)/
        const creationDate = proc?.CreationDate;
        const creationMatch = typeof creationDate === 'string'
          ? creationDate.match(/\/Date\((\d+)\)\//)
          : null;
        if (!creationMatch) continue;

        const creationTime = parseInt(creationMatch[1], 10);
        const ageMinutes = (now - creationTime) / (1000 * 60);
        if (ageMinutes >= ORPHAN_MAX_AGE_MINUTES) {
          pidsToKill.push(pid);
          logger.debug('SYSTEM', 'Found orphaned process (age-gated)', { pid, ageMinutes: Math.round(ageMinutes) });
        }
      }
    } else {
      // Unix: Use ps with elapsed time.
      // The timeout matches the budget given to the Windows enumeration, so
      // a wedged `ps` cannot stall daemon startup; a timeout rejects and is
      // handled by the catch below.
      const patternRegex = allPatterns.join('|');
      const { stdout } = await execAsync(
        `ps -eo pid,etime,command | grep -E "${patternRegex}" | grep -v grep || true`,
        { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND }
      );

      if (!stdout.trim()) {
        logger.debug('SYSTEM', 'No orphaned claude-mem processes found (Unix)');
        return;
      }

      const lines = stdout.trim().split('\n');
      for (const line of lines) {
        // Each row is "<pid> <etime> <command...>".
        const match = line.trim().match(/^(\d+)\s+(\S+)\s+(.*)$/);
        if (!match) continue;

        const pid = parseInt(match[1], 10);
        const etime = match[2];
        const command = match[3];

        if (!Number.isInteger(pid) || pid <= 0 || protectedPids.has(pid)) continue;

        const isAggressive = AGGRESSIVE_CLEANUP_PATTERNS.some(p => command.includes(p));

        if (isAggressive) {
          // Kill immediately — no age check
          pidsToKill.push(pid);
          logger.debug('SYSTEM', 'Found orphaned process (aggressive)', { pid, command: command.substring(0, 80) });
        } else {
          // Age-gated: only kill if older than threshold. An unparseable
          // etime yields -1, which never reaches the threshold.
          const ageMinutes = parseElapsedTime(etime);
          if (ageMinutes >= ORPHAN_MAX_AGE_MINUTES) {
            pidsToKill.push(pid);
            logger.debug('SYSTEM', 'Found orphaned process (age-gated)', { pid, ageMinutes, command: command.substring(0, 80) });
          }
        }
      }
    }
  } catch (error) {
    // Startup cleanup is best-effort: log and carry on without killing anything.
    logger.error('SYSTEM', 'Failed to enumerate orphaned processes during aggressive cleanup', {}, error as Error);
    return;
  }

  if (pidsToKill.length === 0) {
    return;
  }

  logger.info('SYSTEM', 'Aggressive startup cleanup: killing orphaned processes', {
    platform: isWindows ? 'Windows' : 'Unix',
    count: pidsToKill.length,
    pids: pidsToKill
  });

  if (isWindows) {
    for (const pid of pidsToKill) {
      // SECURITY: re-validate before interpolating the PID into a command line.
      if (!Number.isInteger(pid) || pid <= 0) continue;
      try {
        execSync(`taskkill /PID ${pid} /T /F`, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND, stdio: 'ignore', windowsHide: true });
      } catch (error) {
        logger.debug('SYSTEM', 'Failed to kill process, may have already exited', { pid }, error as Error);
      }
    }
  } else {
    for (const pid of pidsToKill) {
      try {
        process.kill(pid, 'SIGKILL');
      } catch (error) {
        logger.debug('SYSTEM', 'Process already exited', { pid }, error as Error);
      }
    }
  }

  logger.info('SYSTEM', 'Aggressive startup cleanup complete', { count: pidsToKill.length });
}
|
|
|
|
// Marker file whose presence records that the one-time chroma wipe already ran.
const CHROMA_MIGRATION_MARKER_FILENAME = '.chroma-cleaned-v10.3';

/**
 * One-time chroma data wipe for users upgrading from versions with duplicate
 * worker bugs that could corrupt chroma data. Since chroma is always
 * rebuildable from SQLite (via backfillAllProjects), this is safe.
 *
 * When the marker file is present, nothing happens. Otherwise the `chroma`
 * directory inside the data directory is removed (if it exists) and the
 * marker is written, so later calls are no-ops. Idempotent.
 *
 * @param dataDirectory - Override for DATA_DIR (used in tests)
 */
export function runOneTimeChromaMigration(dataDirectory?: string): void {
  const baseDir = dataDirectory ?? DATA_DIR;
  const markerPath = path.join(baseDir, CHROMA_MIGRATION_MARKER_FILENAME);

  const alreadyMigrated = existsSync(markerPath);
  if (alreadyMigrated) {
    logger.debug('SYSTEM', 'Chroma migration marker exists, skipping wipe');
    return;
  }

  const chromaDir = path.join(baseDir, 'chroma');
  logger.warn('SYSTEM', 'Running one-time chroma data wipe (upgrade from pre-v10.3)', { chromaDir });

  const chromaDataPresent = existsSync(chromaDir);
  if (chromaDataPresent) {
    rmSync(chromaDir, { recursive: true, force: true });
    logger.info('SYSTEM', 'Chroma data directory removed', { chromaDir });
  }

  // Record completion so the wipe never runs again.
  mkdirSync(baseDir, { recursive: true });
  const completedAt = new Date().toISOString();
  writeFileSync(markerPath, completedAt);
  logger.info('SYSTEM', 'Chroma migration marker written', { markerPath });
}
|
|
|
|
// Marker file whose presence records that the one-time cwd remap already ran.
const CWD_REMAP_MARKER_FILENAME = '.cwd-remap-applied-v1';

/**
 * Outcome of classifying a working directory for the cwd remap:
 * - `main`: a main checkout, with the project name to use;
 * - `worktree`: a linked worktree, with the composite project name to use;
 * - `skip`: the directory could not be classified and must be left alone.
 */
type CwdClassification =
  | { kind: 'main'; project: string }
  | { kind: 'worktree'; project: string }
  | { kind: 'skip' };
|
|
|
|
/**
 * Run `git -C <cwd> <args...>` synchronously and return its trimmed stdout.
 *
 * The call is capped at 5000ms so a stuck git process cannot hang the caller.
 *
 * @param cwd - Directory git should operate in.
 * @param args - Arguments passed to git after `-C <cwd>`.
 * @returns Trimmed stdout when git exits with status 0; `null` for any other
 *   outcome (non-zero exit, failure to start, or timeout).
 */
function gitQuery(cwd: string, args: string[]): string | null {
  const gitArgs = ['-C', cwd].concat(args);
  const { status, stdout } = spawnSync('git', gitArgs, {
    encoding: 'utf8',
    timeout: 5000
  });

  return status === 0 ? (stdout ?? '').trim() : null;
}
|
|
|
|
/**
 * Classify a recorded working directory as a main checkout, a linked
 * worktree, or something to skip, and derive the project name to use for it.
 *
 * - Missing directory, or any failing/empty git query: `skip`.
 * - Git dir equal to the common git dir: `main`, named after the basename of
 *   the repository top level.
 * - Otherwise: `worktree`, named `<parent>/<leaf>`, where `<parent>` is the
 *   basename of the repository owning the common git dir and `<leaf>` is the
 *   basename of this checkout's top level.
 *
 * @param cwd - Directory to classify.
 */
function classifyCwdForRemap(cwd: string): CwdClassification {
  const skip: CwdClassification = { kind: 'skip' };

  if (!existsSync(cwd)) return skip;

  const gitDir = gitQuery(cwd, ['rev-parse', '--absolute-git-dir']);
  if (!gitDir) return skip;

  const commonDir = gitQuery(cwd, ['rev-parse', '--path-format=absolute', '--git-common-dir']);
  if (!commonDir) return skip;

  const toplevel = gitQuery(cwd, ['rev-parse', '--show-toplevel']);
  if (!toplevel) return skip;

  const leaf = path.basename(toplevel);

  // Identical git dir and common dir: this checkout is not a linked worktree.
  if (gitDir === commonDir) {
    return { kind: 'main', project: leaf };
  }

  // Linked worktree: find the repository that owns the common git dir.
  // A common dir ending in "/.git" lives inside that repository's directory;
  // otherwise a trailing ".git" is stripped from the path itself.
  let parentRepoDir: string;
  if (commonDir.endsWith('/.git')) {
    parentRepoDir = path.dirname(commonDir);
  } else {
    parentRepoDir = commonDir.replace(/\.git$/, '');
  }

  const parent = path.basename(parentRepoDir);
  return { kind: 'worktree', project: `${parent}/${leaf}` };
}
|
|
|
|
/**
 * One-time remap of sdk_sessions.project (+ observations.project,
 * session_summaries.project) using the cwd captured in pending_messages.cwd
 * as the source of truth. Required because pre-worktree builds stored bare
 * project names that collide across parent/worktree checkouts.
 *
 * Backs up the DB before writes. Idempotent via marker file. Skips (and
 * writes the marker) if the DB or the pending_messages table doesn't exist
 * yet (fresh install).
 *
 * Failures are logged, not thrown; in that case the marker is not written,
 * so the remap is attempted again on the next startup.
 *
 * @param dataDirectory - Override for DATA_DIR (used in tests)
 */
export function runOneTimeCwdRemap(dataDirectory?: string): void {
  const effectiveDataDir = dataDirectory ?? DATA_DIR;
  const markerPath = path.join(effectiveDataDir, CWD_REMAP_MARKER_FILENAME);
  const dbPath = path.join(effectiveDataDir, 'claude-mem.db');

  if (existsSync(markerPath)) {
    logger.debug('SYSTEM', 'cwd-remap marker exists, skipping');
    return;
  }

  if (!existsSync(dbPath)) {
    // Fresh install: nothing to remap, but record that so we never probe again.
    mkdirSync(effectiveDataDir, { recursive: true });
    writeFileSync(markerPath, new Date().toISOString());
    logger.debug('SYSTEM', 'No DB present, cwd-remap marker written without work', { dbPath });
    return;
  }

  logger.warn('SYSTEM', 'Running one-time cwd-based project remap', { dbPath });

  let db: import('bun:sqlite').Database | null = null;
  try {
    const { Database } = require('bun:sqlite') as typeof import('bun:sqlite');

    // Read-only probe for the pending_messages table. The connection is
    // closed in a finally so it is released even when the query throws.
    const probe = new Database(dbPath, { readonly: true });
    let hasPending: { name: string } | undefined;
    try {
      hasPending = probe.prepare(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='pending_messages'"
      ).get() as { name: string } | undefined;
    } finally {
      probe.close();
    }

    if (!hasPending) {
      mkdirSync(effectiveDataDir, { recursive: true });
      writeFileSync(markerPath, new Date().toISOString());
      logger.info('SYSTEM', 'pending_messages table not present, cwd-remap skipped');
      return;
    }

    // NOTE(review): a new timestamped backup is taken on every attempt, so
    // repeated failures (marker never written) accumulate backup files.
    const backup = `${dbPath}.bak-cwd-remap-${Date.now()}`;
    copyFileSync(dbPath, backup);
    logger.info('SYSTEM', 'DB backed up before cwd-remap', { backup });

    db = new Database(dbPath);

    // Classify every distinct cwd once; classification shells out to git.
    const cwdRows = db.prepare(`
      SELECT cwd FROM pending_messages
      WHERE cwd IS NOT NULL AND cwd != ''
      GROUP BY cwd
    `).all() as Array<{ cwd: string }>;

    const byCwd = new Map<string, CwdClassification>();
    for (const { cwd } of cwdRows) byCwd.set(cwd, classifyCwdForRemap(cwd));

    // For each session, take the cwd of its earliest pending message that has one.
    const sessionRows = db.prepare(`
      SELECT s.id AS session_id, s.memory_session_id, s.project AS old_project, p.cwd
      FROM sdk_sessions s
      JOIN pending_messages p ON p.content_session_id = s.content_session_id
      WHERE p.cwd IS NOT NULL AND p.cwd != ''
      AND p.id = (
        SELECT MIN(p2.id) FROM pending_messages p2
        WHERE p2.content_session_id = s.content_session_id
        AND p2.cwd IS NOT NULL AND p2.cwd != ''
      )
    `).all() as Array<{ session_id: number; memory_session_id: string | null; old_project: string; cwd: string }>;

    type Target = { sessionId: number; memorySessionId: string | null; newProject: string };
    const targets: Target[] = [];
    for (const r of sessionRows) {
      const c = byCwd.get(r.cwd);
      if (!c || c.kind === 'skip') continue;
      // Already carries the right project name: nothing to do.
      if (r.old_project === c.project) continue;
      targets.push({ sessionId: r.session_id, memorySessionId: r.memory_session_id, newProject: c.project });
    }

    if (targets.length === 0) {
      logger.info('SYSTEM', 'cwd-remap: no sessions need updating');
    } else {
      const updSession = db.prepare('UPDATE sdk_sessions SET project = ? WHERE id = ?');
      const updObs = db.prepare('UPDATE observations SET project = ? WHERE memory_session_id = ?');
      const updSum = db.prepare('UPDATE session_summaries SET project = ? WHERE memory_session_id = ?');

      let sessionN = 0, obsN = 0, sumN = 0;
      // All updates are applied inside a single transaction.
      const tx = db.transaction(() => {
        for (const t of targets) {
          sessionN += updSession.run(t.newProject, t.sessionId).changes;
          // Observations and summaries are keyed by memory_session_id, so
          // sessions without one have nothing further to update.
          if (t.memorySessionId) {
            obsN += updObs.run(t.newProject, t.memorySessionId).changes;
            sumN += updSum.run(t.newProject, t.memorySessionId).changes;
          }
        }
      });
      tx();

      logger.info('SYSTEM', 'cwd-remap applied', { sessions: sessionN, observations: obsN, summaries: sumN, backup });
    }

    mkdirSync(effectiveDataDir, { recursive: true });
    writeFileSync(markerPath, new Date().toISOString());
    logger.info('SYSTEM', 'cwd-remap marker written', { markerPath });
  } catch (err) {
    logger.error('SYSTEM', 'cwd-remap failed, marker not written (will retry on next startup)', {}, err as Error);
  } finally {
    db?.close();
  }
}
|
|
|
|
/**
 * Spawn the worker daemon as a detached background process.
 *
 * On Windows, PowerShell `Start-Process -WindowStyle Hidden` is used to launch
 * a truly independent process without console popups. Unlike WMIC, PowerShell
 * inherits environment variables from the parent process.
 *
 * On Unix, the runtime is launched through `/usr/bin/setsid` when that binary
 * exists, otherwise through a plain detached spawn.
 *
 * PID file is written by the worker itself after listen() succeeds,
 * not by the spawner (race-free, works on all platforms).
 *
 * @param scriptPath - Path of the worker script handed to the runtime.
 * @param port - Port exported to the child as `CLAUDE_MEM_WORKER_PORT`.
 * @param extraEnv - Extra environment variables; these override both the
 *   inherited environment and `CLAUDE_MEM_WORKER_PORT`.
 * @returns The child PID on Unix, `0` on Windows when the spawn was dispatched
 *   (the real PID is unknown there), or `undefined` when the spawn failed.
 *   Callers MUST test `pid === undefined` to detect failure, never `!pid`.
 */
export function spawnDaemon(
  scriptPath: string,
  port: number,
  extraEnv: Record<string, string> = {}
): number | undefined {
  const isWindows = process.platform === 'win32';
  getSupervisor().assertCanSpawn('worker daemon');

  const env = sanitizeEnv({
    ...process.env,
    CLAUDE_MEM_WORKER_PORT: String(port),
    ...extraEnv
  });

  // worker-service.cjs imports `bun:sqlite`, so the spawned runtime MUST be
  // Bun on every platform — never the current process.execPath, which may be
  // Node when the caller is the MCP server. Resolve once before the OS branch
  // split so we don't pay for a duplicate PATH lookup if Bun isn't found at a
  // well-known path. See resolveWorkerRuntimePath() for the candidate list.
  const runtimePath = resolveWorkerRuntimePath();
  if (!runtimePath) {
    logger.error(
      'SYSTEM',
      'Bun runtime not found — install from https://bun.sh and ensure it is on PATH or set BUN env var. The worker daemon requires Bun because it uses bun:sqlite.'
    );
    return undefined;
  }

  if (isWindows) {
    // Use PowerShell Start-Process to spawn a hidden, independent process.
    // Unlike WMIC, PowerShell inherits environment variables from parent.
    // -WindowStyle Hidden prevents console popup.
    //
    // Single quotes are doubled so the paths survive PowerShell's
    // single-quoted string syntax.
    const psRuntimePath = runtimePath.replace(/'/g, "''");
    const psScriptPath = scriptPath.replace(/'/g, "''");

    // Start-Process joins -ArgumentList elements with spaces and does NOT
    // quote them, so a script path containing spaces would be split into
    // several arguments. Embedding literal double quotes keeps it as one.
    // -EncodedCommand avoids quoting issues in the outer shell invocation.
    const psScript = `Start-Process -FilePath '${psRuntimePath}' -ArgumentList @('"${psScriptPath}"','--daemon') -WindowStyle Hidden`;
    const encodedCommand = Buffer.from(psScript, 'utf16le').toString('base64');

    try {
      execSync(`powershell -NoProfile -EncodedCommand ${encodedCommand}`, {
        stdio: 'ignore',
        windowsHide: true,
        env
      });
      // Windows success sentinel: PowerShell `Start-Process` does not return
      // the spawned PID, and we don't want to pay for an extra `Get-Process`
      // round-trip just to discover it. Return 0 (a conventionally invalid
      // Unix PID) so callers can distinguish "spawn dispatched" from "spawn
      // failed".
      return 0;
    } catch (error) {
      // APPROVED OVERRIDE: Windows daemon spawn is best-effort; log and let callers fall back to health checks/retry flow.
      logger.error('SYSTEM', 'Failed to spawn worker daemon on Windows', { runtimePath }, error as Error);
      return undefined;
    }
  }

  // Unix: prefer setsid so the daemon gets a new session, fully detached from
  // the controlling terminal. This prevents SIGHUP from reaching the daemon
  // even if the in-process SIGHUP handler somehow fails (belt-and-suspenders).
  // Fall back to a standard detached spawn when setsid is not available
  // (macOS, systems without setsid).
  const setsidPath = '/usr/bin/setsid';
  const useSetsid = existsSync(setsidPath);
  const command = useSetsid ? setsidPath : runtimePath;
  const args = useSetsid
    ? [runtimePath, scriptPath, '--daemon']
    : [scriptPath, '--daemon'];

  const child = spawn(command, args, {
    detached: true,
    stdio: 'ignore',
    env
  });

  // Spawn failures (ENOENT, EACCES, ...) are delivered asynchronously as an
  // 'error' event. Without a listener the event is rethrown as an uncaught
  // exception and would crash the caller after this function has already
  // returned, so log it here instead.
  child.on('error', (error: Error) => {
    logger.error('SYSTEM', 'Failed to spawn worker daemon', { command, runtimePath }, error);
  });

  if (child.pid === undefined) {
    return undefined;
  }

  // Let the parent exit without waiting for the daemon.
  child.unref();

  return child.pid;
}
|
|
|
|
/**
 * Report whether a process with the given PID currently exists.
 *
 * The probe is `process.kill(pid, 0)`: signal 0 delivers nothing and only
 * checks that the target exists and is reachable.
 *
 * - PID 0 (the Windows sentinel for "spawned, PID unknown") counts as alive.
 * - Non-integer or negative PIDs are never alive.
 * - EPERM counts as alive: the process exists but belongs to a different
 *   user/session (common in multi-user setups).
 * - Any other probe failure (e.g. ESRCH, no such process) counts as dead.
 *
 * @param pid - Process ID to probe.
 * @returns true when the process is considered alive.
 */
export function isProcessAlive(pid: number): boolean {
  // Windows sentinel value — the process was spawned but its PID is unknown.
  if (pid === 0) {
    return true;
  }

  const isValidPid = Number.isInteger(pid) && pid >= 0;
  if (!isValidPid) {
    return false;
  }

  try {
    process.kill(pid, 0);
  } catch (probeError: unknown) {
    const { code } = probeError as NodeJS.ErrnoException;
    // EPERM: exists but owned by someone else. Anything else: treat as dead.
    return code === 'EPERM';
  }

  return true;
}
|
|
|
|
/**
 * Tell whether the PID file was modified less than `thresholdMs` ago.
 *
 * Used to coordinate restarts across concurrent sessions: a freshly written
 * PID file suggests another session just restarted the worker, so callers
 * should poll /api/health rather than start their own restart.
 *
 * @param thresholdMs - Maximum age in ms to consider "recent" (default: 15000)
 * @returns true if the PID file can be stat'ed and its mtime is within
 *   thresholdMs; false otherwise (including when the stat fails).
 */
export function isPidFileRecent(thresholdMs: number = 15000): boolean {
  let modifiedAtMs: number;
  try {
    modifiedAtMs = statSync(PID_FILE).mtimeMs;
  } catch {
    // Missing or unreadable PID file: nothing recent to report.
    return false;
  }

  const ageMs = Date.now() - modifiedAtMs;
  return ageMs < thresholdMs;
}
|
|
|
|
/**
 * Bump the PID file's access and modification times to "now", leaving its
 * contents untouched. Does nothing when the file does not exist.
 *
 * Used after a restart to signal other sessions that a restart just completed.
 */
export function touchPidFile(): void {
  try {
    if (existsSync(PID_FILE)) {
      const timestamp = new Date();
      utimesSync(PID_FILE, timestamp, timestamp);
    }
  } catch {
    // Best-effort — failure to touch doesn't affect correctness
  }
}
|
|
|
|
/**
 * Validate the worker PID file and report its status.
 *
 * Delegates to validateWorkerPidFile() with `logAlive: false`. Per the
 * original contract, this removes the PID file when the recorded process is
 * dead (stale) and is cheap: one filesystem read plus one signal-0 check.
 * NOTE(review): that behavior lives in validateWorkerPidFile — confirm there.
 *
 * Intended to run at the top of ensureWorkerStarted() to clean up after WSL2
 * hibernate, OOM kills, or other ungraceful worker deaths.
 *
 * @returns The status reported by validateWorkerPidFile().
 */
export function cleanStalePidFile(): ValidateWorkerPidStatus {
  const status: ValidateWorkerPidStatus = validateWorkerPidFile({ logAlive: false });
  return status;
}
|
|
|
|
/**
 * Build a signal handler that performs a one-time graceful shutdown.
 *
 * The returned function can be passed to process.on('SIGTERM') etc. The first
 * invocation sets the shared flag, awaits `shutdownFn`, and exits with code 0.
 * Invocations that arrive while the flag is already set only log a warning.
 *
 * @param shutdownFn - Async cleanup routine to run before exiting.
 * @param isShuttingDownRef - Mutable flag shared with the caller; set to true
 *   when shutdown begins so repeated signals are ignored.
 * @returns An async handler taking the name of the received signal.
 */
export function createSignalHandler(
  shutdownFn: () => Promise<void>,
  isShuttingDownRef: { value: boolean }
): (signal: string) => Promise<void> {
  const handleSignal = async (signal: string): Promise<void> => {
    const alreadyShuttingDown = isShuttingDownRef.value;
    if (alreadyShuttingDown) {
      logger.warn('SYSTEM', `Received ${signal} but shutdown already in progress`);
      return;
    }

    // Claim the shutdown before any await so later signals take the early exit.
    isShuttingDownRef.value = true;
    logger.info('SYSTEM', `Received ${signal}, shutting down...`);

    try {
      await shutdownFn();
      process.exit(0);
    } catch (shutdownError) {
      // Top-level signal handler - log any shutdown error and exit
      logger.error('SYSTEM', 'Error during shutdown', {}, shutdownError as Error);
      // Exit with 0 even on shutdown errors: Windows Terminal won't keep the
      // tab open on exit 0, which prevents tab accumulation.
      process.exit(0);
    }
  };

  return handleSignal;
}
|