99060bac1a
* fix: detect PID reuse in worker start-guard to survive container restarts The 'Worker already running' guard checked PID liveness with kill(0), which false-positives when a persistent PID file outlives the PID namespace (docker stop / docker start, pm2 graceful reloads). The new worker comes up with the same low PID (e.g. 11) as the old one, kill(0) says 'alive', and the worker refuses to start against its own prior incarnation. Capture a process-start token alongside the PID and verify identity, not just liveness: - Linux: /proc/<pid>/stat field 22 (starttime, jiffies since boot) - macOS/POSIX: `ps -p <pid> -o lstart=` - Windows: unchanged (returns null, falls back to liveness) PID files written by older versions are token-less, so verifyPidFileOwnership falls back to the current liveness-only behavior for backwards compatibility. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * refactor: apply review feedback to PID identity helpers - Collapse ProcessManager re-export down to a single import/export statement. - Make verifyPidFileOwnership a type predicate (info is PidInfo) so callers don't need non-null assertions on the narrowed value. - Drop the `!` assertions at the worker-service GUARD 1 call site now that the predicate narrows. - Tighten the captureProcessStartToken platform doc comment to enumerate process.platform values explicitly. No behavior change — esbuild output is byte-identical (type-only edits). Addresses items 1-3 of the claude-review comment on PR #2082. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix: pin LC_ALL=C for `ps lstart=` in captureProcessStartToken Without a locale pin, `ps -o lstart=` emits month/weekday names in the system locale. A bind-mounted PID file written under one locale and read under another would hash to different tokens and the live worker would incorrectly appear stale — reintroducing the very bug this helper exists to prevent. Flagged by Greptile on PR #2082. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * refactor: address second-round review on PID identity helpers - verifyPidFileOwnership: log a DEBUG diagnostic when the PID is alive but the start-token mismatches. Without it, callers can't distinguish the "process dead" path from the "PID reused" path in production logs — the exact case this helper exists to catch. - writePidFile: drop the redundant `?? undefined` coercion. `null` and `undefined` are both falsy for the subsequent ternary, so the coercion was purely cosmetic noise that suggested an important distinction. - Add a unit test for the win32 fallback path in captureProcessStartToken (mocks process.platform) — previously uncovered in CI. Addresses items 1, 2, and 5 of the second claude-review on PR #2082. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
135 lines
4.8 KiB
TypeScript
135 lines
4.8 KiB
TypeScript
import { afterEach, describe, expect, it } from 'bun:test';
|
|
import { mkdirSync, rmSync, writeFileSync } from 'fs';
|
|
import { tmpdir } from 'os';
|
|
import path from 'path';
|
|
import { validateWorkerPidFile, type ValidateWorkerPidStatus } from '../../src/supervisor/index.js';
|
|
|
|
function makeTempDir(): string {
|
|
const dir = path.join(tmpdir(), `claude-mem-index-${Date.now()}-${Math.random().toString(36).slice(2)}`);
|
|
mkdirSync(dir, { recursive: true });
|
|
return dir;
|
|
}
|
|
|
|
const tempDirs: string[] = [];
|
|
|
|
describe('validateWorkerPidFile', () => {
|
|
afterEach(() => {
|
|
while (tempDirs.length > 0) {
|
|
const dir = tempDirs.pop();
|
|
if (dir) {
|
|
rmSync(dir, { recursive: true, force: true });
|
|
}
|
|
}
|
|
});
|
|
|
|
it('returns "missing" when PID file does not exist', () => {
|
|
const tempDir = makeTempDir();
|
|
tempDirs.push(tempDir);
|
|
const pidFilePath = path.join(tempDir, 'worker.pid');
|
|
|
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
|
expect(status).toBe('missing');
|
|
});
|
|
|
|
it('returns "invalid" when PID file contains bad JSON', () => {
|
|
const tempDir = makeTempDir();
|
|
tempDirs.push(tempDir);
|
|
const pidFilePath = path.join(tempDir, 'worker.pid');
|
|
writeFileSync(pidFilePath, 'not-json!!!');
|
|
|
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
|
expect(status).toBe('invalid');
|
|
});
|
|
|
|
it('returns "stale" when PID file references a dead process', () => {
|
|
const tempDir = makeTempDir();
|
|
tempDirs.push(tempDir);
|
|
const pidFilePath = path.join(tempDir, 'worker.pid');
|
|
writeFileSync(pidFilePath, JSON.stringify({
|
|
pid: 2147483647,
|
|
port: 37777,
|
|
startedAt: new Date().toISOString()
|
|
}));
|
|
|
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
|
expect(status).toBe('stale');
|
|
});
|
|
|
|
it('returns "alive" when PID file references the current process', () => {
|
|
const tempDir = makeTempDir();
|
|
tempDirs.push(tempDir);
|
|
const pidFilePath = path.join(tempDir, 'worker.pid');
|
|
writeFileSync(pidFilePath, JSON.stringify({
|
|
pid: process.pid,
|
|
port: 37777,
|
|
startedAt: new Date().toISOString()
|
|
}));
|
|
|
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
|
expect(status).toBe('alive');
|
|
});
|
|
|
|
// Regression: container restart (docker stop / docker start) reused low PIDs
|
|
// across boots. The PID file on a bind-mounted volume pointed at PID 11;
|
|
// the new worker also came up as PID 11. kill(0) returned "alive" and the
|
|
// worker refused to boot, thinking its own prior incarnation was still up.
|
|
// With the start-token identity check, a stored token that doesn't match
|
|
// the current PID's token should resolve as "stale" and the PID file should
|
|
// be cleared so the new worker can proceed.
|
|
const tokenSupported = process.platform === 'linux' || process.platform === 'darwin';
|
|
it.if(tokenSupported)('returns "stale" when startToken does not match the live PID (PID reused)', () => {
|
|
const tempDir = makeTempDir();
|
|
tempDirs.push(tempDir);
|
|
const pidFilePath = path.join(tempDir, 'worker.pid');
|
|
writeFileSync(pidFilePath, JSON.stringify({
|
|
pid: process.pid,
|
|
port: 37777,
|
|
startedAt: new Date().toISOString(),
|
|
startToken: 'token-from-a-different-incarnation'
|
|
}));
|
|
|
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
|
expect(status).toBe('stale');
|
|
});
|
|
});
|
|
|
|
describe('Supervisor assertCanSpawn behavior', () => {
|
|
it('assertCanSpawn throws when stopPromise is active (shutdown in progress)', () => {
|
|
const { getSupervisor } = require('../../src/supervisor/index.js');
|
|
const supervisor = getSupervisor();
|
|
|
|
// When not shutting down, assertCanSpawn should not throw
|
|
expect(() => supervisor.assertCanSpawn('test')).not.toThrow();
|
|
});
|
|
|
|
it('registerProcess and unregisterProcess delegate to the registry', () => {
|
|
const { getSupervisor } = require('../../src/supervisor/index.js');
|
|
const supervisor = getSupervisor();
|
|
const registry = supervisor.getRegistry();
|
|
|
|
const testId = `test-${Date.now()}`;
|
|
supervisor.registerProcess(testId, {
|
|
pid: process.pid,
|
|
type: 'test',
|
|
startedAt: new Date().toISOString()
|
|
});
|
|
|
|
const found = registry.getAll().find((r: { id: string }) => r.id === testId);
|
|
expect(found).toBeDefined();
|
|
expect(found?.type).toBe('test');
|
|
|
|
supervisor.unregisterProcess(testId);
|
|
const afterUnregister = registry.getAll().find((r: { id: string }) => r.id === testId);
|
|
expect(afterUnregister).toBeUndefined();
|
|
});
|
|
});
|
|
|
|
describe('Supervisor start idempotency', () => {
|
|
it('getSupervisor returns the same instance', () => {
|
|
const { getSupervisor } = require('../../src/supervisor/index.js');
|
|
const s1 = getSupervisor();
|
|
const s2 = getSupervisor();
|
|
expect(s1).toBe(s2);
|
|
});
|
|
});
|