fix: detect PID reuse in worker start-guard (container restarts) (#2082)
* fix: detect PID reuse in worker start-guard to survive container restarts The 'Worker already running' guard checked PID liveness with kill(0), which false-positives when a persistent PID file outlives the PID namespace (docker stop / docker start, pm2 graceful reloads). The new worker comes up with the same low PID (e.g. 11) as the old one, kill(0) says 'alive', and the worker refuses to start against its own prior incarnation. Capture a process-start token alongside the PID and verify identity, not just liveness: - Linux: /proc/<pid>/stat field 22 (starttime, jiffies since boot) - macOS/POSIX: `ps -p <pid> -o lstart=` - Windows: unchanged (returns null, falls back to liveness) PID files written by older versions are token-less, so verifyPidFileOwnership falls back to the current liveness-only behavior for backwards compatibility. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * refactor: apply review feedback to PID identity helpers - Collapse ProcessManager re-export down to a single import/export statement. - Make verifyPidFileOwnership a type predicate (info is PidInfo) so callers don't need non-null assertions on the narrowed value. - Drop the `!` assertions at the worker-service GUARD 1 call site now that the predicate narrows. - Tighten the captureProcessStartToken platform doc comment to enumerate process.platform values explicitly. No behavior change — esbuild output is byte-identical (type-only edits). Addresses items 1-3 of the claude-review comment on PR #2082. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix: pin LC_ALL=C for `ps lstart=` in captureProcessStartToken Without a locale pin, `ps -o lstart=` emits month/weekday names in the system locale. A bind-mounted PID file written under one locale and read under another would hash to different tokens and the live worker would incorrectly appear stale — reintroducing the very bug this helper exists to prevent. Flagged by Greptile on PR #2082. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * refactor: address second-round review on PID identity helpers - verifyPidFileOwnership: log a DEBUG diagnostic when the PID is alive but the start-token mismatches. Without it, callers can't distinguish the "process dead" path from the "PID reused" path in production logs — the exact case this helper exists to catch. - writePidFile: drop the redundant `?? undefined` coercion. `null` and `undefined` are both falsy for the subsequent ternary, so the coercion was purely cosmetic noise that suggested an important distinction. - Add a unit test for the win32 fallback path in captureProcessStartToken (mocks process.platform) — previously uncovered in CI. Addresses items 1, 2, and 5 of the second claude-review on PR #2082. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,8 @@ import {
|
||||
spawnDaemon,
|
||||
resolveWorkerRuntimePath,
|
||||
runOneTimeChromaMigration,
|
||||
captureProcessStartToken,
|
||||
verifyPidFileOwnership,
|
||||
type PidInfo
|
||||
} from '../../src/services/infrastructure/index.js';
|
||||
|
||||
@@ -361,6 +363,121 @@ describe('ProcessManager', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('captureProcessStartToken', () => {
|
||||
const supported = process.platform === 'linux' || process.platform === 'darwin';
|
||||
|
||||
it.if(supported)('returns a non-empty token for the current process', () => {
|
||||
const token = captureProcessStartToken(process.pid);
|
||||
expect(typeof token).toBe('string');
|
||||
expect((token ?? '').length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it.if(supported)('returns a stable token across calls for the same PID', () => {
|
||||
const first = captureProcessStartToken(process.pid);
|
||||
const second = captureProcessStartToken(process.pid);
|
||||
expect(first).toBe(second);
|
||||
});
|
||||
|
||||
it('returns null for a non-existent PID', () => {
|
||||
expect(captureProcessStartToken(2147483647)).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null for invalid PIDs', () => {
|
||||
expect(captureProcessStartToken(0)).toBeNull();
|
||||
expect(captureProcessStartToken(-1)).toBeNull();
|
||||
expect(captureProcessStartToken(1.5)).toBeNull();
|
||||
expect(captureProcessStartToken(NaN)).toBeNull();
|
||||
});
|
||||
|
||||
it('returns null on win32 (liveness-only fallback path)', () => {
|
||||
// Simulate Windows to exercise the documented fallback. Real CI doesn't
|
||||
// run on win32, so without this mock the branch is uncovered.
|
||||
const originalPlatform = process.platform;
|
||||
Object.defineProperty(process, 'platform', { value: 'win32', configurable: true });
|
||||
try {
|
||||
expect(captureProcessStartToken(process.pid)).toBeNull();
|
||||
} finally {
|
||||
Object.defineProperty(process, 'platform', { value: originalPlatform, configurable: true });
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe('writePidFile (start-token capture)', () => {
|
||||
const supported = process.platform === 'linux' || process.platform === 'darwin';
|
||||
|
||||
it.if(supported)('auto-captures a startToken when writing for the current process', () => {
|
||||
writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() });
|
||||
const persisted = readPidFile();
|
||||
expect(persisted).not.toBeNull();
|
||||
expect(typeof persisted!.startToken).toBe('string');
|
||||
expect((persisted!.startToken ?? '').length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('preserves a caller-supplied startToken verbatim', () => {
|
||||
const provided = 'caller-supplied-token-xyz';
|
||||
writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString(), startToken: provided });
|
||||
const persisted = readPidFile();
|
||||
expect(persisted!.startToken).toBe(provided);
|
||||
});
|
||||
|
||||
it('omits startToken when the target PID has no readable token (dead PID)', () => {
|
||||
// pid is dead, so captureProcessStartToken() returns null and writePidFile
|
||||
// should not persist a startToken field.
|
||||
writePidFile({ pid: 2147483647, port: 37777, startedAt: new Date().toISOString() });
|
||||
const persisted = readPidFile();
|
||||
expect(persisted).not.toBeNull();
|
||||
expect(persisted!.startToken).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('verifyPidFileOwnership', () => {
|
||||
const supported = process.platform === 'linux' || process.platform === 'darwin';
|
||||
|
||||
it('returns false for null input', () => {
|
||||
expect(verifyPidFileOwnership(null)).toBe(false);
|
||||
});
|
||||
|
||||
it('returns false when the PID is not alive', () => {
|
||||
expect(verifyPidFileOwnership({
|
||||
pid: 2147483647,
|
||||
port: 37777,
|
||||
startedAt: new Date().toISOString(),
|
||||
startToken: 'anything'
|
||||
})).toBe(false);
|
||||
});
|
||||
|
||||
it('returns true when no startToken is stored (back-compat with older PID files)', () => {
|
||||
expect(verifyPidFileOwnership({
|
||||
pid: process.pid,
|
||||
port: 37777,
|
||||
startedAt: new Date().toISOString()
|
||||
// intentionally no startToken
|
||||
})).toBe(true);
|
||||
});
|
||||
|
||||
it.if(supported)('returns true when the stored token matches the current PID', () => {
|
||||
const token = captureProcessStartToken(process.pid);
|
||||
expect(token).not.toBeNull();
|
||||
expect(verifyPidFileOwnership({
|
||||
pid: process.pid,
|
||||
port: 37777,
|
||||
startedAt: new Date().toISOString(),
|
||||
startToken: token!
|
||||
})).toBe(true);
|
||||
});
|
||||
|
||||
it.if(supported)('returns false when the stored token does not match (PID reused)', () => {
|
||||
// Simulates the container-restart bug: PID is alive (we pass our own),
|
||||
// but the stored token was written by a prior incarnation.
|
||||
expect(verifyPidFileOwnership({
|
||||
pid: process.pid,
|
||||
port: 37777,
|
||||
startedAt: new Date().toISOString(),
|
||||
startToken: 'token-from-a-different-incarnation'
|
||||
})).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('cleanStalePidFile', () => {
|
||||
it('should remove PID file when process is dead', () => {
|
||||
// Write a PID file with a non-existent PID
|
||||
|
||||
Reference in New Issue
Block a user