Files
claude-mem/tests/supervisor/index.test.ts
T
Alex Newman 99060bac1a fix: detect PID reuse in worker start-guard (container restarts) (#2082)
* fix: detect PID reuse in worker start-guard to survive container restarts

The 'Worker already running' guard checked PID liveness with kill(0), which
false-positives when a persistent PID file outlives the PID namespace (docker
stop / docker start, pm2 graceful reloads). The new worker comes up with the
same low PID (e.g. 11) as the old one, kill(0) says 'alive', and the worker
refuses to start against its own prior incarnation.

Capture a process-start token alongside the PID and verify identity, not just
liveness:
  - Linux: /proc/<pid>/stat field 22 (starttime, jiffies since boot)
  - macOS/POSIX: `ps -p <pid> -o lstart=`
  - Windows: unchanged (returns null, falls back to liveness)

PID files written by older versions are token-less, so verifyPidFileOwnership
falls back to the current liveness-only behavior for backwards compatibility.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor: apply review feedback to PID identity helpers

- Collapse ProcessManager re-export down to a single import/export statement.
- Make verifyPidFileOwnership a type predicate (info is PidInfo) so callers
  don't need non-null assertions on the narrowed value.
- Drop the `!` assertions at the worker-service GUARD 1 call site now that
  the predicate narrows.
- Tighten the captureProcessStartToken platform doc comment to enumerate
  process.platform values explicitly.

No behavior change — esbuild output is byte-identical (type-only edits).
Addresses items 1-3 of the claude-review comment on PR #2082.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix: pin LC_ALL=C for `ps lstart=` in captureProcessStartToken

Without a locale pin, `ps -o lstart=` emits month/weekday names in the
system locale. A bind-mounted PID file written under one locale and read
under another would hash to different tokens and the live worker would
incorrectly appear stale — reintroducing the very bug this helper exists
to prevent.

Flagged by Greptile on PR #2082.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* refactor: address second-round review on PID identity helpers

- verifyPidFileOwnership: log a DEBUG diagnostic when the PID is alive but
  the start-token mismatches. Without it, callers can't distinguish the
  "process dead" path from the "PID reused" path in production logs — the
  exact case this helper exists to catch.
- writePidFile: drop the redundant `?? undefined` coercion. `null` and
  `undefined` are both falsy for the subsequent ternary, so the coercion
  was purely cosmetic noise that suggested an important distinction.
- Add a unit test for the win32 fallback path in captureProcessStartToken
  (mocks process.platform) — previously uncovered in CI.

Addresses items 1, 2, and 5 of the second claude-review on PR #2082.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 19:49:03 -07:00

135 lines
4.8 KiB
TypeScript

import { afterEach, describe, expect, it } from 'bun:test';
import { mkdirSync, rmSync, writeFileSync } from 'fs';
import { tmpdir } from 'os';
import path from 'path';
import { validateWorkerPidFile, type ValidateWorkerPidStatus } from '../../src/supervisor/index.js';
function makeTempDir(): string {
const dir = path.join(tmpdir(), `claude-mem-index-${Date.now()}-${Math.random().toString(36).slice(2)}`);
mkdirSync(dir, { recursive: true });
return dir;
}
const tempDirs: string[] = [];
describe('validateWorkerPidFile', () => {
afterEach(() => {
while (tempDirs.length > 0) {
const dir = tempDirs.pop();
if (dir) {
rmSync(dir, { recursive: true, force: true });
}
}
});
it('returns "missing" when PID file does not exist', () => {
const tempDir = makeTempDir();
tempDirs.push(tempDir);
const pidFilePath = path.join(tempDir, 'worker.pid');
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('missing');
});
it('returns "invalid" when PID file contains bad JSON', () => {
const tempDir = makeTempDir();
tempDirs.push(tempDir);
const pidFilePath = path.join(tempDir, 'worker.pid');
writeFileSync(pidFilePath, 'not-json!!!');
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('invalid');
});
it('returns "stale" when PID file references a dead process', () => {
const tempDir = makeTempDir();
tempDirs.push(tempDir);
const pidFilePath = path.join(tempDir, 'worker.pid');
writeFileSync(pidFilePath, JSON.stringify({
pid: 2147483647,
port: 37777,
startedAt: new Date().toISOString()
}));
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('stale');
});
it('returns "alive" when PID file references the current process', () => {
const tempDir = makeTempDir();
tempDirs.push(tempDir);
const pidFilePath = path.join(tempDir, 'worker.pid');
writeFileSync(pidFilePath, JSON.stringify({
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString()
}));
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('alive');
});
// Regression: container restart (docker stop / docker start) reused low PIDs
// across boots. The PID file on a bind-mounted volume pointed at PID 11;
// the new worker also came up as PID 11. kill(0) returned "alive" and the
// worker refused to boot, thinking its own prior incarnation was still up.
// With the start-token identity check, a stored token that doesn't match
// the current PID's token should resolve as "stale" and the PID file should
// be cleared so the new worker can proceed.
const tokenSupported = process.platform === 'linux' || process.platform === 'darwin';
it.if(tokenSupported)('returns "stale" when startToken does not match the live PID (PID reused)', () => {
const tempDir = makeTempDir();
tempDirs.push(tempDir);
const pidFilePath = path.join(tempDir, 'worker.pid');
writeFileSync(pidFilePath, JSON.stringify({
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString(),
startToken: 'token-from-a-different-incarnation'
}));
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('stale');
});
});
describe('Supervisor assertCanSpawn behavior', () => {
it('assertCanSpawn throws when stopPromise is active (shutdown in progress)', () => {
const { getSupervisor } = require('../../src/supervisor/index.js');
const supervisor = getSupervisor();
// When not shutting down, assertCanSpawn should not throw
expect(() => supervisor.assertCanSpawn('test')).not.toThrow();
});
it('registerProcess and unregisterProcess delegate to the registry', () => {
const { getSupervisor } = require('../../src/supervisor/index.js');
const supervisor = getSupervisor();
const registry = supervisor.getRegistry();
const testId = `test-${Date.now()}`;
supervisor.registerProcess(testId, {
pid: process.pid,
type: 'test',
startedAt: new Date().toISOString()
});
const found = registry.getAll().find((r: { id: string }) => r.id === testId);
expect(found).toBeDefined();
expect(found?.type).toBe('test');
supervisor.unregisterProcess(testId);
const afterUnregister = registry.getAll().find((r: { id: string }) => r.id === testId);
expect(afterUnregister).toBeUndefined();
});
});
describe('Supervisor start idempotency', () => {
it('getSupervisor returns the same instance', () => {
const { getSupervisor } = require('../../src/supervisor/index.js');
const s1 = getSupervisor();
const s2 = getSupervisor();
expect(s1).toBe(s2);
});
});