chore: merge upstream v12.3.8 + keep local fixes
Upstream: - 12.3.8: detect PID reuse in worker start-guard (#2082) — fixes docker container restart where new worker inherits the old PID and kill(pid, 0) falsely reports the old instance alive. Uses /proc/<pid>/stat starttime on Linux and `ps -p <pid> -o lstart=` on macOS/POSIX as an opaque process-start identity token. Low impact for macOS Desktop users but worth carrying. Local fixes preserved: env-sanitizer PATH extension, SessionStore stale session reset. Both verified in built worker-service.cjs. Worker restarted to v12.3.8. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,7 @@
|
|||||||
"plugins": [
|
"plugins": [
|
||||||
{
|
{
|
||||||
"name": "claude-mem",
|
"name": "claude-mem",
|
||||||
"version": "12.3.7",
|
"version": "12.3.8",
|
||||||
"source": "./plugin",
|
"source": "./plugin",
|
||||||
"description": "Persistent memory system for Claude Code - context compression across sessions"
|
"description": "Persistent memory system for Claude Code - context compression across sessions"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "claude-mem",
|
"name": "claude-mem",
|
||||||
"version": "12.3.7",
|
"version": "12.3.8",
|
||||||
"description": "Memory compression system for Claude Code - persist context across sessions",
|
"description": "Memory compression system for Claude Code - persist context across sessions",
|
||||||
"author": {
|
"author": {
|
||||||
"name": "Alex Newman"
|
"name": "Alex Newman"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "claude-mem",
|
"name": "claude-mem",
|
||||||
"version": "12.3.7",
|
"version": "12.3.8",
|
||||||
"description": "Memory compression system for Claude Code - persist context across sessions",
|
"description": "Memory compression system for Claude Code - persist context across sessions",
|
||||||
"author": {
|
"author": {
|
||||||
"name": "Alex Newman",
|
"name": "Alex Newman",
|
||||||
|
|||||||
@@ -4,6 +4,31 @@ All notable changes to this project will be documented in this file.
|
|||||||
|
|
||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
|
## [12.3.8] - 2026-04-21
|
||||||
|
|
||||||
|
## 🔧 Fix
|
||||||
|
|
||||||
|
**Detect PID reuse in the worker start-guard so containers can restart cleanly.** (#2082)
|
||||||
|
|
||||||
|
The `kill(pid, 0)` liveness check false-positived when the worker's PID file outlived its PID namespace — most commonly after `docker stop` / `docker start` with a bind-mounted `~/.claude-mem`. The new worker would boot as the same low PID (often 11) as the old one, `kill(0)` would report "alive," and the worker would refuse to start *against its own prior incarnation*. Symptom: container appeared to start, immediately exited cleanly with no user-visible error, worker never came up.
|
||||||
|
|
||||||
|
### What changed
|
||||||
|
|
||||||
|
- Capture an opaque **process-start identity token** alongside the PID and verify identity, not just liveness:
|
||||||
|
- **Linux**: `/proc/<pid>/stat` field 22 (starttime in jiffies) — cheap, no exec, same signal `pgrep`/`systemd` use.
|
||||||
|
- **macOS / POSIX**: `ps -p <pid> -o lstart=` with `LC_ALL=C` pinned so the emitted timestamp is locale-independent across environments.
|
||||||
|
- **Windows**: unchanged — falls back to liveness-only. The PID-reuse scenario doesn't affect Windows deployments the way containers do.
|
||||||
|
- `verifyPidFileOwnership` emits a DEBUG log when liveness passes but the token mismatches, so the "PID reused" case is distinguishable from "process dead" in production logs.
|
||||||
|
- PID files written by older versions are token-less; `verifyPidFileOwnership` falls back to the existing liveness-only behavior for backwards compatibility. **No migration required.**
|
||||||
|
|
||||||
|
### Surface
|
||||||
|
|
||||||
|
Shared helpers (`PidInfo`, `captureProcessStartToken`, `verifyPidFileOwnership`) live in `src/supervisor/process-registry.ts` and are re-exported from `ProcessManager.ts` to preserve the existing public surface. Both entry points updated: `worker-service.ts` GUARD 1 and `supervisor/index.ts` `validateWorkerPidFile`.
|
||||||
|
|
||||||
|
### Tests
|
||||||
|
|
||||||
|
+14 new tests covering token capture, ownership verification, backwards compatibility for tokenless PID files, and the container-restart regression scenario. Zero regressions.
|
||||||
|
|
||||||
## [12.3.7] - 2026-04-20
|
## [12.3.7] - 2026-04-20
|
||||||
|
|
||||||
## What's Changed
|
## What's Changed
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "claude-mem",
|
"name": "claude-mem",
|
||||||
"version": "12.3.7",
|
"version": "12.3.8",
|
||||||
"description": "Memory compression system for Claude Code - persist context across sessions",
|
"description": "Memory compression system for Claude Code - persist context across sessions",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"claude",
|
"claude",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "claude-mem",
|
"name": "claude-mem",
|
||||||
"version": "12.3.7",
|
"version": "12.3.8",
|
||||||
"description": "Persistent memory system for Claude Code - seamlessly preserve context across sessions",
|
"description": "Persistent memory system for Claude Code - seamlessly preserve context across sessions",
|
||||||
"author": {
|
"author": {
|
||||||
"name": "Alex Newman"
|
"name": "Alex Newman"
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "claude-mem-plugin",
|
"name": "claude-mem-plugin",
|
||||||
"version": "12.3.7",
|
"version": "12.3.8",
|
||||||
"private": true,
|
"private": true,
|
||||||
"description": "Runtime dependencies for claude-mem bundled hooks",
|
"description": "Runtime dependencies for claude-mem bundled hooks",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
+139
-139
File diff suppressed because one or more lines are too long
@@ -185,18 +185,26 @@ function resolveWorkerRuntimePathUncached(options: RuntimeResolverOptions): stri
|
|||||||
return lookupInPath('bun', platform);
|
return lookupInPath('bun', platform);
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PidInfo {
|
import {
|
||||||
pid: number;
|
captureProcessStartToken,
|
||||||
port: number;
|
verifyPidFileOwnership,
|
||||||
startedAt: string;
|
type PidInfo
|
||||||
}
|
} from '../../supervisor/process-registry.js';
|
||||||
|
export { captureProcessStartToken, verifyPidFileOwnership, type PidInfo };
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write PID info to the standard PID file location
|
* Write PID info to the standard PID file location.
|
||||||
|
*
|
||||||
|
* Automatically captures a process-start token for `info.pid` if the caller
|
||||||
|
* didn't supply one. The token lets future readers detect PID reuse across
|
||||||
|
* reboots/container restarts — see captureProcessStartToken in
|
||||||
|
* supervisor/process-registry.ts.
|
||||||
*/
|
*/
|
||||||
export function writePidFile(info: PidInfo): void {
|
export function writePidFile(info: PidInfo): void {
|
||||||
mkdirSync(DATA_DIR, { recursive: true });
|
mkdirSync(DATA_DIR, { recursive: true });
|
||||||
writeFileSync(PID_FILE, JSON.stringify(info, null, 2));
|
const resolvedToken = info.startToken ?? captureProcessStartToken(info.pid);
|
||||||
|
const payload: PidInfo = resolvedToken ? { ...info, startToken: resolvedToken } : info;
|
||||||
|
writeFileSync(PID_FILE, JSON.stringify(payload, null, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ import {
|
|||||||
runOneTimeChromaMigration,
|
runOneTimeChromaMigration,
|
||||||
runOneTimeCwdRemap,
|
runOneTimeCwdRemap,
|
||||||
cleanStalePidFile,
|
cleanStalePidFile,
|
||||||
isProcessAlive,
|
verifyPidFileOwnership,
|
||||||
spawnDaemon,
|
spawnDaemon,
|
||||||
touchPidFile
|
touchPidFile
|
||||||
} from './infrastructure/ProcessManager.js';
|
} from './infrastructure/ProcessManager.js';
|
||||||
@@ -1361,10 +1361,13 @@ async function main() {
|
|||||||
|
|
||||||
case '--daemon':
|
case '--daemon':
|
||||||
default: {
|
default: {
|
||||||
// GUARD 1: Refuse to start if another worker is already alive (PID check).
|
// GUARD 1: Refuse to start if another worker is already alive.
|
||||||
// Instant check (kill -0) — no HTTP dependency.
|
// Verifies PID *identity* (via start-time token) not just liveness, so a
|
||||||
|
// stale PID file pointing at a PID that's since been reused by an
|
||||||
|
// unrelated process (e.g. container restart reusing low PIDs) doesn't
|
||||||
|
// false-positive.
|
||||||
const existingPidInfo = readPidFile();
|
const existingPidInfo = readPidFile();
|
||||||
if (existingPidInfo && isProcessAlive(existingPidInfo.pid)) {
|
if (verifyPidFileOwnership(existingPidInfo)) {
|
||||||
logger.info('SYSTEM', 'Worker already running (PID alive), refusing to start duplicate', {
|
logger.info('SYSTEM', 'Worker already running (PID alive), refusing to start duplicate', {
|
||||||
existingPid: existingPidInfo.pid,
|
existingPid: existingPidInfo.pid,
|
||||||
existingPort: existingPidInfo.port,
|
existingPort: existingPidInfo.port,
|
||||||
|
|||||||
@@ -2,19 +2,19 @@ import { existsSync, readFileSync, rmSync } from 'fs';
|
|||||||
import { homedir } from 'os';
|
import { homedir } from 'os';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import { logger } from '../utils/logger.js';
|
import { logger } from '../utils/logger.js';
|
||||||
import { getProcessRegistry, isPidAlive, type ManagedProcessInfo, type ProcessRegistry } from './process-registry.js';
|
import {
|
||||||
|
getProcessRegistry,
|
||||||
|
verifyPidFileOwnership,
|
||||||
|
type ManagedProcessInfo,
|
||||||
|
type PidInfo,
|
||||||
|
type ProcessRegistry
|
||||||
|
} from './process-registry.js';
|
||||||
import { runShutdownCascade } from './shutdown.js';
|
import { runShutdownCascade } from './shutdown.js';
|
||||||
import { startHealthChecker, stopHealthChecker } from './health-checker.js';
|
import { startHealthChecker, stopHealthChecker } from './health-checker.js';
|
||||||
|
|
||||||
const DATA_DIR = path.join(homedir(), '.claude-mem');
|
const DATA_DIR = path.join(homedir(), '.claude-mem');
|
||||||
const PID_FILE = path.join(DATA_DIR, 'worker.pid');
|
const PID_FILE = path.join(DATA_DIR, 'worker.pid');
|
||||||
|
|
||||||
interface PidInfo {
|
|
||||||
pid: number;
|
|
||||||
port: number;
|
|
||||||
startedAt: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface ValidateWorkerPidOptions {
|
interface ValidateWorkerPidOptions {
|
||||||
logAlive?: boolean;
|
logAlive?: boolean;
|
||||||
pidFilePath?: string;
|
pidFilePath?: string;
|
||||||
@@ -182,7 +182,7 @@ export function validateWorkerPidFile(options: ValidateWorkerPidOptions = {}): V
|
|||||||
return 'invalid';
|
return 'invalid';
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isPidAlive(pidInfo.pid)) {
|
if (verifyPidFileOwnership(pidInfo)) {
|
||||||
if (options.logAlive ?? true) {
|
if (options.logAlive ?? true) {
|
||||||
logger.info('SYSTEM', 'Worker already running (PID alive)', {
|
logger.info('SYSTEM', 'Worker already running (PID alive)', {
|
||||||
existingPid: pidInfo.pid,
|
existingPid: pidInfo.pid,
|
||||||
@@ -193,7 +193,7 @@ export function validateWorkerPidFile(options: ValidateWorkerPidOptions = {}): V
|
|||||||
return 'alive';
|
return 'alive';
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info('SYSTEM', 'Removing stale PID file (worker process is dead)', {
|
logger.info('SYSTEM', 'Removing stale PID file (worker process is dead or PID has been reused)', {
|
||||||
pid: pidInfo.pid,
|
pid: pidInfo.pid,
|
||||||
port: pidInfo.port,
|
port: pidInfo.port,
|
||||||
startedAt: pidInfo.startedAt
|
startedAt: pidInfo.startedAt
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { ChildProcess } from 'child_process';
|
import { ChildProcess, spawnSync } from 'child_process';
|
||||||
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
|
||||||
import { homedir } from 'os';
|
import { homedir } from 'os';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
@@ -44,6 +44,134 @@ export function isPidAlive(pid: number): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface PidInfo {
|
||||||
|
pid: number;
|
||||||
|
port: number;
|
||||||
|
startedAt: string;
|
||||||
|
// Opaque process-start token used to distinguish a worker incarnation from
|
||||||
|
// another process that happens to reuse the same PID. Captured via
|
||||||
|
// captureProcessStartToken() at write time, checked via
|
||||||
|
// verifyPidFileOwnership() at read time. Optional for backwards
|
||||||
|
// compatibility with PID files written by older versions.
|
||||||
|
startToken?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Capture an opaque "identity" token for a running PID — something stable
|
||||||
|
* across time for that exact process incarnation, but different if the PID
|
||||||
|
* gets reused by a later process.
|
||||||
|
*
|
||||||
|
* Fixes a class of false-positive "worker already running" errors where the
|
||||||
|
* PID file survives (bind-mounted volume, persistent home dir, etc.) while
|
||||||
|
* the PID namespace resets (docker stop / docker start), and the new worker
|
||||||
|
* incarnation happens to get the same PID as the old one. A plain kill(0)
|
||||||
|
* liveness check then says "yes, PID is alive" — but it's actually *us*
|
||||||
|
* checking against our own PID file and refusing to boot.
|
||||||
|
*
|
||||||
|
* Sources by platform (`process.platform`):
|
||||||
|
* - `linux`: field 22 of /proc/<pid>/stat (starttime, jiffies since boot).
|
||||||
|
* Cheap, no exec. Same approach pgrep/systemd use.
|
||||||
|
* - `darwin` and any other POSIX (*BSD, SunOS) that falls through the Linux
|
||||||
|
* check: `ps -p <pid> -o lstart=` (wall-clock start time). A one-shot exec
|
||||||
|
* at worker startup — fine. If `ps` is missing the ENOENT is caught and
|
||||||
|
* null is returned; callers then fall back to liveness-only.
|
||||||
|
* - `win32`: null (caller falls back to liveness-only behavior). The PID-
|
||||||
|
* reuse scenario doesn't affect Windows deployments the way containers do.
|
||||||
|
*
|
||||||
|
* Returns null when we can't read a token (permission denied, process gone,
|
||||||
|
* unsupported platform). Callers should treat null as "can't verify" and
|
||||||
|
* fall back to the liveness-only code path to preserve existing behavior.
|
||||||
|
*/
|
||||||
|
export function captureProcessStartToken(pid: number): string | null {
|
||||||
|
if (!Number.isInteger(pid) || pid <= 0) return null;
|
||||||
|
|
||||||
|
if (process.platform === 'linux') {
|
||||||
|
try {
|
||||||
|
// /proc/<pid>/stat format:
|
||||||
|
// <pid> (comm) <state> <ppid> ... <starttime@field-22> ...
|
||||||
|
// `comm` can contain spaces and parens, so we key off the LAST ')' and
|
||||||
|
// split the tail — avoids being confused by weird process names.
|
||||||
|
const raw = readFileSync(`/proc/${pid}/stat`, 'utf-8');
|
||||||
|
const tailStart = raw.lastIndexOf(') ');
|
||||||
|
if (tailStart < 0) return null;
|
||||||
|
const fields = raw.slice(tailStart + 2).split(' ');
|
||||||
|
// After ') ' we're at field 3 (state). starttime is field 22.
|
||||||
|
// Offset into `fields`: 22 - 3 = 19.
|
||||||
|
const starttime = fields[19];
|
||||||
|
return starttime && /^\d+$/.test(starttime) ? starttime : null;
|
||||||
|
} catch (error: unknown) {
|
||||||
|
logger.debug('SYSTEM', 'captureProcessStartToken: /proc read failed', {
|
||||||
|
pid,
|
||||||
|
error: error instanceof Error ? error.message : String(error)
|
||||||
|
});
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (process.platform === 'win32') {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Pin LC_ALL=C so `ps lstart=` emits a locale-independent timestamp
|
||||||
|
// (e.g. `Mon Apr 21 09:00:00 2026`). Without this, a bind-mounted PID
|
||||||
|
// file written under one locale and read under another would hash to
|
||||||
|
// different tokens and the new worker would incorrectly treat itself
|
||||||
|
// as a stale prior incarnation — reintroducing the bug this helper
|
||||||
|
// exists to prevent. Flagged by Greptile on PR #2082.
|
||||||
|
const result = spawnSync('ps', ['-p', String(pid), '-o', 'lstart='], {
|
||||||
|
encoding: 'utf-8',
|
||||||
|
timeout: 2000,
|
||||||
|
env: { ...process.env, LC_ALL: 'C', LANG: 'C' }
|
||||||
|
});
|
||||||
|
if (result.status !== 0) return null;
|
||||||
|
const token = result.stdout.trim();
|
||||||
|
return token.length > 0 ? token : null;
|
||||||
|
} catch (error: unknown) {
|
||||||
|
logger.debug('SYSTEM', 'captureProcessStartToken: ps exec failed', {
|
||||||
|
pid,
|
||||||
|
error: error instanceof Error ? error.message : String(error)
|
||||||
|
});
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verify that the process named by `info` is the same worker incarnation
|
||||||
|
* that wrote the PID file. Returns true only when:
|
||||||
|
* - the PID is currently alive, AND
|
||||||
|
* - either the stored start token matches the current token for that PID,
|
||||||
|
* OR no token is stored (PID file written by an older version — fall
|
||||||
|
* back to liveness-only for backwards compatibility).
|
||||||
|
*
|
||||||
|
* Returns false for null input, dead PIDs, and token mismatches. A token
|
||||||
|
* mismatch means the PID has been reused by an unrelated process — the PID
|
||||||
|
* file is stale even though kill(0) succeeds.
|
||||||
|
*/
|
||||||
|
export function verifyPidFileOwnership(info: PidInfo | null): info is PidInfo {
|
||||||
|
if (!info) return false;
|
||||||
|
if (!isPidAlive(info.pid)) return false;
|
||||||
|
|
||||||
|
if (!info.startToken) return true;
|
||||||
|
|
||||||
|
const currentToken = captureProcessStartToken(info.pid);
|
||||||
|
if (currentToken === null) return true;
|
||||||
|
|
||||||
|
const match = currentToken === info.startToken;
|
||||||
|
if (!match) {
|
||||||
|
// Emit a debug signal when liveness passes but identity fails — the
|
||||||
|
// exact container-restart scenario this helper exists to catch. Without
|
||||||
|
// this log the callers just say "stale" and can't distinguish
|
||||||
|
// "process dead" from "PID reused by a different process".
|
||||||
|
logger.debug('SYSTEM', 'verifyPidFileOwnership: start-token mismatch (PID reused)', {
|
||||||
|
pid: info.pid,
|
||||||
|
stored: info.startToken,
|
||||||
|
current: currentToken
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return match;
|
||||||
|
}
|
||||||
|
|
||||||
export class ProcessRegistry {
|
export class ProcessRegistry {
|
||||||
private readonly registryPath: string;
|
private readonly registryPath: string;
|
||||||
private readonly entries = new Map<string, ManagedProcessInfo>();
|
private readonly entries = new Map<string, ManagedProcessInfo>();
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ import {
|
|||||||
spawnDaemon,
|
spawnDaemon,
|
||||||
resolveWorkerRuntimePath,
|
resolveWorkerRuntimePath,
|
||||||
runOneTimeChromaMigration,
|
runOneTimeChromaMigration,
|
||||||
|
captureProcessStartToken,
|
||||||
|
verifyPidFileOwnership,
|
||||||
type PidInfo
|
type PidInfo
|
||||||
} from '../../src/services/infrastructure/index.js';
|
} from '../../src/services/infrastructure/index.js';
|
||||||
|
|
||||||
@@ -361,6 +363,121 @@ describe('ProcessManager', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('captureProcessStartToken', () => {
|
||||||
|
const supported = process.platform === 'linux' || process.platform === 'darwin';
|
||||||
|
|
||||||
|
it.if(supported)('returns a non-empty token for the current process', () => {
|
||||||
|
const token = captureProcessStartToken(process.pid);
|
||||||
|
expect(typeof token).toBe('string');
|
||||||
|
expect((token ?? '').length).toBeGreaterThan(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.if(supported)('returns a stable token across calls for the same PID', () => {
|
||||||
|
const first = captureProcessStartToken(process.pid);
|
||||||
|
const second = captureProcessStartToken(process.pid);
|
||||||
|
expect(first).toBe(second);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns null for a non-existent PID', () => {
|
||||||
|
expect(captureProcessStartToken(2147483647)).toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns null for invalid PIDs', () => {
|
||||||
|
expect(captureProcessStartToken(0)).toBeNull();
|
||||||
|
expect(captureProcessStartToken(-1)).toBeNull();
|
||||||
|
expect(captureProcessStartToken(1.5)).toBeNull();
|
||||||
|
expect(captureProcessStartToken(NaN)).toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns null on win32 (liveness-only fallback path)', () => {
|
||||||
|
// Simulate Windows to exercise the documented fallback. Real CI doesn't
|
||||||
|
// run on win32, so without this mock the branch is uncovered.
|
||||||
|
const originalPlatform = process.platform;
|
||||||
|
Object.defineProperty(process, 'platform', { value: 'win32', configurable: true });
|
||||||
|
try {
|
||||||
|
expect(captureProcessStartToken(process.pid)).toBeNull();
|
||||||
|
} finally {
|
||||||
|
Object.defineProperty(process, 'platform', { value: originalPlatform, configurable: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('writePidFile (start-token capture)', () => {
|
||||||
|
const supported = process.platform === 'linux' || process.platform === 'darwin';
|
||||||
|
|
||||||
|
it.if(supported)('auto-captures a startToken when writing for the current process', () => {
|
||||||
|
writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() });
|
||||||
|
const persisted = readPidFile();
|
||||||
|
expect(persisted).not.toBeNull();
|
||||||
|
expect(typeof persisted!.startToken).toBe('string');
|
||||||
|
expect((persisted!.startToken ?? '').length).toBeGreaterThan(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('preserves a caller-supplied startToken verbatim', () => {
|
||||||
|
const provided = 'caller-supplied-token-xyz';
|
||||||
|
writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString(), startToken: provided });
|
||||||
|
const persisted = readPidFile();
|
||||||
|
expect(persisted!.startToken).toBe(provided);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('omits startToken when the target PID has no readable token (dead PID)', () => {
|
||||||
|
// pid is dead, so captureProcessStartToken() returns null and writePidFile
|
||||||
|
// should not persist a startToken field.
|
||||||
|
writePidFile({ pid: 2147483647, port: 37777, startedAt: new Date().toISOString() });
|
||||||
|
const persisted = readPidFile();
|
||||||
|
expect(persisted).not.toBeNull();
|
||||||
|
expect(persisted!.startToken).toBeUndefined();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('verifyPidFileOwnership', () => {
|
||||||
|
const supported = process.platform === 'linux' || process.platform === 'darwin';
|
||||||
|
|
||||||
|
it('returns false for null input', () => {
|
||||||
|
expect(verifyPidFileOwnership(null)).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns false when the PID is not alive', () => {
|
||||||
|
expect(verifyPidFileOwnership({
|
||||||
|
pid: 2147483647,
|
||||||
|
port: 37777,
|
||||||
|
startedAt: new Date().toISOString(),
|
||||||
|
startToken: 'anything'
|
||||||
|
})).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns true when no startToken is stored (back-compat with older PID files)', () => {
|
||||||
|
expect(verifyPidFileOwnership({
|
||||||
|
pid: process.pid,
|
||||||
|
port: 37777,
|
||||||
|
startedAt: new Date().toISOString()
|
||||||
|
// intentionally no startToken
|
||||||
|
})).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.if(supported)('returns true when the stored token matches the current PID', () => {
|
||||||
|
const token = captureProcessStartToken(process.pid);
|
||||||
|
expect(token).not.toBeNull();
|
||||||
|
expect(verifyPidFileOwnership({
|
||||||
|
pid: process.pid,
|
||||||
|
port: 37777,
|
||||||
|
startedAt: new Date().toISOString(),
|
||||||
|
startToken: token!
|
||||||
|
})).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.if(supported)('returns false when the stored token does not match (PID reused)', () => {
|
||||||
|
// Simulates the container-restart bug: PID is alive (we pass our own),
|
||||||
|
// but the stored token was written by a prior incarnation.
|
||||||
|
expect(verifyPidFileOwnership({
|
||||||
|
pid: process.pid,
|
||||||
|
port: 37777,
|
||||||
|
startedAt: new Date().toISOString(),
|
||||||
|
startToken: 'token-from-a-different-incarnation'
|
||||||
|
})).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
describe('cleanStalePidFile', () => {
|
describe('cleanStalePidFile', () => {
|
||||||
it('should remove PID file when process is dead', () => {
|
it('should remove PID file when process is dead', () => {
|
||||||
// Write a PID file with a non-existent PID
|
// Write a PID file with a non-existent PID
|
||||||
|
|||||||
@@ -68,6 +68,29 @@ describe('validateWorkerPidFile', () => {
|
|||||||
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
||||||
expect(status).toBe('alive');
|
expect(status).toBe('alive');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression: container restart (docker stop / docker start) reused low PIDs
|
||||||
|
// across boots. The PID file on a bind-mounted volume pointed at PID 11;
|
||||||
|
// the new worker also came up as PID 11. kill(0) returned "alive" and the
|
||||||
|
// worker refused to boot, thinking its own prior incarnation was still up.
|
||||||
|
// With the start-token identity check, a stored token that doesn't match
|
||||||
|
// the current PID's token should resolve as "stale" and the PID file should
|
||||||
|
// be cleared so the new worker can proceed.
|
||||||
|
const tokenSupported = process.platform === 'linux' || process.platform === 'darwin';
|
||||||
|
it.if(tokenSupported)('returns "stale" when startToken does not match the live PID (PID reused)', () => {
|
||||||
|
const tempDir = makeTempDir();
|
||||||
|
tempDirs.push(tempDir);
|
||||||
|
const pidFilePath = path.join(tempDir, 'worker.pid');
|
||||||
|
writeFileSync(pidFilePath, JSON.stringify({
|
||||||
|
pid: process.pid,
|
||||||
|
port: 37777,
|
||||||
|
startedAt: new Date().toISOString(),
|
||||||
|
startToken: 'token-from-a-different-incarnation'
|
||||||
|
}));
|
||||||
|
|
||||||
|
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
|
||||||
|
expect(status).toBe('stale');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('Supervisor assertCanSpawn behavior', () => {
|
describe('Supervisor assertCanSpawn behavior', () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user