chore: merge upstream v12.3.8 + keep local fixes

Upstream:
- 12.3.8: detect PID reuse in worker start-guard (#2082) — fixes
  docker container restart where new worker inherits the old PID
  and kill(pid, 0) falsely reports the old instance alive. Uses
  /proc/<pid>/stat starttime on Linux and `ps -p <pid> -o lstart=`
  on macOS/POSIX as an opaque process-start identity token.

Low impact for macOS Desktop users but worth carrying.

Local fixes preserved: env-sanitizer PATH extension, SessionStore
stale session reset. Both verified in built worker-service.cjs.
Worker restarted to v12.3.8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
JOUNGWOOK KWON
2026-04-22 19:20:51 +09:00
15 changed files with 496 additions and 192 deletions
+1 -1
View File
@@ -10,7 +10,7 @@
"plugins": [
{
"name": "claude-mem",
"version": "12.3.7",
"version": "12.3.8",
"source": "./plugin",
"description": "Persistent memory system for Claude Code - context compression across sessions"
}
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "claude-mem",
"version": "12.3.7",
"version": "12.3.8",
"description": "Memory compression system for Claude Code - persist context across sessions",
"author": {
"name": "Alex Newman"
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "claude-mem",
"version": "12.3.7",
"version": "12.3.8",
"description": "Memory compression system for Claude Code - persist context across sessions",
"author": {
"name": "Alex Newman",
+25
View File
@@ -4,6 +4,31 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [12.3.8] - 2026-04-21
## 🔧 Fix
**Detect PID reuse in the worker start-guard so containers can restart cleanly.** (#2082)
The `kill(pid, 0)` liveness check false-positived when the worker's PID file outlived its PID namespace — most commonly after `docker stop` / `docker start` with a bind-mounted `~/.claude-mem`. The new worker would boot as the same low PID (often 11) as the old one, `kill(0)` would report "alive," and the worker would refuse to start *against its own prior incarnation*. Symptom: container appeared to start, immediately exited cleanly with no user-visible error, worker never came up.
### What changed
- Capture an opaque **process-start identity token** alongside the PID and verify identity, not just liveness:
- **Linux**: `/proc/<pid>/stat` field 22 (starttime in jiffies) — cheap, no exec, same signal `pgrep`/`systemd` use.
- **macOS / POSIX**: `ps -p <pid> -o lstart=` with `LC_ALL=C` pinned so the emitted timestamp is locale-independent across environments.
- **Windows**: unchanged — falls back to liveness-only. The PID-reuse scenario doesn't affect Windows deployments the way containers do.
- `verifyPidFileOwnership` emits a DEBUG log when liveness passes but the token mismatches, so the "PID reused" case is distinguishable from "process dead" in production logs.
- PID files written by older versions are token-less; `verifyPidFileOwnership` falls back to the existing liveness-only behavior for backwards compatibility. **No migration required.**
### Surface
Shared helpers (`PidInfo`, `captureProcessStartToken`, `verifyPidFileOwnership`) live in `src/supervisor/process-registry.ts` and are re-exported from `ProcessManager.ts` to preserve the existing public surface. Both entry points updated: `worker-service.ts` GUARD 1 and `supervisor/index.ts` `validateWorkerPidFile`.
### Tests
+14 new tests covering token capture, ownership verification, backwards compatibility for tokenless PID files, and the container-restart regression scenario. Zero regressions.
## [12.3.7] - 2026-04-20
## What's Changed
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "claude-mem",
"version": "12.3.7",
"version": "12.3.8",
"description": "Memory compression system for Claude Code - persist context across sessions",
"keywords": [
"claude",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "claude-mem",
"version": "12.3.7",
"version": "12.3.8",
"description": "Persistent memory system for Claude Code - seamlessly preserve context across sessions",
"author": {
"name": "Alex Newman"
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "claude-mem-plugin",
"version": "12.3.7",
"version": "12.3.8",
"private": true,
"description": "Runtime dependencies for claude-mem bundled hooks",
"type": "module",
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+15 -7
View File
@@ -185,18 +185,26 @@ function resolveWorkerRuntimePathUncached(options: RuntimeResolverOptions): stri
return lookupInPath('bun', platform);
}
export interface PidInfo {
pid: number;
port: number;
startedAt: string;
}
import {
captureProcessStartToken,
verifyPidFileOwnership,
type PidInfo
} from '../../supervisor/process-registry.js';
export { captureProcessStartToken, verifyPidFileOwnership, type PidInfo };
/**
* Write PID info to the standard PID file location
* Write PID info to the standard PID file location.
*
* Automatically captures a process-start token for `info.pid` if the caller
* didn't supply one. The token lets future readers detect PID reuse across
* reboots/container restarts see captureProcessStartToken in
* supervisor/process-registry.ts.
*/
export function writePidFile(info: PidInfo): void {
mkdirSync(DATA_DIR, { recursive: true });
writeFileSync(PID_FILE, JSON.stringify(info, null, 2));
const resolvedToken = info.startToken ?? captureProcessStartToken(info.pid);
const payload: PidInfo = resolvedToken ? { ...info, startToken: resolvedToken } : info;
writeFileSync(PID_FILE, JSON.stringify(payload, null, 2));
}
/**
+7 -4
View File
@@ -48,7 +48,7 @@ import {
runOneTimeChromaMigration,
runOneTimeCwdRemap,
cleanStalePidFile,
isProcessAlive,
verifyPidFileOwnership,
spawnDaemon,
touchPidFile
} from './infrastructure/ProcessManager.js';
@@ -1361,10 +1361,13 @@ async function main() {
case '--daemon':
default: {
// GUARD 1: Refuse to start if another worker is already alive (PID check).
// Instant check (kill -0) — no HTTP dependency.
// GUARD 1: Refuse to start if another worker is already alive.
// Verifies PID *identity* (via start-time token) not just liveness, so a
// stale PID file pointing at a PID that's since been reused by an
// unrelated process (e.g. container restart reusing low PIDs) doesn't
// false-positive.
const existingPidInfo = readPidFile();
if (existingPidInfo && isProcessAlive(existingPidInfo.pid)) {
if (verifyPidFileOwnership(existingPidInfo)) {
logger.info('SYSTEM', 'Worker already running (PID alive), refusing to start duplicate', {
existingPid: existingPidInfo.pid,
existingPort: existingPidInfo.port,
+9 -9
View File
@@ -2,19 +2,19 @@ import { existsSync, readFileSync, rmSync } from 'fs';
import { homedir } from 'os';
import path from 'path';
import { logger } from '../utils/logger.js';
import { getProcessRegistry, isPidAlive, type ManagedProcessInfo, type ProcessRegistry } from './process-registry.js';
import {
getProcessRegistry,
verifyPidFileOwnership,
type ManagedProcessInfo,
type PidInfo,
type ProcessRegistry
} from './process-registry.js';
import { runShutdownCascade } from './shutdown.js';
import { startHealthChecker, stopHealthChecker } from './health-checker.js';
const DATA_DIR = path.join(homedir(), '.claude-mem');
const PID_FILE = path.join(DATA_DIR, 'worker.pid');
interface PidInfo {
pid: number;
port: number;
startedAt: string;
}
interface ValidateWorkerPidOptions {
logAlive?: boolean;
pidFilePath?: string;
@@ -182,7 +182,7 @@ export function validateWorkerPidFile(options: ValidateWorkerPidOptions = {}): V
return 'invalid';
}
if (isPidAlive(pidInfo.pid)) {
if (verifyPidFileOwnership(pidInfo)) {
if (options.logAlive ?? true) {
logger.info('SYSTEM', 'Worker already running (PID alive)', {
existingPid: pidInfo.pid,
@@ -193,7 +193,7 @@ export function validateWorkerPidFile(options: ValidateWorkerPidOptions = {}): V
return 'alive';
}
logger.info('SYSTEM', 'Removing stale PID file (worker process is dead)', {
logger.info('SYSTEM', 'Removing stale PID file (worker process is dead or PID has been reused)', {
pid: pidInfo.pid,
port: pidInfo.port,
startedAt: pidInfo.startedAt
+129 -1
View File
@@ -1,4 +1,4 @@
import { ChildProcess } from 'child_process';
import { ChildProcess, spawnSync } from 'child_process';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
import { homedir } from 'os';
import path from 'path';
@@ -44,6 +44,134 @@ export function isPidAlive(pid: number): boolean {
}
}
export interface PidInfo {
pid: number;
port: number;
startedAt: string;
// Opaque process-start token used to distinguish a worker incarnation from
// another process that happens to reuse the same PID. Captured via
// captureProcessStartToken() at write time, checked via
// verifyPidFileOwnership() at read time. Optional for backwards
// compatibility with PID files written by older versions.
startToken?: string;
}
/**
* Capture an opaque "identity" token for a running PID something stable
* across time for that exact process incarnation, but different if the PID
* gets reused by a later process.
*
* Fixes a class of false-positive "worker already running" errors where the
* PID file survives (bind-mounted volume, persistent home dir, etc.) while
* the PID namespace resets (docker stop / docker start), and the new worker
* incarnation happens to get the same PID as the old one. A plain kill(0)
* liveness check then says "yes, PID is alive" but it's actually *us*
* checking against our own PID file and refusing to boot.
*
* Sources by platform (`process.platform`):
* - `linux`: field 22 of /proc/<pid>/stat (starttime, jiffies since boot).
* Cheap, no exec. Same approach pgrep/systemd use.
* - `darwin` and any other POSIX (*BSD, SunOS) that falls through the Linux
* check: `ps -p <pid> -o lstart=` (wall-clock start time). A one-shot exec
* at worker startup fine. If `ps` is missing the ENOENT is caught and
* null is returned; callers then fall back to liveness-only.
* - `win32`: null (caller falls back to liveness-only behavior). The PID-
* reuse scenario doesn't affect Windows deployments the way containers do.
*
* Returns null when we can't read a token (permission denied, process gone,
* unsupported platform). Callers should treat null as "can't verify" and
* fall back to the liveness-only code path to preserve existing behavior.
*/
export function captureProcessStartToken(pid: number): string | null {
if (!Number.isInteger(pid) || pid <= 0) return null;
if (process.platform === 'linux') {
try {
// /proc/<pid>/stat format:
// <pid> (comm) <state> <ppid> ... <starttime@field-22> ...
// `comm` can contain spaces and parens, so we key off the LAST ')' and
// split the tail — avoids being confused by weird process names.
const raw = readFileSync(`/proc/${pid}/stat`, 'utf-8');
const tailStart = raw.lastIndexOf(') ');
if (tailStart < 0) return null;
const fields = raw.slice(tailStart + 2).split(' ');
// After ') ' we're at field 3 (state). starttime is field 22.
// Offset into `fields`: 22 - 3 = 19.
const starttime = fields[19];
return starttime && /^\d+$/.test(starttime) ? starttime : null;
} catch (error: unknown) {
logger.debug('SYSTEM', 'captureProcessStartToken: /proc read failed', {
pid,
error: error instanceof Error ? error.message : String(error)
});
return null;
}
}
if (process.platform === 'win32') {
return null;
}
try {
// Pin LC_ALL=C so `ps lstart=` emits a locale-independent timestamp
// (e.g. `Mon Apr 21 09:00:00 2026`). Without this, a bind-mounted PID
// file written under one locale and read under another would hash to
// different tokens and the new worker would incorrectly treat itself
// as a stale prior incarnation — reintroducing the bug this helper
// exists to prevent. Flagged by Greptile on PR #2082.
const result = spawnSync('ps', ['-p', String(pid), '-o', 'lstart='], {
encoding: 'utf-8',
timeout: 2000,
env: { ...process.env, LC_ALL: 'C', LANG: 'C' }
});
if (result.status !== 0) return null;
const token = result.stdout.trim();
return token.length > 0 ? token : null;
} catch (error: unknown) {
logger.debug('SYSTEM', 'captureProcessStartToken: ps exec failed', {
pid,
error: error instanceof Error ? error.message : String(error)
});
return null;
}
}
/**
* Verify that the process named by `info` is the same worker incarnation
* that wrote the PID file. Returns true only when:
* - the PID is currently alive, AND
* - either the stored start token matches the current token for that PID,
* OR no token is stored (PID file written by an older version fall
* back to liveness-only for backwards compatibility).
*
* Returns false for null input, dead PIDs, and token mismatches. A token
* mismatch means the PID has been reused by an unrelated process the PID
* file is stale even though kill(0) succeeds.
*/
export function verifyPidFileOwnership(info: PidInfo | null): info is PidInfo {
if (!info) return false;
if (!isPidAlive(info.pid)) return false;
if (!info.startToken) return true;
const currentToken = captureProcessStartToken(info.pid);
if (currentToken === null) return true;
const match = currentToken === info.startToken;
if (!match) {
// Emit a debug signal when liveness passes but identity fails — the
// exact container-restart scenario this helper exists to catch. Without
// this log the callers just say "stale" and can't distinguish
// "process dead" from "PID reused by a different process".
logger.debug('SYSTEM', 'verifyPidFileOwnership: start-token mismatch (PID reused)', {
pid: info.pid,
stored: info.startToken,
current: currentToken
});
}
return match;
}
export class ProcessRegistry {
private readonly registryPath: string;
private readonly entries = new Map<string, ManagedProcessInfo>();
@@ -16,6 +16,8 @@ import {
spawnDaemon,
resolveWorkerRuntimePath,
runOneTimeChromaMigration,
captureProcessStartToken,
verifyPidFileOwnership,
type PidInfo
} from '../../src/services/infrastructure/index.js';
@@ -361,6 +363,121 @@ describe('ProcessManager', () => {
});
});
describe('captureProcessStartToken', () => {
const supported = process.platform === 'linux' || process.platform === 'darwin';
it.if(supported)('returns a non-empty token for the current process', () => {
const token = captureProcessStartToken(process.pid);
expect(typeof token).toBe('string');
expect((token ?? '').length).toBeGreaterThan(0);
});
it.if(supported)('returns a stable token across calls for the same PID', () => {
const first = captureProcessStartToken(process.pid);
const second = captureProcessStartToken(process.pid);
expect(first).toBe(second);
});
it('returns null for a non-existent PID', () => {
expect(captureProcessStartToken(2147483647)).toBeNull();
});
it('returns null for invalid PIDs', () => {
expect(captureProcessStartToken(0)).toBeNull();
expect(captureProcessStartToken(-1)).toBeNull();
expect(captureProcessStartToken(1.5)).toBeNull();
expect(captureProcessStartToken(NaN)).toBeNull();
});
it('returns null on win32 (liveness-only fallback path)', () => {
// Simulate Windows to exercise the documented fallback. Real CI doesn't
// run on win32, so without this mock the branch is uncovered.
const originalPlatform = process.platform;
Object.defineProperty(process, 'platform', { value: 'win32', configurable: true });
try {
expect(captureProcessStartToken(process.pid)).toBeNull();
} finally {
Object.defineProperty(process, 'platform', { value: originalPlatform, configurable: true });
}
});
});
describe('writePidFile (start-token capture)', () => {
const supported = process.platform === 'linux' || process.platform === 'darwin';
it.if(supported)('auto-captures a startToken when writing for the current process', () => {
writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() });
const persisted = readPidFile();
expect(persisted).not.toBeNull();
expect(typeof persisted!.startToken).toBe('string');
expect((persisted!.startToken ?? '').length).toBeGreaterThan(0);
});
it('preserves a caller-supplied startToken verbatim', () => {
const provided = 'caller-supplied-token-xyz';
writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString(), startToken: provided });
const persisted = readPidFile();
expect(persisted!.startToken).toBe(provided);
});
it('omits startToken when the target PID has no readable token (dead PID)', () => {
// pid is dead, so captureProcessStartToken() returns null and writePidFile
// should not persist a startToken field.
writePidFile({ pid: 2147483647, port: 37777, startedAt: new Date().toISOString() });
const persisted = readPidFile();
expect(persisted).not.toBeNull();
expect(persisted!.startToken).toBeUndefined();
});
});
describe('verifyPidFileOwnership', () => {
const supported = process.platform === 'linux' || process.platform === 'darwin';
it('returns false for null input', () => {
expect(verifyPidFileOwnership(null)).toBe(false);
});
it('returns false when the PID is not alive', () => {
expect(verifyPidFileOwnership({
pid: 2147483647,
port: 37777,
startedAt: new Date().toISOString(),
startToken: 'anything'
})).toBe(false);
});
it('returns true when no startToken is stored (back-compat with older PID files)', () => {
expect(verifyPidFileOwnership({
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString()
// intentionally no startToken
})).toBe(true);
});
it.if(supported)('returns true when the stored token matches the current PID', () => {
const token = captureProcessStartToken(process.pid);
expect(token).not.toBeNull();
expect(verifyPidFileOwnership({
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString(),
startToken: token!
})).toBe(true);
});
it.if(supported)('returns false when the stored token does not match (PID reused)', () => {
// Simulates the container-restart bug: PID is alive (we pass our own),
// but the stored token was written by a prior incarnation.
expect(verifyPidFileOwnership({
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString(),
startToken: 'token-from-a-different-incarnation'
})).toBe(false);
});
});
describe('cleanStalePidFile', () => {
it('should remove PID file when process is dead', () => {
// Write a PID file with a non-existent PID
+23
View File
@@ -68,6 +68,29 @@ describe('validateWorkerPidFile', () => {
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('alive');
});
// Regression: container restart (docker stop / docker start) reused low PIDs
// across boots. The PID file on a bind-mounted volume pointed at PID 11;
// the new worker also came up as PID 11. kill(0) returned "alive" and the
// worker refused to boot, thinking its own prior incarnation was still up.
// With the start-token identity check, a stored token that doesn't match
// the current PID's token should resolve as "stale" and the PID file should
// be cleared so the new worker can proceed.
const tokenSupported = process.platform === 'linux' || process.platform === 'darwin';
it.if(tokenSupported)('returns "stale" when startToken does not match the live PID (PID reused)', () => {
const tempDir = makeTempDir();
tempDirs.push(tempDir);
const pidFilePath = path.join(tempDir, 'worker.pid');
writeFileSync(pidFilePath, JSON.stringify({
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString(),
startToken: 'token-from-a-different-incarnation'
}));
const status = validateWorkerPidFile({ logAlive: false, pidFilePath });
expect(status).toBe('stale');
});
});
describe('Supervisor assertCanSpawn behavior', () => {