fix: hook resilience and worker lifecycle improvements (#957, #923, #984, #987, #1042)

Reduce timeouts to eliminate 10-30s startup delay when worker is dead
(common on WSL2 after hibernate). Add stale PID detection, graceful
error handling across all handlers, and error classification that
distinguishes worker unavailability from handler bugs.

- HEALTH_CHECK 30s→3s, new POST_SPAWN_WAIT (5s), PORT_IN_USE_WAIT (3s)
- isProcessAlive() with EPERM handling, cleanStalePidFile()
- getPluginVersion() try-catch for shutdown race (#1042)
- isWorkerUnavailableError: transport+5xx+429→exit 0, 4xx→exit 2
- No-op handler for unknown event types (#984)
- Wrap all handler fetch calls in try-catch for graceful degradation
- CLAUDE_MEM_HEALTH_TIMEOUT_MS env var override with validation
This commit is contained in:
Rod Boev
2026-02-10 15:34:35 -05:00
parent 6ac5507e4e
commit 418e38ee46
16 changed files with 791 additions and 348 deletions
+28 -13
View File
@@ -9,6 +9,7 @@ import type { EventHandler, NormalizedHookInput, HookResult } from '../types.js'
import { ensureWorkerRunning, getWorkerPort } from '../../shared/worker-utils.js';
import { getProjectContext } from '../../utils/project-name.js';
import { HOOK_EXIT_CODES } from '../../shared/hook-constants.js';
import { logger } from '../../utils/logger.js';
export const contextHandler: EventHandler = {
async execute(input: NormalizedHookInput): Promise<HookResult> {
@@ -35,20 +36,34 @@ export const contextHandler: EventHandler = {
// Note: Removed AbortSignal.timeout due to Windows Bun cleanup issue (libuv assertion)
// Worker service has its own timeouts, so client-side timeout is redundant
const response = await fetch(url);
try {
const response = await fetch(url);
if (!response.ok) {
throw new Error(`Context generation failed: ${response.status}`);
}
const result = await response.text();
const additionalContext = result.trim();
return {
hookSpecificOutput: {
hookEventName: 'SessionStart',
additionalContext
if (!response.ok) {
// Log but don't throw — context fetch failure should not block session start
logger.warn('HOOK', 'Context generation failed, returning empty', { status: response.status });
return {
hookSpecificOutput: { hookEventName: 'SessionStart', additionalContext: '' },
exitCode: HOOK_EXIT_CODES.SUCCESS
};
}
};
const result = await response.text();
const additionalContext = result.trim();
return {
hookSpecificOutput: {
hookEventName: 'SessionStart',
additionalContext
}
};
} catch (error) {
// Worker unreachable — return empty context gracefully
logger.warn('HOOK', 'Context fetch error, returning empty', { error: error instanceof Error ? error.message : String(error) });
return {
hookSpecificOutput: { hookEventName: 'SessionStart', additionalContext: '' },
exitCode: HOOK_EXIT_CODES.SUCCESS
};
}
}
};
+24 -16
View File
@@ -39,25 +39,33 @@ export const fileEditHandler: EventHandler = {
// Send to worker as an observation with file edit metadata
// The observation handler on the worker will process this appropriately
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: 'write_file',
tool_input: { filePath, edits },
tool_response: { success: true },
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
try {
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: 'write_file',
tool_input: { filePath, edits },
tool_response: { success: true },
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
if (!response.ok) {
throw new Error(`File edit observation storage failed: ${response.status}`);
if (!response.ok) {
// Log but don't throw — file edit observation failure should not block editing
logger.warn('HOOK', 'File edit observation storage failed, skipping', { status: response.status, filePath });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'File edit observation sent successfully', { filePath });
} catch (error) {
// Worker unreachable — skip file edit observation gracefully
logger.warn('HOOK', 'File edit observation fetch error, skipping', { error: error instanceof Error ? error.message : String(error) });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'File edit observation sent successfully', { filePath });
return { continue: true, suppressOutput: true };
}
};
+14 -5
View File
@@ -5,6 +5,7 @@
*/
import type { EventHandler } from '../types.js';
import { HOOK_EXIT_CODES } from '../../shared/hook-constants.js';
import { contextHandler } from './context.js';
import { sessionInitHandler } from './session-init.js';
import { observationHandler } from './observation.js';
@@ -35,14 +36,22 @@ const handlers: Record<EventType, EventHandler> = {
/**
* Get the event handler for a given event type.
*
* Returns a no-op handler for unknown event types instead of throwing (fix #984).
* Claude Code may send new event types that the plugin doesn't handle yet —
* throwing would surface as a BLOCKING_ERROR to the user.
*
* @param eventType The type of event to handle
* @returns The appropriate EventHandler
* @throws Error if event type is not recognized
* @returns The appropriate EventHandler, or a no-op handler for unknown types
*/
export function getEventHandler(eventType: EventType): EventHandler {
const handler = handlers[eventType];
export function getEventHandler(eventType: string): EventHandler {
const handler = handlers[eventType as EventType];
if (!handler) {
throw new Error(`Unknown event type: ${eventType}`);
console.error(`[claude-mem] Unknown event type: ${eventType}, returning no-op`);
return {
async execute() {
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
};
}
return handler;
}
+24 -16
View File
@@ -48,25 +48,33 @@ export const observationHandler: EventHandler = {
}
// Send to worker - worker handles privacy check and database operations
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: toolName,
tool_input: toolInput,
tool_response: toolResponse,
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
try {
const response = await fetch(`http://127.0.0.1:${port}/api/sessions/observations`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
contentSessionId: sessionId,
tool_name: toolName,
tool_input: toolInput,
tool_response: toolResponse,
cwd
})
// Note: Removed signal to avoid Windows Bun cleanup issue (libuv assertion)
});
if (!response.ok) {
throw new Error(`Observation storage failed: ${response.status}`);
if (!response.ok) {
// Log but don't throw — observation storage failure should not block tool use
logger.warn('HOOK', 'Observation storage failed, skipping', { status: response.status, toolName });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'Observation sent successfully', { toolName });
} catch (error) {
// Worker unreachable — skip observation gracefully
logger.warn('HOOK', 'Observation fetch error, skipping', { error: error instanceof Error ? error.message : String(error) });
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.debug('HOOK', 'Observation sent successfully', { toolName });
return { continue: true, suppressOutput: true };
}
};
+5 -1
View File
@@ -16,7 +16,11 @@ import { logger } from '../../utils/logger.js';
export const sessionCompleteHandler: EventHandler = {
async execute(input: NormalizedHookInput): Promise<HookResult> {
// Ensure worker is running
await ensureWorkerRunning();
const workerReady = await ensureWorkerRunning();
if (!workerReady) {
// Worker not available — skip session completion gracefully
return { continue: true, suppressOutput: true };
}
const { sessionId } = input;
const port = getWorkerPort();
+31 -22
View File
@@ -13,37 +13,46 @@ import { HOOK_EXIT_CODES } from '../../shared/hook-constants.js';
export const userMessageHandler: EventHandler = {
async execute(input: NormalizedHookInput): Promise<HookResult> {
// Ensure worker is running
await ensureWorkerRunning();
const workerReady = await ensureWorkerRunning();
if (!workerReady) {
// Worker not available — skip user message gracefully
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
}
const port = getWorkerPort();
const project = basename(input.cwd ?? process.cwd());
// Fetch formatted context directly from worker API
// Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
const response = await fetch(
`http://127.0.0.1:${port}/api/context/inject?project=${encodeURIComponent(project)}&colors=true`,
{ method: 'GET' }
);
try {
const response = await fetch(
`http://127.0.0.1:${port}/api/context/inject?project=${encodeURIComponent(project)}&colors=true`,
{ method: 'GET' }
);
if (!response.ok) {
// Don't throw - context fetch failure should not block the user's prompt
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
if (!response.ok) {
// Don't throw - context fetch failure should not block the user's prompt
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
}
const output = await response.text();
// Write to stderr for user visibility
// Note: Using process.stderr.write instead of console.error to avoid
// Claude Code treating this as a hook error. The actual hook output
// goes to stdout via hook-command.ts JSON serialization.
process.stderr.write(
"\n\n" + String.fromCodePoint(0x1F4DD) + " Claude-Mem Context Loaded\n\n" +
output +
"\n\n" + String.fromCodePoint(0x1F4A1) + " Wrap any message with <private> ... </private> to prevent storing sensitive information.\n" +
"\n" + String.fromCodePoint(0x1F4AC) + " Community https://discord.gg/J4wttp9vDu" +
`\n` + String.fromCodePoint(0x1F4FA) + ` Watch live in browser http://localhost:${port}/\n`
);
} catch (error) {
// Worker unreachable — skip user message gracefully
// User message context error is non-critical — skip gracefully
}
const output = await response.text();
// Write to stderr for user visibility
// Note: Using process.stderr.write instead of console.error to avoid
// Claude Code treating this as a hook error. The actual hook output
// goes to stdout via hook-command.ts JSON serialization.
process.stderr.write(
"\n\n" + String.fromCodePoint(0x1F4DD) + " Claude-Mem Context Loaded\n\n" +
output +
"\n\n" + String.fromCodePoint(0x1F4A1) + " Wrap any message with <private> ... </private> to prevent storing sensitive information.\n" +
"\n" + String.fromCodePoint(0x1F4AC) + " Community https://discord.gg/J4wttp9vDu" +
`\n` + String.fromCodePoint(0x1F4FA) + ` Watch live in browser http://localhost:${port}/\n`
);
return { exitCode: HOOK_EXIT_CODES.SUCCESS };
}
};
+65 -2
View File
@@ -8,6 +8,61 @@ export interface HookCommandOptions {
skipExit?: boolean;
}
/**
 * Classify whether an error indicates the worker is unavailable (graceful degradation)
 * vs a handler/client bug (blocking error that developers need to see).
 *
 * Classification is based purely on the error's message text (or `String(error)`
 * for non-Error values) and is case-insensitive throughout.
 *
 * Returns `true` (caller should exit 0 — graceful degradation) for:
 *   - Transport failures: ECONNREFUSED, ECONNRESET, EPIPE, ETIMEDOUT, ENOTFOUND,
 *     ECONNABORTED, ENETUNREACH, EHOSTUNREACH, "fetch failed", "unable to connect",
 *     "socket hang up"
 *   - Timeout errors: "timed out", "timeout"
 *   - Server errors: HTTP 5xx status codes
 *   - Rate limiting: HTTP 429 (transient, not a client bug)
 *
 * Returns `false` (caller should exit 2 — blocking error) for:
 *   - Other HTTP 4xx status codes (bad request, not found, validation error)
 *   - Programming errors (TypeError, ReferenceError, SyntaxError) whose message
 *     matched none of the patterns above
 *   - All other unexpected errors
 *
 * Status codes are recognised in the forms "failed: NNN" and "status: NNN" /
 * "status NNN", and must be exactly three digits (a trailing word boundary is
 * required, so "failed: 5000 rows" is not mistaken for HTTP 500).
 *
 * @param error The caught value; may be an Error or anything else that was thrown
 * @returns `true` if the failure looks like worker unavailability, `false` otherwise
 */
export function isWorkerUnavailableError(error: unknown): boolean {
  const message = error instanceof Error ? error.message : String(error);
  const lower = message.toLowerCase();

  // Transport failures — worker unreachable.
  // This check must run before any error-class reasoning: network failures from
  // fetch can surface as a TypeError whose message is "fetch failed", and that
  // case has to be treated as unavailability rather than a programming error.
  const transportPatterns = [
    'econnrefused',
    'econnreset',
    'epipe',
    'etimedout',
    'enotfound',
    'econnaborted',
    'enetunreach',
    'ehostunreach',
    'fetch failed',
    'unable to connect',
    'socket hang up',
  ];
  if (transportPatterns.some((pattern) => lower.includes(pattern))) return true;

  // Timeout errors — worker didn't respond in time
  if (lower.includes('timed out') || lower.includes('timeout')) return true;

  // Status-code patterns are matched against the lowercased message so that
  // "Status: 503" and "status: 503" classify the same way; \b rejects longer
  // numbers that merely start with the same three digits.
  const serverError = /(?:failed:\s*|status[:\s]+)5\d{2}\b/;
  const rateLimited = /(?:failed:\s*|status[:\s]+)429\b/;
  const clientError = /(?:failed:\s*|status[:\s]+)4\d{2}\b/;

  // HTTP 5xx server errors — worker has internal problems
  if (serverError.test(lower)) return true;

  // HTTP 429 (rate limit) — treat as transient unavailability, not a bug.
  // Must be tested before the general 4xx rule below.
  if (rateLimited.test(lower)) return true;

  // HTTP 4xx client errors — our bug, NOT worker unavailability
  if (clientError.test(lower)) return false;

  // Everything else — including TypeError / ReferenceError / SyntaxError that
  // matched no pattern above — is treated as blocking (conservative: surface bugs).
  return false;
}
export async function hookCommand(platform: string, event: string, options: HookCommandOptions = {}): Promise<number> {
try {
const adapter = getPlatformAdapter(platform);
@@ -26,9 +81,17 @@ export async function hookCommand(platform: string, event: string, options: Hook
}
return exitCode;
} catch (error) {
if (isWorkerUnavailableError(error)) {
// Worker unavailable — degrade gracefully, don't block the user
console.error(`[claude-mem] Worker unavailable, skipping hook: ${error instanceof Error ? error.message : error}`);
if (!options.skipExit) {
process.exit(HOOK_EXIT_CODES.SUCCESS); // = 0 (graceful)
}
return HOOK_EXIT_CODES.SUCCESS;
}
// Handler/client bug — show as blocking error so developers see it
console.error(`Hook error: ${error}`);
// Use exit code 2 (blocking error) so users see the error message
// Exit code 1 only shows in verbose mode per Claude Code docs
if (!options.skipExit) {
process.exit(HOOK_EXIT_CODES.BLOCKING_ERROR); // = 2
}