Files
claude-mem/src/services/infrastructure/HealthMonitor.ts
T
Alex Newman 7966c6cba9 fix: rename save_memory and fix MCP search instructions + startup hook (#1210)
* fix: rename save_memory to save_observation and fix MCP search instructions

Stop the primary agent from proactively saving memories by renaming
save_memory to save_observation with a neutral description. Remove
"Saving Memories" section from SKILL.md. Update context formatters
and output styles to reference the mem-search skill instead of raw
MCP tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: split SessionStart hooks so smart-install failure doesn't block worker start

smart-install.js and worker-start were in the same hook group, so if
smart-install exited non-zero the worker never started. Split into
separate hook groups so they run independently.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: worker startup waits for readiness before hooks fire

Move initializationCompleteFlag to set after DB/search init (not MCP),
add waitForReadiness() polling /api/readiness, and extract shared
pollEndpointUntilOk helper to DRY up health/readiness checks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 03:30:31 -05:00

165 lines
5.8 KiB
TypeScript

/**
* HealthMonitor - Port monitoring, health checks, and version checking
*
* Extracted from worker-service.ts monolith to provide centralized health monitoring.
* Handles:
* - Port availability checking
* - Worker health/readiness polling
* - Version mismatch detection (critical for plugin updates)
* - HTTP-based shutdown requests
*/
import path from 'path';
import { readFileSync } from 'fs';
import { logger } from '../../utils/logger.js';
import { MARKETPLACE_ROOT } from '../../shared/paths.js';
/**
 * Probe the worker health endpoint on a local port.
 *
 * Issues a GET to `http://127.0.0.1:<port>/api/health` and reports whether the
 * response status was OK. Any fetch rejection (nothing listening, connection
 * reset, ...) is reported as "not in use". A listener that answers with a
 * non-OK status is therefore also reported as `false`.
 *
 * @param port Local TCP port to probe
 * @returns true when the health endpoint answered with an OK status
 */
export async function isPortInUse(port: number): Promise<boolean> {
  try {
    // Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
    const healthUrl = `http://127.0.0.1:${port}/api/health`;
    const { ok } = await fetch(healthUrl);
    return ok;
  } catch {
    // [ANTI-PATTERN IGNORED]: Health check polls every 500ms, logging would flood
    return false;
  }
}
/**
 * Repeatedly GET a localhost endpoint until it answers with an OK status or
 * the time budget is spent. Backs both the liveness and readiness waits.
 *
 * Each failed attempt (rejected fetch or non-OK status) is followed by a
 * 500ms pause. Rejected fetches are logged at debug level with
 * `retryLogMessage`; non-OK responses are retried without logging.
 *
 * @param port Local TCP port the endpoint is served on
 * @param endpointPath Path appended to `http://127.0.0.1:<port>`
 * @param timeoutMs Time budget in milliseconds, measured from the first check
 * @param retryLogMessage Debug message logged when an attempt throws
 * @returns true as soon as an attempt returns OK, false once the budget runs out
 */
async function pollEndpointUntilOk(
  port: number,
  endpointPath: string,
  timeoutMs: number,
  retryLogMessage: string
): Promise<boolean> {
  const target = `http://127.0.0.1:${port}${endpointPath}`;
  const startedAt = Date.now();
  const hasTimeLeft = (): boolean => Date.now() - startedAt < timeoutMs;
  const pause = (): Promise<void> => new Promise<void>(resolve => setTimeout(resolve, 500));

  while (hasTimeLeft()) {
    let reachable = false;
    try {
      // Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
      const { ok } = await fetch(target);
      reachable = ok;
    } catch (error) {
      // [ANTI-PATTERN IGNORED]: Retry loop - expected failures during startup, will retry
      logger.debug('SYSTEM', retryLogMessage, { port }, error as Error);
    }
    if (reachable) {
      return true;
    }
    await pause();
  }

  return false;
}
/**
 * Liveness wait: resolves once the worker's HTTP server answers.
 * Polls /api/health, which returns 200 as soon as the HTTP server is listening.
 * Callers that need DB + search to be initialized should use waitForReadiness().
 *
 * @param port Worker port
 * @param timeoutMs Maximum time to keep polling, in milliseconds (default 30000)
 * @returns true if the endpoint answered OK within the budget, false otherwise
 */
export function waitForHealth(port: number, timeoutMs: number = 30000): Promise<boolean> {
  const endpointPath = '/api/health';
  const retryLogMessage = 'Service not ready yet, will retry';
  return pollEndpointUntilOk(port, endpointPath, timeoutMs, retryLogMessage);
}
/**
 * Readiness wait: resolves once the worker reports full initialization (DB + search).
 * Polls /api/readiness, which returns 200 only after core initialization completes.
 * Because initializationCompleteFlag is set after DB/search init (not MCP),
 * this typically completes in a few seconds.
 *
 * @param port Worker port
 * @param timeoutMs Maximum time to keep polling, in milliseconds (default 30000)
 * @returns true if the endpoint answered OK within the budget, false otherwise
 */
export function waitForReadiness(port: number, timeoutMs: number = 30000): Promise<boolean> {
  const endpointPath = '/api/readiness';
  const retryLogMessage = 'Worker not ready yet, will retry';
  return pollEndpointUntilOk(port, endpointPath, timeoutMs, retryLogMessage);
}
/**
 * Block until the port stops answering health checks, or the budget runs out.
 * Used after shutdown to confirm the port is available for restart.
 *
 * The port is probed with isPortInUse(); while it still answers, the probe is
 * repeated after a 500ms pause.
 *
 * @param port Port to watch
 * @param timeoutMs Maximum time to wait, in milliseconds (default 10000)
 * @returns true once a probe reports the port as free, false on timeout
 */
export async function waitForPortFree(port: number, timeoutMs: number = 10000): Promise<boolean> {
  const startedAt = Date.now();
  const budgetRemains = (): boolean => Date.now() - startedAt < timeoutMs;

  while (budgetRemains()) {
    const occupied = await isPortInUse(port);
    if (!occupied) {
      return true;
    }
    await new Promise<void>(resolve => setTimeout(resolve, 500));
  }

  return false;
}
/**
 * Decide whether a failed fetch() was caused by a refused TCP connection.
 *
 * Runtimes surface this differently, so the error and its `cause` chain are
 * inspected for a refused-connection `code` or an `ECONNREFUSED` marker in the
 * message. Node's fetch, for example, rejects with `TypeError: fetch failed`
 * and keeps the underlying system error (with `code`) on `cause`.
 */
function isConnectionRefused(error: unknown): boolean {
  let current: unknown = error;
  // Bounded walk so a self-referencing `cause` cannot loop forever.
  for (let depth = 0; depth < 5; depth++) {
    if (typeof current !== 'object' || current === null) return false;
    const { code, message, cause } = current as {
      code?: unknown;
      message?: unknown;
      cause?: unknown;
    };
    // NOTE(review): 'ConnectionRefused' is believed to be the code Bun's fetch uses - confirm against the Bun version in use.
    if (code === 'ECONNREFUSED' || code === 'ConnectionRefused') return true;
    if (typeof message === 'string' && message.includes('ECONNREFUSED')) return true;
    current = cause;
  }
  return false;
}

/**
 * Send HTTP shutdown request to a running worker.
 *
 * POSTs to `/api/admin/shutdown` on 127.0.0.1. A non-OK response is logged as
 * a warning. A refused connection is treated as "worker already stopped" and
 * logged at debug level; any other failure is logged as an error. This
 * function never throws.
 *
 * @param port Worker port
 * @returns true if shutdown request was acknowledged, false otherwise
 */
export async function httpShutdown(port: number): Promise<boolean> {
  try {
    // Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
    const response = await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, {
      method: 'POST'
    });
    if (!response.ok) {
      logger.warn('SYSTEM', 'Shutdown request returned error', { port, status: response.status });
      return false;
    }
    return true;
  } catch (error) {
    // Connection refused is expected if worker already stopped
    if (isConnectionRefused(error)) {
      logger.debug('SYSTEM', 'Worker already stopped', { port }, error as Error);
      return false;
    }
    // Unexpected error - log full details
    logger.error('SYSTEM', 'Shutdown request failed unexpectedly', { port }, error as Error);
    return false;
  }
}
/**
 * Read the `version` field of the package.json under MARKETPLACE_ROOT.
 * This is the "expected" version that should be running.
 *
 * The file is read synchronously and parsed as JSON; read or parse failures
 * propagate to the caller as exceptions.
 *
 * @returns the `version` value from the installed package.json
 */
export function getInstalledPluginVersion(): string {
  const manifestText = readFileSync(path.join(MARKETPLACE_ROOT, 'package.json'), 'utf-8');
  const { version } = JSON.parse(manifestText);
  return version;
}
/**
 * Get the running worker's version via API.
 * This is the "actual" version currently running.
 *
 * GETs `/api/version` on 127.0.0.1 and returns the `version` field of the JSON
 * body. The payload is validated: anything other than an object carrying a
 * string `version` yields null, so the declared `string | null` contract holds
 * even if the endpoint answers with an unexpected shape.
 *
 * @param port Worker port
 * @returns the reported version, or null when the worker is unreachable,
 *          answers with a non-OK status, or returns an unusable payload
 */
export async function getRunningWorkerVersion(port: number): Promise<string | null> {
  try {
    const response = await fetch(`http://127.0.0.1:${port}/api/version`);
    if (!response.ok) return null;
    // Treat the body as untrusted until its shape has been checked.
    const data: unknown = await response.json();
    if (typeof data !== 'object' || data === null) return null;
    const { version } = data as { version?: unknown };
    return typeof version === 'string' ? version : null;
  } catch (error) {
    // Expected: worker not running or version endpoint unavailable
    logger.debug('SYSTEM', 'Could not fetch worker version', { port }, error as Error);
    return null;
  }
}
/**
 * Result of comparing the installed plugin version with the version reported
 * by the running worker, as returned by checkVersionMatch().
 */
export interface VersionCheckResult {
/** True when both versions are equal, or when the worker version could not be determined. */
matches: boolean;
/** Version read from the installed package.json (getInstalledPluginVersion). */
pluginVersion: string;
/** Version reported by the worker's API (getRunningWorkerVersion); null when it could not be fetched. */
workerVersion: string | null;
}
/**
 * Compare the installed plugin version with the running worker's version.
 * Critical for detecting when the plugin is updated but the worker is still
 * running old code.
 *
 * When the worker version cannot be determined, the result reports a match
 * (graceful degradation) and carries the worker version as received.
 *
 * @param port Worker port
 * @returns both versions plus a `matches` flag
 */
export async function checkVersionMatch(port: number): Promise<VersionCheckResult> {
  const pluginVersion = getInstalledPluginVersion();
  const workerVersion = await getRunningWorkerVersion(port);

  // An unknown worker version counts as a match (graceful degradation)
  const matches = !workerVersion || pluginVersion === workerVersion;

  return { matches, pluginVersion, workerVersion };
}