Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/lib/agent/agent-interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import path from 'path';
import * as fs from 'fs';
import { createRequire } from 'node:module';
import { getUI, type SpinnerHandle } from '@ui';
import { debug, logToFile, initLogFile, getLogFilePath } from '@utils/debug';
import type { WizardRunOptions } from '@utils/types';
Expand Down Expand Up @@ -54,8 +55,13 @@ async function getSDKModule(): Promise<any> {
* This ensures we use the SDK's bundled version rather than the user's installed Claude Code.
*/
function getClaudeCodeExecutablePath(): string {
  // Bare `require` is undefined in ESM (tsx dev runs), so fall back to a
  // require function built with createRequire. It is anchored at the entry
  // script (process.argv[1]) or, when that is absent, at the working directory.
  const resolver =
    typeof require !== 'undefined'
      ? require
      : createRequire(process.argv[1] ?? `${process.cwd()}/`);

  // resolve() yields the package's main entry file; cli.js is taken from the
  // same directory. A missing package surfaces as the resolver's own error.
  const sdkPackagePath = resolver.resolve('@anthropic-ai/claude-agent-sdk');
  return path.join(path.dirname(sdkPackagePath), 'cli.js');
}

Expand Down Expand Up @@ -795,6 +801,7 @@ export async function runAgent(
abortCases = [],
} = config ?? {};

logToFile('Starting agent run');
const { query } = await getSDKModule();

spinner.start(spinnerMessage);
Expand Down
11 changes: 11 additions & 0 deletions src/lib/agent/agent-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,14 @@ export async function runProgram(
const skillsBaseUrl = getSkillsBaseUrl(session.localMcp);

// 2. Health check (guarded — skip if TUI already ran it)
if (session.readinessResult) {
logToFile(
`[agent-runner] readiness pre-computed by TUI: decision=${session.readinessResult.decision}` +
`${
session.outageDismissed ? ' (outage dismissed by user)' : ''
} — skipping re-check`,
);
}
if (!session.readinessResult) {
logToFile('[agent-runner] evaluating wizard readiness');
const readinessConfig = session.signup
Expand Down Expand Up @@ -360,6 +368,8 @@ export async function runProgram(
sessionToOptions(session),
);

logToFile('[agent-runner] agent initialized');

const middleware = session.benchmark
? createBenchmarkPipeline(spinner, sessionToOptions(session))
: undefined;
Expand All @@ -371,6 +381,7 @@ export async function runProgram(
host,
skillPath,
});
logToFile(`[agent-runner] prompt assembled (${prompt.length} chars)`);

// 8. Run agent
const agentResult = await executeAgent(
Expand Down
71 changes: 70 additions & 1 deletion src/lib/health-checks/__tests__/health-checks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import {
checkCloudflareComponentHealth,
checkCloudflareOverallHealth,
checkGithubHealth,
checkGithubReleasesHealth,
checkLlmGatewayHealth,
checkMcpHealth,
checkNpmComponentHealth,
Expand Down Expand Up @@ -696,6 +697,18 @@ describe('health-checks', () => {
);
});

it('returns down on 302 — the gateway probe stays strict, redirects are not OK here', async () => {
  // Arrange: the gateway liveness URL answers with a bare 302.
  const respondWithRedirect = () =>
    Promise.resolve(new Response(null, { status: 302 }));
  const fetchMock = global.fetch as jest.Mock;
  fetchMock.mockImplementation(
    overrideFetch({ [URLS.llmGatewayLiveness]: respondWithRedirect }),
  );

  // Act
  const health = await checkLlmGatewayHealth();

  // Assert: a redirect is reported as down, with the status code as the error.
  expect(health.status).toBe(ServiceHealthStatus.Down);
  expect(health.error).toBe('HTTP 302');
});

it('returns down when gateway responds 503 (e.g. deploying)', async () => {
(global.fetch as jest.Mock).mockImplementation(
overrideFetch({
Expand Down Expand Up @@ -746,7 +759,7 @@ describe('health-checks', () => {
);
const result = await checkLlmGatewayHealth();
expect(result.status).toBe(ServiceHealthStatus.Down);
expect(result.error).toBe('Request timed out');
expect(result.error).toBe('Request timed out after 5000ms');
});
});

Expand All @@ -765,6 +778,34 @@ describe('health-checks', () => {
);
});

it('returns healthy when worker responds 302 (redirect to docs, not followed)', async () => {
  // Arrange: the MCP landing URL answers with a bare 302.
  const respondWithRedirect = () =>
    Promise.resolve(new Response(null, { status: 302 }));
  const fetchMock = global.fetch as jest.Mock;
  fetchMock.mockImplementation(
    overrideFetch({ [URLS.mcpLanding]: respondWithRedirect }),
  );

  // Act
  const health = await checkMcpHealth();

  // Assert: the redirect itself counts as "up" and is surfaced verbatim.
  expect(health.status).toBe(ServiceHealthStatus.Healthy);
  expect(health.rawIndicator).toBe('HTTP 302');

  // The probe must ask fetch not to follow the redirect.
  const manualRedirectInit = expect.objectContaining({ redirect: 'manual' });
  expect(global.fetch).toHaveBeenCalledWith(URLS.mcpLanding, manualRedirectInit);
});

it('returns down on 400 — only 2xx-3xx counts as up', async () => {
  // Arrange: the MCP landing URL answers with a client error.
  const respondWithBadRequest = () =>
    Promise.resolve(new Response('Bad Request', { status: 400 }));
  const fetchMock = global.fetch as jest.Mock;
  fetchMock.mockImplementation(
    overrideFetch({ [URLS.mcpLanding]: respondWithBadRequest }),
  );

  // Act
  const health = await checkMcpHealth();

  // Assert: 4xx falls outside the accepted range.
  expect(health.status).toBe(ServiceHealthStatus.Down);
  expect(health.error).toBe('HTTP 400');
});

it('returns down when worker responds 500', async () => {
(global.fetch as jest.Mock).mockImplementation(
overrideFetch({
Expand Down Expand Up @@ -803,6 +844,34 @@ describe('health-checks', () => {
});
});

// -----------------------------------------------------------------------
// GitHub Releases (fetchEndpointHealth – skill-menu.json)
// -----------------------------------------------------------------------

describe('checkGithubReleasesHealth', () => {
  it('returns healthy on a final 200 and follows redirects (GitHub 302s asset URLs even for missing assets)', async () => {
    // No fetch override here: the suite's existing fetch mock is used as-is.
    const health = await checkGithubReleasesHealth();

    expect(health.status).toBe(ServiceHealthStatus.Healthy);
    expect(health.rawIndicator).toBe('HTTP 200');

    // The probe must request redirect-following from fetch.
    const followRedirectInit = expect.objectContaining({ redirect: 'follow' });
    expect(global.fetch).toHaveBeenCalledWith(
      URLS.githubReleasesSkillMenu,
      followRedirectInit,
    );
  });

  it('returns down on 404 (release published without the asset)', async () => {
    // Arrange: the skill-menu asset URL answers 404.
    const respondWithNotFound = () =>
      Promise.resolve(new Response('Not Found', { status: 404 }));
    const fetchMock = global.fetch as jest.Mock;
    fetchMock.mockImplementation(
      overrideFetch({ [URLS.githubReleasesSkillMenu]: respondWithNotFound }),
    );

    // Act
    const health = await checkGithubReleasesHealth();

    // Assert
    expect(health.status).toBe(ServiceHealthStatus.Down);
    expect(health.error).toBe('HTTP 404');
  });
});

// -----------------------------------------------------------------------
// checkAllExternalServices
// -----------------------------------------------------------------------
Expand Down
60 changes: 40 additions & 20 deletions src/lib/health-checks/endpoints.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { REMOTE_SKILLS_BASE_URL } from '@lib/constants';
import { logToFile } from '@utils/debug';
import { ServiceHealthStatus, type BaseHealthResult } from './types';

// ---------------------------------------------------------------------------
Expand All @@ -13,7 +14,7 @@ import { ServiceHealthStatus, type BaseHealthResult } from './types';
//
// MCP – Cloudflare Worker
// Source: posthog/services/mcp/src/index.ts
// GET / → 200 (HTML landing page)
// GET / → 302 to posthog.com docs. The redirect proves the worker is up.
// ---------------------------------------------------------------------------

function downResult(error: string): BaseHealthResult {
Expand All @@ -23,33 +24,52 @@ function downResult(error: string): BaseHealthResult {
/**
 * Probe `url` with a single GET and classify the outcome.
 *
 * Never rejects: HTTP statuses rejected by `isExpectedStatus`, timeouts, and
 * any other thrown error are all reported as a "down" result. The outcome is
 * also written to the log file.
 *
 * @param url - Endpoint to probe.
 * @param timeoutMs - Abort the request after this many milliseconds.
 * @param isExpectedStatus - Predicate deciding which HTTP statuses count as
 *   healthy. Defaults to exactly 200.
 * @param redirect - Passed through to `fetch` as its `redirect` option.
 * @returns Healthy with `rawIndicator` set to `HTTP <status>`, or a down
 *   result whose `error` is `HTTP <status>`, a timeout message, or the
 *   thrown error's message.
 */
async function fetchEndpointHealth(
  url: string,
  timeoutMs = 5000,
  isExpectedStatus: (status: number) => boolean = (s) => s === 200,
  redirect: 'follow' | 'manual' | 'error' = 'follow',
): Promise<BaseHealthResult> {
  let tid: ReturnType<typeof setTimeout> | undefined;
  let result: BaseHealthResult;

  try {
    const controller = new AbortController();
    tid = setTimeout(() => controller.abort(), timeoutMs);
    const res = await fetch(url, {
      signal: controller.signal,
      redirect,
    });

    if (isExpectedStatus(res.status)) {
      result = {
        status: ServiceHealthStatus.Healthy,
        rawIndicator: `HTTP ${res.status}`,
      };
    } else {
      result = downResult(`HTTP ${res.status}`);
    }
  } catch (e) {
    // The only abort source is the timer above, so AbortError means timeout.
    if (e instanceof Error && e.name === 'AbortError') {
      result = downResult(`Request timed out after ${timeoutMs}ms`);
    } else {
      result = downResult(e instanceof Error ? e.message : 'Unknown error');
    }
  } finally {
    // Clear on every path. Otherwise a fetch that rejects early (DNS failure,
    // connection refused, ...) leaves the abort timer pending for timeoutMs.
    clearTimeout(tid);
  }

  logToFile(
    `[health-checks] GET ${url} -> ${result.status}` +
      `${result.rawIndicator ? ` (${result.rawIndicator})` : ''}` +
      `${result.error ? ` (${result.error})` : ''}`,
  );
  return result;
}

/**
 * Probe the LLM gateway's `/_liveness` endpoint.
 *
 * Relies on `fetchEndpointHealth`'s defaults, so only an HTTP 200 is reported
 * as healthy.
 */
export const checkLlmGatewayHealth = (): Promise<BaseHealthResult> => {
  return fetchEndpointHealth('https://gateway.us.posthog.com/_liveness');
};

/**
 * Probe the MCP worker's root URL.
 *
 * The request is issued with `redirect: 'manual'` and any 2xx-3xx status is
 * accepted, so a redirect response is itself treated as proof the worker is
 * up. The redirect target is not fetched.
 */
export const checkMcpHealth = (): Promise<BaseHealthResult> =>
  fetchEndpointHealth(
    'https://mcp.posthog.com/',
    5000,
    // 2xx-3xx counts as up (redirect to docs)
    (s) => s >= 200 && s < 400,
    'manual',
  );

/**
 * Probe the `skill-menu.json` asset under `REMOTE_SKILLS_BASE_URL`.
 *
 * Relies on `fetchEndpointHealth`'s defaults, so the final response must be
 * an HTTP 200 to be reported as healthy.
 */
export const checkGithubReleasesHealth = (): Promise<BaseHealthResult> => {
  const skillMenuUrl = `${REMOTE_SKILLS_BASE_URL}/skill-menu.json`;
  return fetchEndpointHealth(skillMenuUrl);
};
6 changes: 5 additions & 1 deletion src/lib/health-checks/readiness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,11 @@ export async function evaluateWizardReadiness(

const blockingKeys = getBlockingServiceKeys(health, config);
if (blockingKeys.length > 0) {
logToFile(`[health-checks] blocked by: ${blockingKeys.join(', ')}`);
const blockingDetails = blockingKeys.map((key) => {
const h = health[key];
return `${key} (${h.status}${h.error ? ` — ${h.error}` : ''})`;
});
logToFile(`[health-checks] blocked by: ${blockingDetails.join(', ')}`);
return { decision: WizardReadiness.No, health, reasons };
}

Expand Down
7 changes: 6 additions & 1 deletion src/lib/programs/shared/health-check-step.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
SIGNUP_WIZARD_READINESS_CONFIG,
getBlockingServiceKeys,
} from '@lib/health-checks/readiness';
import { logToFile } from '@utils/debug';

export function healthCheckReady(session: WizardSession): boolean {
if (!session.readinessResult) return false;
Expand Down Expand Up @@ -49,9 +50,13 @@ export const HEALTH_CHECK_STEP: ProgramStep = {
onInit: (ctx) => {
evaluateWizardReadiness()
.then((readiness) => {
logToFile(
`[health-checks] TUI pre-flight complete: decision=${readiness.decision}`,
);
ctx.setReadinessResult(readiness);
})
.catch(() => {
.catch((err) => {
logToFile('[health-checks] TUI pre-flight failed:', err);
ctx.setReadinessResult({
decision: WizardReadiness.Yes,
health: {} as never,
Expand Down
36 changes: 22 additions & 14 deletions src/lib/runners/run-wizard.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { VERSION } from '@lib/version';
import { runtimeEnv } from '@env';
import { logToFile, getLogFilePath } from '@utils/debug';
import type { ProgramConfig } from '@lib/programs/program-step';
import type { startTUI as StartTUIFn } from '@ui/tui/start-tui';
import type { TaskStreamPush as TaskStreamPushClass } from '@lib/task-stream/task-stream-push';
Expand Down Expand Up @@ -31,7 +31,6 @@ export function runWizard(
const { PostHogDestination } = await import(
'@lib/task-stream/destinations/posthog'
);
const { logToFile } = await import('@utils/debug');

// eslint-disable-next-line @typescript-eslint/no-explicit-any
tui = startTUI(WIZARD_VERSION, config.id as any);
Expand Down Expand Up @@ -81,17 +80,23 @@ export function runWizard(
// Signal handler: flush the task stream, tear down the TUI, then exit 130.
onSignal = (): void => {
  // Ignore repeat signals, and signals that arrive while another exit path
  // is already draining.
  if (signalled || exitInProgress) return;
  signalled = true;
  logToFile('[run-wizard] signal received, flushing task stream');

  // A run interrupted mid-flight is recorded as an error, not left "running".
  if (activeTui.store.session.runPhase === RunPhase.Running) {
    activeTui.store.setRunPhase(RunPhase.Error);
  }

  // Single shutdown chain: a rejected shutdown is logged, never left
  // unhandled, and the cleanup and exit in finally() run either way.
  void activeStream
    .shutdown(2000)
    .catch((e) =>
      logToFile('[run-wizard] task stream shutdown error on signal:', e),
    )
    .finally(() => {
      try {
        activeTui.unmount();
      } catch {
        // terminal may already be torn down
      }
      process.exit(130);
    });
};
process.on('SIGINT', onSignal);
process.on('SIGTERM', onSignal);
Expand Down Expand Up @@ -148,10 +153,8 @@ export function runWizard(
activeTui.unmount();
process.exit(0);
} catch (err) {
if (runtimeEnv('DEBUG') || runtimeEnv('POSTHOG_WIZARD_DEBUG')) {
// eslint-disable-next-line no-console
console.error('TUI init failed:', err);
}
// File-log first — the cleanup below can throw or exit.
logToFile('[run-wizard] FATAL:', err);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd say we should carefully consider what we write to the log file vs. what we just capture as an error using error tracking. Some of this shouldn't really be seen by an end user. There's nothing they can do with that info, and we can always get more info in error tracking, anyway.

(I think this is one example)

// The task-stream debounce timer keeps the event loop alive, so
// we have to drain it before exiting on the error path.
exitInProgress = true;
Expand All @@ -173,6 +176,11 @@ export function runWizard(
// ignore
}
}
// Print after unmount — anything printed into the alt screen is wiped.
// eslint-disable-next-line no-console
console.error('Wizard run failed:', err);
// eslint-disable-next-line no-console
console.error(`Full logs: ${getLogFilePath()}`);
process.exit(1);
}
})();
Expand Down
3 changes: 3 additions & 0 deletions src/ui/tui/primitives/ScreenErrorBoundary.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { Box, Text } from 'ink';
import { Component, type ReactNode } from 'react';
import type { WizardStore } from '@ui/tui/store';
import { OutroKind, RunPhase } from '@lib/wizard-session';
import { logToFile } from '@utils/debug';

interface Props {
store: WizardStore;
Expand All @@ -29,6 +30,8 @@ export class ScreenErrorBoundary extends Component<Props, State> {
componentDidCatch(error: Error): void {
const { store } = this.props;

// The console.error below is wiped with the alt screen; this survives.
logToFile('[screen-error-boundary]', error);
// eslint-disable-next-line no-console
console.error('[ScreenErrorBoundary]', error.message, error.stack);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicate?

Maybe we should add error tracking to PostHog, too (or instead of the console log)


Expand Down
Loading
Loading