From 076a794bc2101784b7fc2944ff988fa7f274c46e Mon Sep 17 00:00:00 2001 From: notgitika Date: Sun, 8 Mar 2026 00:14:10 -0500 Subject: [PATCH 1/9] feat: add evals control plane operations --- .../assets.snapshot.test.ts.snap | 2 + src/assets/cdk/test/cdk.test.ts | 2 + src/cli/commands/create/action.ts | 2 + .../commands/logs/__tests__/action.test.ts | 16 +- src/cli/commands/remove/command.tsx | 2 + src/cli/commands/remove/types.ts | 2 +- src/cli/commands/status/action.ts | 18 +- src/cli/commands/status/command.tsx | 22 +- .../__tests__/checks-extended.test.ts | 20 ++ src/cli/logging/remove-logger.ts | 2 +- .../agent/generate/write-agent-to-project.ts | 2 + .../deploy/__tests__/preflight.test.ts | 2 +- src/cli/operations/deploy/preflight.ts | 5 +- .../operations/dev/__tests__/config.test.ts | 34 +++ src/cli/primitives/EvaluatorPrimitive.ts | 222 ++++++++++++++++ .../primitives/OnlineEvalConfigPrimitive.ts | 219 ++++++++++++++++ src/cli/primitives/index.ts | 4 + src/cli/primitives/registry.ts | 6 + src/cli/tui/components/ResourceGraph.tsx | 2 + src/cli/tui/hooks/useCreateEvaluator.ts | 56 ++++ src/cli/tui/hooks/useCreateOnlineEval.ts | 59 +++++ src/cli/tui/hooks/useRemove.ts | 38 +++ src/cli/tui/screens/add/AddFlow.tsx | 36 +++ src/cli/tui/screens/add/AddScreen.tsx | 2 + src/cli/tui/screens/create/useCreateFlow.ts | 2 + .../screens/evaluator/AddEvaluatorFlow.tsx | 76 ++++++ .../screens/evaluator/AddEvaluatorScreen.tsx | 164 ++++++++++++ src/cli/tui/screens/evaluator/index.ts | 2 + src/cli/tui/screens/evaluator/types.ts | 131 ++++++++++ .../evaluator/useAddEvaluatorWizard.ts | 121 +++++++++ .../screens/online-eval/AddOnlineEvalFlow.tsx | 86 +++++++ .../online-eval/AddOnlineEvalScreen.tsx | 151 +++++++++++ src/cli/tui/screens/online-eval/index.ts | 2 + src/cli/tui/screens/online-eval/types.ts | 41 +++ .../online-eval/useAddOnlineEvalWizard.ts | 86 +++++++ .../screens/remove/RemoveEvaluatorScreen.tsx | 26 ++ src/cli/tui/screens/remove/RemoveFlow.tsx | 243 +++++++++++++++++- .../screens/remove/RemoveOnlineEvalScreen.tsx | 26 ++ src/cli/tui/screens/remove/RemoveScreen.tsx | 22 +- .../remove/__tests__/RemoveScreen.test.tsx | 4 + src/cli/tui/screens/remove/index.ts | 2 + src/cli/tui/screens/remove/useRemoveFlow.ts | 2 + src/schema/schemas/agentcore-project.ts | 136 +++++++--- src/schema/schemas/primitives/evaluator.ts | 74 ++++++ src/schema/schemas/primitives/index.ts | 21 ++ .../schemas/primitives/online-eval-config.ts | 29 +++ 46 files changed, 2175 insertions(+), 47 deletions(-) create mode 100644 src/cli/primitives/EvaluatorPrimitive.ts create mode 100644 src/cli/primitives/OnlineEvalConfigPrimitive.ts create mode 100644 src/cli/tui/hooks/useCreateEvaluator.ts create mode 100644 src/cli/tui/hooks/useCreateOnlineEval.ts create mode 100644 src/cli/tui/screens/evaluator/AddEvaluatorFlow.tsx create mode 100644 src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx create mode 100644 src/cli/tui/screens/evaluator/index.ts create mode 100644 src/cli/tui/screens/evaluator/types.ts create mode 100644 src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts create mode 100644 src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx create mode 100644 src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx create mode 100644 src/cli/tui/screens/online-eval/index.ts create mode 100644 src/cli/tui/screens/online-eval/types.ts create mode 100644 src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts create mode 100644 src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx create mode 100644 src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx create mode 100644 src/schema/schemas/primitives/evaluator.ts create mode 100644 src/schema/schemas/primitives/online-eval-config.ts diff --git a/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap b/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap index 0e2f5950..52c7d853 100644 --- a/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap +++ b/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap @@ -372,6 +372,8 @@ test('AgentCoreStack synthesizes with empty spec', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, }); const template = Template.fromStack(stack); diff --git a/src/assets/cdk/test/cdk.test.ts b/src/assets/cdk/test/cdk.test.ts index 5ff491d1..40021c58 100644 --- a/src/assets/cdk/test/cdk.test.ts +++ b/src/assets/cdk/test/cdk.test.ts @@ -11,6 +11,8 @@ test('AgentCoreStack synthesizes with empty spec', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, }); const template = Template.fromStack(stack); diff --git a/src/cli/commands/create/action.ts b/src/cli/commands/create/action.ts index c99f69dc..eba7385b 100644 --- a/src/cli/commands/create/action.ts +++ b/src/cli/commands/create/action.ts @@ -28,6 +28,8 @@ function createDefaultProjectSpec(projectName: string): AgentCoreProjectSpec { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; } diff --git a/src/cli/commands/logs/__tests__/action.test.ts b/src/cli/commands/logs/__tests__/action.test.ts index 81e1f39f..9f41b66f 100644 --- a/src/cli/commands/logs/__tests__/action.test.ts +++ b/src/cli/commands/logs/__tests__/action.test.ts @@ -55,6 +55,8 @@ describe('resolveAgentContext', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, deployedState: { targets: { @@ -111,6 +113,8 @@ describe('resolveAgentContext', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, }); const result = resolveAgentContext(context, {}); @@ -147,6 +151,8 @@ describe('resolveAgentContext', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, deployedState: { targets: { @@ -187,7 +193,15 @@ describe('resolveAgentContext', () => { it('errors when no agents defined', () => { const context = makeContext({ - project: { name: 'TestProject', version: 1, agents: [], memories: [], credentials: [] }, + project: { + name: 'TestProject', + version: 1, + agents: [], + memories: [], + credentials: [], + evaluators: [], + onlineEvalConfigs: [], + }, }); const result = resolveAgentContext(context, {}); expect(result.success).toBe(false); diff --git a/src/cli/commands/remove/command.tsx b/src/cli/commands/remove/command.tsx index 8ada29c0..e0a45f07 100644 --- a/src/cli/commands/remove/command.tsx +++ b/src/cli/commands/remove/command.tsx @@ -29,6 +29,8 @@ async function handleRemoveAll(_options: RemoveAllOptions): Promise ({ + resourceType: 'evaluator', + name: e.name, + deploymentState: 'local-only' as ResourceDeploymentState, + detail: `${e.level} — LLM-as-a-Judge`, + })); + + const onlineEvalConfigs: ResourceStatusEntry[] = (project.onlineEvalConfigs ?? []).map(c => ({ + resourceType: 'online-eval', + name: c.name, + deploymentState: 'local-only' as ResourceDeploymentState, + detail: `${c.agents.length} agent${c.agents.length !== 1 ? 's' : ''}, ${c.evaluators.length} evaluator${c.evaluators.length !== 1 ? 's' : ''}`, + })); + + return [...agents, ...credentials, ...memories, ...gateways, ...evaluators, ...onlineEvalConfigs]; } export async function handleProjectStatus( diff --git a/src/cli/commands/status/command.tsx b/src/cli/commands/status/command.tsx index 09279fd6..1a80e4af 100644 --- a/src/cli/commands/status/command.tsx +++ b/src/cli/commands/status/command.tsx @@ -7,7 +7,7 @@ import { DEPLOYMENT_STATE_COLORS, DEPLOYMENT_STATE_LABELS } from './constants'; import type { Command } from '@commander-js/extra-typings'; import { Box, Text, render } from 'ink'; -const VALID_RESOURCE_TYPES = ['agent', 'memory', 'credential', 'gateway'] as const; +const VALID_RESOURCE_TYPES = ['agent', 'memory', 'credential', 'gateway', 'evaluator', 'online-eval'] as const; const VALID_STATES = ['deployed', 'local-only', 'pending-removal'] as const; interface StatusCliOptions { @@ -126,6 +126,8 @@ export const registerStatus = (program: Command) => { const credentials = filtered.filter(r => r.resourceType === 'credential'); const memories = filtered.filter(r => r.resourceType === 'memory'); const gateways = filtered.filter(r => r.resourceType === 'gateway'); + const evaluators = filtered.filter(r => r.resourceType === 'evaluator'); + const onlineEvals = filtered.filter(r => r.resourceType === 'online-eval'); render( @@ -170,6 +172,24 @@ export const registerStatus = (program: Command) => { )} + {evaluators.length > 0 && ( + + Evaluators + {evaluators.map(entry => ( + + ))} + + )} + + {onlineEvals.length > 0 && ( + + Online Eval Configs + {onlineEvals.map(entry => ( + + ))} + + )} + {filtered.length === 0 && No resources match the given filters.} ); diff --git a/src/cli/external-requirements/__tests__/checks-extended.test.ts b/src/cli/external-requirements/__tests__/checks-extended.test.ts index 30384086..42e44152 100644 --- a/src/cli/external-requirements/__tests__/checks-extended.test.ts +++ b/src/cli/external-requirements/__tests__/checks-extended.test.ts @@ -48,6 +48,8 @@ describe('requiresUv', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresUv(project)).toBe(true); }); @@ -68,6 +70,8 @@ describe('requiresUv', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresUv(project)).toBe(false); }); @@ -79,6 +83,8 @@ describe('requiresUv', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresUv(project)).toBe(false); }); @@ -101,6 +107,8 @@ describe('requiresContainerRuntime', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresContainerRuntime(project)).toBe(true); }); @@ -121,6 +129,8 @@ describe('requiresContainerRuntime', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresContainerRuntime(project)).toBe(false); }); @@ -132,6 +142,8 @@ describe('requiresContainerRuntime', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresContainerRuntime(project)).toBe(false); }); @@ -160,6 +172,8 @@ describe('requiresContainerRuntime', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(requiresContainerRuntime(project)).toBe(true); }); @@ -222,6 +236,8 @@ describe('checkDependencyVersions', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const result = await checkDependencyVersions(project); @@ -237,6 +253,8 @@ describe('checkDependencyVersions', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const result = await checkDependencyVersions(project); @@ -260,6 +278,8 @@ describe('checkDependencyVersions', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const result = await checkDependencyVersions(project); diff --git a/src/cli/logging/remove-logger.ts b/src/cli/logging/remove-logger.ts index a21201ff..f40ace6c 100644 --- a/src/cli/logging/remove-logger.ts +++ b/src/cli/logging/remove-logger.ts @@ -7,7 +7,7 @@ const REMOVE_LOGS_SUBDIR = 'remove'; export interface RemoveLoggerOptions { /** Type of resource being removed */ - resourceType: 'agent' | 'memory' | 'identity' | 'gateway' | 'gateway-target'; + resourceType: 'agent' | 'memory' | 'identity' | 'gateway' | 'gateway-target' | 'evaluator' | 'online-eval'; /** Name of the resource being removed */ resourceName: string; } diff --git a/src/cli/operations/agent/generate/write-agent-to-project.ts b/src/cli/operations/agent/generate/write-agent-to-project.ts index 85819835..37b001ea 100644 --- a/src/cli/operations/agent/generate/write-agent-to-project.ts +++ b/src/cli/operations/agent/generate/write-agent-to-project.ts @@ -67,6 +67,8 @@ export async function writeAgentToProject(config: GenerateConfig, options?: Writ agents: [agent], memories, credentials, + evaluators: [], + onlineEvalConfigs: [], }; await configIO.writeProjectSpec(project); diff --git a/src/cli/operations/deploy/__tests__/preflight.test.ts b/src/cli/operations/deploy/__tests__/preflight.test.ts index dd148df4..0818acf7 100644 --- a/src/cli/operations/deploy/__tests__/preflight.test.ts +++ b/src/cli/operations/deploy/__tests__/preflight.test.ts @@ -81,7 +81,7 @@ describe('validateProject', () => { mockReadDeployedState.mockRejectedValue(new Error('No deployed state')); await expect(validateProject()).rejects.toThrow( - 'No resources defined in project. Add an agent with "agentcore add agent", a memory with "agentcore add memory", or a gateway with "agentcore add gateway" before deploying.' + 'No resources defined in project. Add at least one resource (agent, memory, evaluator, or gateway) before deploying.' ); }); diff --git a/src/cli/operations/deploy/preflight.ts b/src/cli/operations/deploy/preflight.ts index 9c5025a5..fe522053 100644 --- a/src/cli/operations/deploy/preflight.ts +++ b/src/cli/operations/deploy/preflight.ts @@ -82,6 +82,7 @@ export async function validateProject(): Promise { let isTeardownDeploy = false; const hasAgents = projectSpec.agents && projectSpec.agents.length > 0; const hasMemories = projectSpec.memories && projectSpec.memories.length > 0; + const hasEvaluators = projectSpec.evaluators && projectSpec.evaluators.length > 0; // Check for gateways in mcp.json let hasGateways = false; @@ -92,7 +93,7 @@ export async function validateProject(): Promise { // No mcp.json or invalid — no gateways } - if (!hasAgents && !hasGateways && !hasMemories) { + if (!hasAgents && !hasGateways && !hasMemories && !hasEvaluators) { let hasExistingStack = false; try { const deployedState = await configIO.readDeployedState(); @@ -102,7 +103,7 @@ export async function validateProject(): Promise { } if (!hasExistingStack) { throw new Error( - 'No resources defined in project. Add an agent with "agentcore add agent", a memory with "agentcore add memory", or a gateway with "agentcore add gateway" before deploying.' + 'No resources defined in project. Add at least one resource (agent, memory, evaluator, or gateway) before deploying.' ); } isTeardownDeploy = true; diff --git a/src/cli/operations/dev/__tests__/config.test.ts b/src/cli/operations/dev/__tests__/config.test.ts index c6e04210..3751e6c6 100644 --- a/src/cli/operations/dev/__tests__/config.test.ts +++ b/src/cli/operations/dev/__tests__/config.test.ts @@ -16,6 +16,8 @@ describe('getDevConfig', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project); @@ -38,6 +40,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project); @@ -60,6 +64,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project, '/test/project/agentcore'); @@ -88,6 +94,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(() => getDevConfig(workingDir, project, undefined, 'NonExistentAgent')).toThrow( @@ -111,6 +119,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(() => getDevConfig(workingDir, project, undefined, 'NodeAgent')).toThrow('Dev mode only supports Python'); @@ -132,6 +142,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project, '/test/project/agentcore'); @@ -156,6 +168,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; // No configRoot provided @@ -180,6 +194,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project, '/test/project/agentcore'); @@ -204,6 +220,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project, '/test/project/agentcore'); @@ -228,6 +246,8 @@ describe('getDevConfig', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const config = getDevConfig(workingDir, project, '/test/project/agentcore'); @@ -265,6 +285,8 @@ describe('getAgentPort', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(getAgentPort(project, 'Agent1', 8080)).toBe(8080); @@ -278,6 +300,8 @@ describe('getAgentPort', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(getAgentPort(project, 'NonExistent', 9000)).toBe(9000); @@ -296,6 +320,8 @@ describe('getDevSupportedAgents', () => { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(getDevSupportedAgents(project)).toEqual([]); @@ -317,6 +343,8 @@ describe('getDevSupportedAgents', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; expect(getDevSupportedAgents(project)).toEqual([]); @@ -346,6 +374,8 @@ describe('getDevSupportedAgents', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const supported = getDevSupportedAgents(project); @@ -369,6 +399,8 @@ describe('getDevSupportedAgents', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const supported = getDevSupportedAgents(project); @@ -400,6 +432,8 @@ describe('getDevSupportedAgents', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; const supported = getDevSupportedAgents(project); diff --git a/src/cli/primitives/EvaluatorPrimitive.ts b/src/cli/primitives/EvaluatorPrimitive.ts new file mode 100644 index 00000000..bf0cb7d8 --- /dev/null +++ b/src/cli/primitives/EvaluatorPrimitive.ts @@ -0,0 +1,222 @@ +import { findConfigRoot } from '../../lib'; +import type { EvaluationLevel, Evaluator, EvaluatorConfig } from '../../schema'; +import { EvaluationLevelSchema, EvaluatorSchema } from '../../schema'; +import { getErrorMessage } from '../errors'; +import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types'; +import { BasePrimitive } from './BasePrimitive'; +import type { AddResult, AddScreenComponent, RemovableResource } from './types'; +import type { Command } from '@commander-js/extra-typings'; + +export interface AddEvaluatorOptions { + name: string; + level: EvaluationLevel; + description?: string; + config: EvaluatorConfig; +} + +export type RemovableEvaluator = RemovableResource; + +/** + * EvaluatorPrimitive handles all evaluator add/remove operations. + */ +export class EvaluatorPrimitive extends BasePrimitive { + readonly kind = 'evaluator' as const; + readonly label = 'Evaluator'; + override readonly article = 'an'; + readonly primitiveSchema = EvaluatorSchema; + + async add(options: AddEvaluatorOptions): Promise> { + try { + const evaluator = await this.createEvaluator(options); + return { success: true, evaluatorName: evaluator.name }; + } catch (err) { + return { success: false, error: getErrorMessage(err) }; + } + } + + async remove(evaluatorName: string): Promise { + try { + const project = await this.readProjectSpec(); + + const index = project.evaluators.findIndex(e => e.name === evaluatorName); + if (index === -1) { + return { success: false, error: `Evaluator "${evaluatorName}" not found.` }; + } + + // Warn if referenced by online eval configs + const referencingConfigs = project.onlineEvalConfigs.filter(c => c.evaluators.includes(evaluatorName)); + if (referencingConfigs.length > 0) { + const configNames = referencingConfigs.map(c => c.name).join(', '); + return { + success: false, + error: `Evaluator "${evaluatorName}" is referenced by online eval config(s): ${configNames}. Remove those references first.`, + }; + } + + project.evaluators.splice(index, 1); + await this.writeProjectSpec(project); + + return { success: true }; + } catch (err) { + return { success: false, error: getErrorMessage(err) }; + } + } + + async previewRemove(evaluatorName: string): Promise { + const project = await this.readProjectSpec(); + + const evaluator = project.evaluators.find(e => e.name === evaluatorName); + if (!evaluator) { + throw new Error(`Evaluator "${evaluatorName}" not found.`); + } + + const summary: string[] = [`Removing evaluator: ${evaluatorName}`]; + const schemaChanges: SchemaChange[] = []; + + const referencingConfigs = project.onlineEvalConfigs.filter(c => c.evaluators.includes(evaluatorName)); + if (referencingConfigs.length > 0) { + summary.push( + `Blocked: Referenced by online eval config(s): ${referencingConfigs.map(c => c.name).join(', ')}. Remove those references first.` + ); + } + + const afterSpec = { + ...project, + evaluators: project.evaluators.filter(e => e.name !== evaluatorName), + }; + + schemaChanges.push({ + file: 'agentcore/agentcore.json', + before: project, + after: afterSpec, + }); + + return { summary, directoriesToDelete: [], schemaChanges }; + } + + async getRemovable(): Promise { + try { + const project = await this.readProjectSpec(); + return project.evaluators.map(e => ({ name: e.name })); + } catch { + return []; + } + } + + async getAllNames(): Promise { + try { + const project = await this.readProjectSpec(); + return project.evaluators.map(e => e.name); + } catch { + return []; + } + } + + registerCommands(addCmd: Command, removeCmd: Command): void { + addCmd + .command('eval') + .description('Add a custom evaluator to the project') + .option('--name ', 'Evaluator name [non-interactive]') + .option('--level ', 'Evaluation level: SESSION, TRACE, TOOL_CALL [non-interactive]') + .option('--config ', 'Path to evaluator config JSON file [non-interactive]') + .option('--json', 'Output as JSON [non-interactive]') + .action(async (cliOptions: { name?: string; level?: string; config?: string; json?: boolean }) => { + try { + if (!findConfigRoot()) { + console.error('No agentcore project found. Run `agentcore create` first.'); + process.exit(1); + } + + if (cliOptions.name || cliOptions.json) { + if (!cliOptions.name || !cliOptions.level || !cliOptions.config) { + const error = '--name, --level, and --config are all required in non-interactive mode'; + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error })); + } else { + console.error(error); + } + process.exit(1); + } + + const levelResult = EvaluationLevelSchema.safeParse(cliOptions.level); + if (!levelResult.success) { + const error = `Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`; + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error })); + } else { + console.error(error); + } + process.exit(1); + } + + const { readFileSync } = await import('fs'); + const configJson = JSON.parse(readFileSync(cliOptions.config, 'utf-8')) as EvaluatorConfig; + + const result = await this.add({ + name: cliOptions.name, + level: levelResult.data, + config: configJson, + }); + + if (cliOptions.json) { + console.log(JSON.stringify(result)); + } else if (result.success) { + console.log(`Added evaluator '${result.evaluatorName}'`); + } else { + console.error(result.error); + } + process.exit(result.success ? 0 : 1); + } else { + // TUI fallback + const [{ render }, { default: React }, { AddFlow }] = await Promise.all([ + import('ink'), + import('react'), + import('../tui/screens/add/AddFlow'), + ]); + const { clear, unmount } = render( + React.createElement(AddFlow, { + isInteractive: false, + onExit: () => { + clear(); + unmount(); + process.exit(0); + }, + }) + ); + } + } catch (error) { + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error: getErrorMessage(error) })); + } else { + console.error(getErrorMessage(error)); + } + process.exit(1); + } + }); + + this.registerRemoveSubcommand(removeCmd); + } + + addScreen(): AddScreenComponent { + return null; + } + + private async createEvaluator(options: AddEvaluatorOptions): Promise { + const project = await this.readProjectSpec(); + + this.checkDuplicate(project.evaluators, options.name); + + const evaluator: Evaluator = { + type: 'CustomEvaluator', + name: options.name, + level: options.level, + ...(options.description && { description: options.description }), + config: options.config, + }; + + project.evaluators.push(evaluator); + await this.writeProjectSpec(project); + + return evaluator; + } +} diff --git a/src/cli/primitives/OnlineEvalConfigPrimitive.ts b/src/cli/primitives/OnlineEvalConfigPrimitive.ts new file mode 100644 index 00000000..36e66069 --- /dev/null +++ b/src/cli/primitives/OnlineEvalConfigPrimitive.ts @@ -0,0 +1,219 @@ +import { findConfigRoot } from '../../lib'; +import type { OnlineEvalConfig } from '../../schema'; +import { OnlineEvalConfigSchema } from '../../schema'; +import { getErrorMessage } from '../errors'; +import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types'; +import { BasePrimitive } from './BasePrimitive'; +import type { AddResult, AddScreenComponent, RemovableResource } from './types'; +import type { Command } from '@commander-js/extra-typings'; + +export interface AddOnlineEvalConfigOptions { + name: string; + agents: string[]; + evaluators: string[]; + samplingRate: number; + enableOnCreate?: boolean; +} + +export type RemovableOnlineEvalConfig = RemovableResource; + +/** + * OnlineEvalConfigPrimitive handles all online eval config add/remove operations. + */ +export class OnlineEvalConfigPrimitive extends BasePrimitive { + readonly kind = 'online-eval' as const; + readonly label = 'Online Eval Config'; + override readonly article = 'an'; + readonly primitiveSchema = OnlineEvalConfigSchema; + + async add(options: AddOnlineEvalConfigOptions): Promise> { + try { + const config = await this.createOnlineEvalConfig(options); + return { success: true, configName: config.name }; + } catch (err) { + return { success: false, error: getErrorMessage(err) }; + } + } + + async remove(configName: string): Promise { + try { + const project = await this.readProjectSpec(); + + const index = project.onlineEvalConfigs.findIndex(c => c.name === configName); + if (index === -1) { + return { success: false, error: `Online eval config "${configName}" not found.` }; + } + + project.onlineEvalConfigs.splice(index, 1); + await this.writeProjectSpec(project); + + return { success: true }; + } catch (err) { + return { success: false, error: getErrorMessage(err) }; + } + } + + async previewRemove(configName: string): Promise { + const project = await this.readProjectSpec(); + + const config = project.onlineEvalConfigs.find(c => c.name === configName); + if (!config) { + throw new Error(`Online eval config "${configName}" not found.`); + } + + const summary: string[] = [ + `Removing online eval config: ${configName}`, + `Monitors agents: ${config.agents.join(', ')}`, + `Uses evaluators: ${config.evaluators.join(', ')}`, + ]; + const schemaChanges: SchemaChange[] = []; + + const afterSpec = { + ...project, + onlineEvalConfigs: project.onlineEvalConfigs.filter(c => c.name !== configName), + }; + + schemaChanges.push({ + file: 'agentcore/agentcore.json', + before: project, + after: afterSpec, + }); + + return { summary, directoriesToDelete: [], schemaChanges }; + } + + async getRemovable(): Promise { + try { + const project = await this.readProjectSpec(); + return project.onlineEvalConfigs.map(c => ({ name: c.name })); + } catch { + return []; + } + } + + async getAllNames(): Promise { + try { + const project = await this.readProjectSpec(); + return project.onlineEvalConfigs.map(c => c.name); + } catch { + return []; + } + } + + registerCommands(addCmd: Command, removeCmd: Command): void { + addCmd + .command('online-eval') + .description('Add an online eval config to the project') + .option('--name ', 'Config name [non-interactive]') + .option('-a, --agent ', 'Agent name(s) to monitor [non-interactive]') + .option('-e, --evaluator ', 'Evaluator name(s) or Builtin.* IDs [non-interactive]') + .option('--sampling-rate ', 'Sampling percentage (0.01-100) [non-interactive]') + .option('--json', 'Output as JSON [non-interactive]') + .action( + async (cliOptions: { + name?: string; + agent?: string[]; + evaluator?: string[]; + samplingRate?: string; + json?: boolean; + }) => { + try { + if (!findConfigRoot()) { + console.error('No agentcore project found. Run `agentcore create` first.'); + process.exit(1); + } + + if (cliOptions.name || cliOptions.json) { + if (!cliOptions.name || !cliOptions.agent || !cliOptions.evaluator || !cliOptions.samplingRate) { + const error = + '--name, --agent, --evaluator, and --sampling-rate are all required in non-interactive mode'; + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error })); + } else { + console.error(error); + } + process.exit(1); + } + + const samplingRate = parseFloat(cliOptions.samplingRate); + if (isNaN(samplingRate) || samplingRate < 0.01 || samplingRate > 100) { + const error = `Invalid --sampling-rate "${cliOptions.samplingRate}". Must be a number between 0.01 and 100`; + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error })); + } else { + console.error(error); + } + process.exit(1); + } + + const result = await this.add({ + name: cliOptions.name, + agents: cliOptions.agent, + evaluators: cliOptions.evaluator, + samplingRate, + }); + + if (cliOptions.json) { + console.log(JSON.stringify(result)); + } else if (result.success) { + console.log(`Added online eval config '${result.configName}'`); + } else { + console.error(result.error); + } + process.exit(result.success ? 0 : 1); + } else { + // TUI fallback + const [{ render }, { default: React }, { AddFlow }] = await Promise.all([ + import('ink'), + import('react'), + import('../tui/screens/add/AddFlow'), + ]); + const { clear, unmount } = render( + React.createElement(AddFlow, { + isInteractive: false, + onExit: () => { + clear(); + unmount(); + process.exit(0); + }, + }) + ); + } + } catch (error) { + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error: getErrorMessage(error) })); + } else { + console.error(getErrorMessage(error)); + } + process.exit(1); + } + } + ); + + this.registerRemoveSubcommand(removeCmd); + } + + addScreen(): AddScreenComponent { + return null; + } + + private async createOnlineEvalConfig(options: AddOnlineEvalConfigOptions): Promise { + const project = await this.readProjectSpec(); + + this.checkDuplicate(project.onlineEvalConfigs, options.name, 'Online eval config'); + + const config: OnlineEvalConfig = { + type: 'OnlineEvalConfig', + name: options.name, + agents: options.agents, + evaluators: options.evaluators, + samplingRate: options.samplingRate, + enableOnCreate: options.enableOnCreate ?? true, + }; + + project.onlineEvalConfigs.push(config); + await this.writeProjectSpec(project); + + return config; + } +} diff --git a/src/cli/primitives/index.ts b/src/cli/primitives/index.ts index 0c995da6..2ef948e5 100644 --- a/src/cli/primitives/index.ts +++ b/src/cli/primitives/index.ts @@ -2,6 +2,8 @@ export { BasePrimitive } from './BasePrimitive'; export { MemoryPrimitive } from './MemoryPrimitive'; export { CredentialPrimitive } from './CredentialPrimitive'; export { AgentPrimitive } from './AgentPrimitive'; +export { EvaluatorPrimitive } from './EvaluatorPrimitive'; +export { OnlineEvalConfigPrimitive } from './OnlineEvalConfigPrimitive'; export { GatewayPrimitive } from './GatewayPrimitive'; export { GatewayTargetPrimitive } from './GatewayTargetPrimitive'; export { @@ -9,6 +11,8 @@ export { agentPrimitive, memoryPrimitive, credentialPrimitive, + evaluatorPrimitive, + onlineEvalConfigPrimitive, gatewayPrimitive, gatewayTargetPrimitive, getPrimitive, diff --git a/src/cli/primitives/registry.ts b/src/cli/primitives/registry.ts index 4dd33b4d..290e0d60 100644 --- a/src/cli/primitives/registry.ts +++ b/src/cli/primitives/registry.ts @@ -1,9 +1,11 @@ import { AgentPrimitive } from './AgentPrimitive'; import type { BasePrimitive } from './BasePrimitive'; import { CredentialPrimitive } from './CredentialPrimitive'; +import { EvaluatorPrimitive } from './EvaluatorPrimitive'; import { GatewayPrimitive } from './GatewayPrimitive'; import { GatewayTargetPrimitive } from './GatewayTargetPrimitive'; import { MemoryPrimitive } from './MemoryPrimitive'; +import { OnlineEvalConfigPrimitive } from './OnlineEvalConfigPrimitive'; import type { RemovableResource } from './types'; /** @@ -12,6 +14,8 @@ import type { RemovableResource } from './types'; export const agentPrimitive = new AgentPrimitive(); export const memoryPrimitive = new MemoryPrimitive(); export const credentialPrimitive = new CredentialPrimitive(); +export const evaluatorPrimitive = new EvaluatorPrimitive(); +export const onlineEvalConfigPrimitive = new OnlineEvalConfigPrimitive(); export const gatewayPrimitive = new GatewayPrimitive(); export const gatewayTargetPrimitive = new GatewayTargetPrimitive(); @@ -22,6 +26,8 @@ export const ALL_PRIMITIVES: BasePrimitive[] = [ agentPrimitive, memoryPrimitive, credentialPrimitive, + evaluatorPrimitive, + onlineEvalConfigPrimitive, gatewayPrimitive, gatewayTargetPrimitive, ]; diff --git a/src/cli/tui/components/ResourceGraph.tsx b/src/cli/tui/components/ResourceGraph.tsx index 816a96cc..cbea8692 100644 --- a/src/cli/tui/components/ResourceGraph.tsx +++ b/src/cli/tui/components/ResourceGraph.tsx @@ -16,6 +16,8 @@ const ICONS = { gateway: '◆', tool: '⚙', runtime: '▶', + evaluator: '✦', + 'online-eval': '↻', } as const; interface ResourceGraphProps { diff --git a/src/cli/tui/hooks/useCreateEvaluator.ts b/src/cli/tui/hooks/useCreateEvaluator.ts new file mode 100644 index 00000000..bf3015bd --- /dev/null +++ b/src/cli/tui/hooks/useCreateEvaluator.ts @@ -0,0 +1,56 @@ +import type { EvaluatorConfig } from '../../../schema'; +import { evaluatorPrimitive } from '../../primitives/registry'; +import { useCallback, useEffect, useState } from 'react'; + +interface CreateEvaluatorConfig { + name: string; + level: string; + config: EvaluatorConfig; +} + +export function useCreateEvaluator() { + const [status, setStatus] = useState<{ state: 'idle' | 'loading' | 'success' | 'error'; error?: string }>({ + state: 'idle', + }); + + const create = useCallback(async (config: CreateEvaluatorConfig) => { + setStatus({ state: 'loading' }); + try { + const addResult = await evaluatorPrimitive.add({ + name: config.name, + level: config.level as 'SESSION' | 'TRACE' | 'TOOL_CALL', + config: config.config, + }); + if (!addResult.success) { + throw new Error(addResult.error ?? 'Failed to create evaluator'); + } + setStatus({ state: 'success' }); + return { ok: true as const, evaluatorName: config.name }; + } catch (err) { + const message = err instanceof Error ? err.message : 'Failed to create evaluator.'; + setStatus({ state: 'error', error: message }); + return { ok: false as const, error: message }; + } + }, []); + + const reset = useCallback(() => { + setStatus({ state: 'idle' }); + }, []); + + return { status, createEvaluator: create, reset }; +} + +export function useExistingEvaluatorNames() { + const [names, setNames] = useState([]); + + useEffect(() => { + void evaluatorPrimitive.getAllNames().then(setNames); + }, []); + + const refresh = useCallback(async () => { + const result = await evaluatorPrimitive.getAllNames(); + setNames(result); + }, []); + + return { names, refresh }; +} diff --git a/src/cli/tui/hooks/useCreateOnlineEval.ts b/src/cli/tui/hooks/useCreateOnlineEval.ts new file mode 100644 index 00000000..ab4ed1c4 --- /dev/null +++ b/src/cli/tui/hooks/useCreateOnlineEval.ts @@ -0,0 +1,59 @@ +import { onlineEvalConfigPrimitive } from '../../primitives/registry'; +import { useCallback, useEffect, useState } from 'react'; + +interface CreateOnlineEvalConfig { + name: string; + agents: string[]; + evaluators: string[]; + samplingRate: number; + enableOnCreate?: boolean; +} + +export function useCreateOnlineEval() { + const [status, setStatus] = useState<{ state: 'idle' | 'loading' | 'success' | 'error'; error?: string }>({ + state: 'idle', + }); + + const create = useCallback(async (config: CreateOnlineEvalConfig) => { + setStatus({ state: 'loading' }); + try { + const addResult = await onlineEvalConfigPrimitive.add({ + name: config.name, + agents: config.agents, + evaluators: config.evaluators, + samplingRate: config.samplingRate, + enableOnCreate: config.enableOnCreate, + }); + if (!addResult.success) { + throw new Error(addResult.error ?? 'Failed to create online eval config'); + } + setStatus({ state: 'success' }); + return { ok: true as const, configName: config.name }; + } catch (err) { + const message = err instanceof Error ? err.message : 'Failed to create online eval config.'; + setStatus({ state: 'error', error: message }); + return { ok: false as const, error: message }; + } + }, []); + + const reset = useCallback(() => { + setStatus({ state: 'idle' }); + }, []); + + return { status, createOnlineEval: create, reset }; +} + +export function useExistingOnlineEvalNames() { + const [names, setNames] = useState([]); + + useEffect(() => { + void onlineEvalConfigPrimitive.getAllNames().then(setNames); + }, []); + + const refresh = useCallback(async () => { + const result = await onlineEvalConfigPrimitive.getAllNames(); + setNames(result); + }, []); + + return { names, refresh }; +} diff --git a/src/cli/tui/hooks/useRemove.ts b/src/cli/tui/hooks/useRemove.ts index dd6b5468..31a7519f 100644 --- a/src/cli/tui/hooks/useRemove.ts +++ b/src/cli/tui/hooks/useRemove.ts @@ -6,9 +6,11 @@ import type { RemovableMemory } from '../../primitives/MemoryPrimitive'; import { agentPrimitive, credentialPrimitive, + evaluatorPrimitive, gatewayPrimitive, gatewayTargetPrimitive, memoryPrimitive, + onlineEvalConfigPrimitive, } from '../../primitives/registry'; import { useCallback, useEffect, useRef, useState } from 'react'; @@ -117,6 +119,16 @@ export function useRemovableIdentities() { return { identities, ...rest }; } +export function useRemovableEvaluators() { + const { items: evaluators, ...rest } = useRemovableResources(() => evaluatorPrimitive.getRemovable()); + return { evaluators, ...rest }; +} + +export function useRemovableOnlineEvalConfigs() { + const { items: onlineEvalConfigs, ...rest } = useRemovableResources(() => onlineEvalConfigPrimitive.getRemovable()); + return { onlineEvalConfigs, ...rest }; +} + // ============================================================================ // Preview Hook // ============================================================================ @@ -172,6 +184,14 @@ export function useRemovalPreview() { (name: string) => loadPreview(n => credentialPrimitive.previewRemove(n), name), [loadPreview] ); + const loadEvaluatorPreview = useCallback( + (name: string) => loadPreview(n => evaluatorPrimitive.previewRemove(n), name), + [loadPreview] + ); + const loadOnlineEvalPreview = useCallback( + (name: string) => loadPreview(n => onlineEvalConfigPrimitive.previewRemove(n), name), + [loadPreview] + ); const reset = useCallback(() => { setState({ isLoading: false, preview: null, error: null }); @@ -184,6 +204,8 @@ export function useRemovalPreview() { loadGatewayTargetPreview, loadMemoryPreview, loadIdentityPreview, + loadEvaluatorPreview, + loadOnlineEvalPreview, reset, }; } @@ -238,3 +260,19 @@ export function useRemoveIdentity() { name => name ); } + +export function useRemoveEvaluator() { + return useRemoveResource( + (name: string) => evaluatorPrimitive.remove(name), + 'evaluator', + name => name + ); +} + +export function useRemoveOnlineEvalConfig() { + return useRemoveResource( + (name: string) => onlineEvalConfigPrimitive.remove(name), + 'online-eval', + name => name + ); +} diff --git a/src/cli/tui/screens/add/AddFlow.tsx b/src/cli/tui/screens/add/AddFlow.tsx index 690d25af..35926ad9 100644 --- a/src/cli/tui/screens/add/AddFlow.tsx +++ b/src/cli/tui/screens/add/AddFlow.tsx @@ -6,9 +6,11 @@ import { AddAgentFlow } from '../agent/AddAgentFlow'; import type { AddAgentConfig } from '../agent/types'; import { FRAMEWORK_OPTIONS } from '../agent/types'; import { useAddAgent } from '../agent/useAddAgent'; +import { AddEvaluatorFlow } from '../evaluator'; import { AddIdentityFlow } from '../identity'; import { AddGatewayFlow, AddGatewayTargetFlow } from '../mcp'; import { AddMemoryFlow } from '../memory/AddMemoryFlow'; +import { AddOnlineEvalFlow } from '../online-eval'; import type { AddResourceType } from './AddScreen'; import { AddScreen } from './AddScreen'; import { AddSuccessScreen } from './AddSuccessScreen'; @@ -23,6 +25,8 @@ type FlowState = | { name: 'tool-wizard' } | { name: 'memory-wizard' } | { name: 'identity-wizard' } + | { name: 'evaluator-wizard' } + | { name: 'online-eval-wizard' } | { name: 'agent-create-success'; agentName: string; @@ -172,6 +176,12 @@ export function AddFlow(props: AddFlowProps) { case 'identity': setFlow({ name: 'identity-wizard' }); break; + case 'evaluator': + setFlow({ name: 'evaluator-wizard' }); + break; + case 'online-eval': + setFlow({ name: 'online-eval-wizard' }); + break; } }, []); @@ -366,6 +376,32 @@ export function AddFlow(props: AddFlowProps) { ); } + // Evaluator wizard + if (flow.name === 'evaluator-wizard') { + return ( + setFlow({ name: 'select' })} + onDev={props.onDev} + onDeploy={props.onDeploy} + /> + ); + } + + // Online eval config wizard + if (flow.name === 'online-eval-wizard') { + return ( + setFlow({ name: 'select' })} + onDev={props.onDev} + onDeploy={props.onDeploy} + /> + ); + } + return ( void; + onBack: () => void; + onDev?: () => void; + onDeploy?: () => void; +} + +export function AddEvaluatorFlow({ isInteractive = true, onExit, onBack, onDev, onDeploy }: AddEvaluatorFlowProps) { + const { createEvaluator, reset: resetCreate } = useCreateEvaluator(); + const { names: existingNames } = useExistingEvaluatorNames(); + const [flow, setFlow] = useState({ name: 'create-wizard' }); + + useEffect(() => { + if (!isInteractive && flow.name === 'create-success') { + onExit(); + } + }, [isInteractive, flow.name, onExit]); + + const handleCreateComplete = useCallback( + (config: AddEvaluatorConfig) => { + void createEvaluator(config).then(result => { + if (result.ok) { + setFlow({ name: 'create-success', evaluatorName: result.evaluatorName }); + return; + } + setFlow({ name: 'error', message: result.error }); + }); + }, + [createEvaluator] + ); + + if (flow.name === 'create-wizard') { + return ( + + ); + } + + if (flow.name === 'create-success') { + return ( + + ); + } + + return ( + { + resetCreate(); + setFlow({ name: 'create-wizard' }); + }} + onExit={onExit} + /> + ); +} diff --git a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx new file mode 100644 index 00000000..8969c010 --- /dev/null +++ b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx @@ -0,0 +1,164 @@ +import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema'; +import { EvaluatorNameSchema } from '../../../../schema'; +import type { SelectableItem } from '../../components'; +import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardSelect } from '../../components'; +import { HELP_TEXT } from '../../constants'; +import { useListNavigation } from '../../hooks'; +import { generateUniqueName } from '../../utils'; +import type { AddEvaluatorConfig } from './types'; +import { + DEFAULT_INSTRUCTIONS, + DEFAULT_MODEL, + EVALUATION_LEVEL_OPTIONS, + EVALUATOR_STEP_LABELS, + LEVEL_PLACEHOLDERS, + RATING_SCALE_PRESETS, + validateInstructionPlaceholders, +} from './types'; +import { useAddEvaluatorWizard } from './useAddEvaluatorWizard'; +import React, { useMemo } from 'react'; + +interface AddEvaluatorScreenProps { + onComplete: (config: AddEvaluatorConfig) => void; + onExit: () => void; + existingEvaluatorNames: string[]; +} + +function formatRatingScale(ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']): string { + if ('numerical' in ratingScale && ratingScale.numerical) { + return ratingScale.numerical.map(r => `${r.value}=${r.label}`).join(', '); + } + if ('categorical' in ratingScale && ratingScale.categorical) { + return ratingScale.categorical.map(r => r.label).join(', '); + } + return 'Unknown'; +} + +export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames }: AddEvaluatorScreenProps) { + const wizard = useAddEvaluatorWizard(); + + const levelItems: SelectableItem[] = useMemo( + () => EVALUATION_LEVEL_OPTIONS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })), + [] + ); + + const ratingScaleItems: SelectableItem[] = useMemo( + () => RATING_SCALE_PRESETS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })), + [] + ); + + const isNameStep = wizard.step === 'name'; + const isLevelStep = wizard.step === 'level'; + const isModelStep = wizard.step === 'model'; + const isInstructionsStep = wizard.step === 'instructions'; + const isRatingScaleStep = wizard.step === 'ratingScale'; + const isConfirmStep = wizard.step === 'confirm'; + + const levelNav = useListNavigation({ + items: levelItems, + onSelect: item => wizard.setLevel(item.id as EvaluationLevel), + onExit: () => wizard.goBack(), + isActive: isLevelStep, + }); + + const ratingScaleNav = useListNavigation({ + items: ratingScaleItems, + onSelect: item => { + const preset = RATING_SCALE_PRESETS.find(p => p.id === item.id); + if (preset) wizard.setRatingScale(preset.ratingScale); + }, + onExit: () => wizard.goBack(), + isActive: isRatingScaleStep, + }); + + useListNavigation({ + items: [{ id: 'confirm', title: 'Confirm' }], + onSelect: () => onComplete(wizard.config), + onExit: () => wizard.goBack(), + isActive: isConfirmStep, + }); + + const helpText = + isLevelStep || isRatingScaleStep + ? HELP_TEXT.NAVIGATE_SELECT + : isConfirmStep + ? HELP_TEXT.CONFIRM_CANCEL + : HELP_TEXT.TEXT_INPUT; + + const headerContent = ; + + return ( + + + {isNameStep && ( + !existingEvaluatorNames.includes(value) || 'Evaluator name already exists'} + /> + )} + + {isLevelStep && ( + + )} + + {isModelStep && ( + wizard.goBack()} + /> + )} + + {isInstructionsStep && ( + `{${p}}`).join(', ')})`} + initialValue={DEFAULT_INSTRUCTIONS[wizard.config.level]} + onSubmit={wizard.setInstructions} + onCancel={() => wizard.goBack()} + customValidation={value => validateInstructionPlaceholders(value, wizard.config.level)} + /> + )} + + {isRatingScaleStep && ( + + )} + + {isConfirmStep && ( + 60 + ? wizard.config.config.llmAsAJudge.instructions.slice(0, 60) + '...' + : wizard.config.config.llmAsAJudge.instructions, + }, + { label: 'Rating Scale', value: formatRatingScale(wizard.config.config.llmAsAJudge.ratingScale) }, + ]} + /> + )} + + + ); +} diff --git a/src/cli/tui/screens/evaluator/index.ts b/src/cli/tui/screens/evaluator/index.ts new file mode 100644 index 00000000..1e85211d --- /dev/null +++ b/src/cli/tui/screens/evaluator/index.ts @@ -0,0 +1,2 @@ +export { AddEvaluatorFlow } from './AddEvaluatorFlow'; +export { AddEvaluatorScreen } from './AddEvaluatorScreen'; diff --git a/src/cli/tui/screens/evaluator/types.ts b/src/cli/tui/screens/evaluator/types.ts new file mode 100644 index 00000000..f22a56c4 --- /dev/null +++ b/src/cli/tui/screens/evaluator/types.ts @@ -0,0 +1,131 @@ +import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema'; + +// ───────────────────────────────────────────────────────────────────────────── +// Evaluator Flow Types +// ───────────────────────────────────────────────────────────────────────────── + +export type AddEvaluatorStep = 'name' | 'level' | 'model' | 'instructions' | 'ratingScale' | 'confirm'; + +export interface AddEvaluatorConfig { + name: string; + level: EvaluationLevel; + config: EvaluatorConfig; +} + +export const EVALUATOR_STEP_LABELS: Record = { + name: 'Name', + level: 'Level', + model: 'Model', + instructions: 'Prompt', + ratingScale: 'Scale', + confirm: 'Confirm', +}; + +// ───────────────────────────────────────────────────────────────────────────── +// UI Option Constants +// ───────────────────────────────────────────────────────────────────────────── + +export const EVALUATION_LEVEL_OPTIONS = [ + { id: 'SESSION', title: 'Session', description: 'Evaluate entire conversation sessions' }, + { id: 'TRACE', title: 'Trace', description: 'Evaluate individual agent traces' }, + { id: 'TOOL_CALL', title: 'Tool Call', description: 'Evaluate individual tool calls' }, +] as const; + +export const DEFAULT_MODEL = 'us.anthropic.claude-sonnet-4-5-20250929-v1:0'; + +// ───────────────────────────────────────────────────────────────────────────── +// Placeholder Constants +// ───────────────────────────────────────────────────────────────────────────── + +/** + * Allowed placeholders per evaluation level. The API requires instructions + * to contain at least one placeholder from the evaluator's level. + */ +export const LEVEL_PLACEHOLDERS: Record = { + SESSION: ['available_tools', 'context', 'actual_trajectory', 'expected_trajectory', 'assertions'], + TRACE: ['available_tools', 'context', 'actual_trajectory', 'expected_trajectory', 'assertions'], + TOOL_CALL: ['tool_name', 'tool_input', 'tool_output', 'context'], +}; + +/** + * Default instruction templates per level that include required placeholders. + */ +export const DEFAULT_INSTRUCTIONS: Record = { + SESSION: + 'Evaluate the agent session. Context: {context}. The agent trajectory was: {actual_trajectory}. Rate the overall quality of the response.', + TRACE: + 'Evaluate the agent trace. Context: {context}. The agent trajectory was: {actual_trajectory}. Rate the quality of this trace.', + TOOL_CALL: + 'Evaluate the tool call. Tool: {tool_name}. Input: {tool_input}. Output: {tool_output}. Rate the quality of this tool usage.', +}; + +/** + * Validates that instructions contain at least one placeholder for the given level. + */ +export function validateInstructionPlaceholders(instructions: string, level: EvaluationLevel): string | true { + const placeholders = LEVEL_PLACEHOLDERS[level]; + const hasPlaceholder = placeholders.some(p => instructions.includes(`{${p}}`)); + if (!hasPlaceholder) { + return `Instructions must contain at least one placeholder: ${placeholders.map(p => `{${p}}`).join(', ')}`; + } + return true; +} + +export interface RatingScalePreset { + id: string; + title: string; + description: string; + ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']; +} + +export const RATING_SCALE_PRESETS: RatingScalePreset[] = [ + { + id: '1-5-quality', + title: '1–5 Quality (Numerical)', + description: 'Five-point quality scale from Poor to Excellent', + ratingScale: { + numerical: [ + { value: 1, label: 'Poor', definition: 'Fails to meet expectations' }, + { value: 2, label: 'Fair', definition: 'Partially meets expectations' }, + { value: 3, label: 'Good', definition: 'Meets expectations' }, + { value: 4, label: 'Very Good', definition: 'Exceeds expectations' }, + { value: 5, label: 'Excellent', definition: 'Far exceeds expectations' }, + ], + }, + }, + { + id: '1-3-simple', + title: '1–3 Simple (Numerical)', + description: 'Three-point scale: Low, Medium, High', + ratingScale: { + numerical: [ + { value: 1, label: 'Low', definition: 'Below acceptable quality' }, + { value: 2, label: 'Medium', definition: 'Acceptable quality' }, + { value: 3, label: 'High', definition: 'Above acceptable quality' }, + ], + }, + }, + { + id: 'pass-fail', + title: 'Pass / Fail (Categorical)', + description: 'Binary pass or fail assessment', + ratingScale: { + categorical: [ + { label: 'Pass', definition: 'Meets the evaluation criteria' }, + { label: 'Fail', definition: 'Does not meet the evaluation criteria' }, + ], + }, + }, + { + id: 'good-neutral-bad', + title: 'Good / Neutral / Bad (Categorical)', + description: 'Three-tier categorical assessment', + ratingScale: { + categorical: [ + { label: 'Good', definition: 'Positive outcome, meets or exceeds criteria' }, + { label: 'Neutral', definition: 'Acceptable but unremarkable outcome' }, + { label: 'Bad', definition: 'Negative outcome, fails to meet criteria' }, + ], + }, + }, +]; diff --git a/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts b/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts new file mode 100644 index 00000000..6288eab9 --- /dev/null +++ b/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts @@ -0,0 +1,121 @@ +import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema'; +import type { AddEvaluatorConfig, AddEvaluatorStep } from './types'; +import { DEFAULT_MODEL } from './types'; +import { useCallback, useState } from 'react'; + +const ALL_STEPS: AddEvaluatorStep[] = ['name', 'level', 'model', 'instructions', 'ratingScale', 'confirm']; + +function getDefaultConfig(): AddEvaluatorConfig { + return { + name: '', + level: 'SESSION', + config: { + llmAsAJudge: { + model: DEFAULT_MODEL, + instructions: '', + ratingScale: { + numerical: [ + { value: 1, label: 'Poor', definition: 'Fails to meet expectations' }, + { value: 5, label: 'Excellent', definition: 'Far exceeds expectations' }, + ], + }, + }, + }, + }; +} + +export function useAddEvaluatorWizard() { + const [config, setConfig] = useState(getDefaultConfig); + const [step, setStep] = useState('name'); + + const currentIndex = ALL_STEPS.indexOf(step); + + const goBack = useCallback(() => { + const prevStep = ALL_STEPS[currentIndex - 1]; + if (prevStep) setStep(prevStep); + }, [currentIndex]); + + const nextStep = useCallback((currentStep: AddEvaluatorStep): AddEvaluatorStep | undefined => { + const idx = ALL_STEPS.indexOf(currentStep); + return ALL_STEPS[idx + 1]; + }, []); + + const setName = useCallback( + (name: string) => { + setConfig(c => ({ ...c, name })); + const next = nextStep('name'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setLevel = useCallback( + (level: EvaluationLevel) => { + setConfig(c => ({ ...c, level })); + const next = nextStep('level'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setModel = useCallback( + (model: string) => { + setConfig(c => ({ + ...c, + config: { + llmAsAJudge: { ...c.config.llmAsAJudge, model }, + }, + })); + const next = nextStep('model'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setInstructions = useCallback( + (instructions: string) => { + setConfig(c => ({ + ...c, + config: { + llmAsAJudge: { ...c.config.llmAsAJudge, instructions }, + }, + })); + const next = nextStep('instructions'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setRatingScale = useCallback( + (ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']) => { + setConfig(c => ({ + ...c, + config: { + llmAsAJudge: { ...c.config.llmAsAJudge, ratingScale }, + }, + })); + const next = nextStep('ratingScale'); + if (next) setStep(next); + }, + [nextStep] + ); + + const reset = useCallback(() => { + setConfig(getDefaultConfig()); + setStep('name'); + }, []); + + return { + config, + step, + steps: ALL_STEPS, + currentIndex, + goBack, + setName, + setLevel, + setModel, + setInstructions, + setRatingScale, + reset, + }; +} diff --git a/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx b/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx new file mode 100644 index 00000000..d838f429 --- /dev/null +++ b/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx @@ -0,0 +1,86 @@ +import { ErrorPrompt } from '../../components'; +import { useExistingEvaluatorNames } from '../../hooks/useCreateEvaluator'; +import { useAvailableAgents } from '../../hooks/useCreateMcp'; +import { useCreateOnlineEval, useExistingOnlineEvalNames } from '../../hooks/useCreateOnlineEval'; +import { AddSuccessScreen } from '../add/AddSuccessScreen'; +import { AddOnlineEvalScreen } from './AddOnlineEvalScreen'; +import type { AddOnlineEvalConfig } from './types'; +import React, { useCallback, useEffect, useState } from 'react'; + +type FlowState = + | { name: 'create-wizard' } + | { name: 'create-success'; configName: string } + | { name: 'error'; message: string }; + +interface AddOnlineEvalFlowProps { + isInteractive?: boolean; + onExit: () => void; + onBack: () => void; + onDev?: () => void; + onDeploy?: () => void; +} + +export function AddOnlineEvalFlow({ isInteractive = true, onExit, onBack, onDev, onDeploy }: AddOnlineEvalFlowProps) { + const { createOnlineEval, reset: resetCreate } = useCreateOnlineEval(); + const { names: existingConfigNames } = useExistingOnlineEvalNames(); + const { agents: availableAgents } = useAvailableAgents(); + const { names: availableEvaluators } = useExistingEvaluatorNames(); + const [flow, setFlow] = useState({ name: 'create-wizard' }); + + useEffect(() => { + if (!isInteractive && flow.name === 'create-success') { + onExit(); + } + }, [isInteractive, flow.name, onExit]); + + const handleCreateComplete = useCallback( + (config: AddOnlineEvalConfig) => { + void createOnlineEval(config).then(result => { + if (result.ok) { + setFlow({ name: 'create-success', configName: result.configName }); + return; + } + setFlow({ name: 'error', message: result.error }); + }); + }, + [createOnlineEval] + ); + + if (flow.name === 'create-wizard') { + return ( + + ); + } + + if (flow.name === 'create-success') { + return ( + + ); + } + + return ( + { + resetCreate(); + setFlow({ name: 'create-wizard' }); + }} + onExit={onExit} + /> + ); +} diff --git a/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx b/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx new file mode 100644 index 00000000..7390a874 --- /dev/null +++ b/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx @@ -0,0 +1,151 @@ +import { OnlineEvalConfigNameSchema } from '../../../../schema'; +import type { SelectableItem } from '../../components'; +import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardMultiSelect } from '../../components'; +import { HELP_TEXT } from '../../constants'; +import { useListNavigation, useMultiSelectNavigation } from '../../hooks'; +import { generateUniqueName } from '../../utils'; +import type { AddOnlineEvalConfig } from './types'; +import { BUILTIN_EVALUATORS, DEFAULT_SAMPLING_RATE, ONLINE_EVAL_STEP_LABELS } from './types'; +import { useAddOnlineEvalWizard } from './useAddOnlineEvalWizard'; +import React, { useMemo } from 'react'; + +interface AddOnlineEvalScreenProps { + onComplete: (config: AddOnlineEvalConfig) => void; + onExit: () => void; + existingConfigNames: string[]; + availableAgents: string[]; + availableEvaluators: string[]; +} + +export function AddOnlineEvalScreen({ + onComplete, + onExit, + existingConfigNames, + availableAgents, + availableEvaluators, +}: AddOnlineEvalScreenProps) { + const wizard = useAddOnlineEvalWizard(); + + const agentItems: SelectableItem[] = useMemo( + () => availableAgents.map(name => ({ id: name, title: name, description: 'Agent' })), + [availableAgents] + ); + + const evaluatorItems: SelectableItem[] = useMemo(() => { + const custom = availableEvaluators.map(name => ({ id: name, title: name, description: 'Custom evaluator' })); + const builtin = BUILTIN_EVALUATORS.map(b => ({ id: b.id, title: b.title, description: b.description })); + return [...custom, ...builtin]; + }, [availableEvaluators]); + + const isNameStep = wizard.step === 'name'; + const isAgentsStep = wizard.step === 'agents'; + const isEvaluatorsStep = wizard.step === 'evaluators'; + const isSamplingRateStep = wizard.step === 'samplingRate'; + const isConfirmStep = wizard.step === 'confirm'; + + const agentsNav = useMultiSelectNavigation({ + items: agentItems, + getId: item => item.id, + onConfirm: ids => wizard.setAgents(ids), + onExit: () => wizard.goBack(), + isActive: isAgentsStep, + requireSelection: true, + }); + + const evaluatorsNav = useMultiSelectNavigation({ + items: evaluatorItems, + getId: item => item.id, + onConfirm: ids => wizard.setEvaluators(ids), + onExit: () => wizard.goBack(), + isActive: isEvaluatorsStep, + requireSelection: true, + }); + + useListNavigation({ + items: [{ id: 'confirm', title: 'Confirm' }], + onSelect: () => onComplete(wizard.config), + onExit: () => wizard.goBack(), + isActive: isConfirmStep, + }); + + const helpText = + isAgentsStep || isEvaluatorsStep + ? 'Space toggle · Enter confirm · Esc back' + : isConfirmStep + ? HELP_TEXT.CONFIRM_CANCEL + : HELP_TEXT.TEXT_INPUT; + + const headerContent = ( + + ); + + return ( + + + {isNameStep && ( + !existingConfigNames.includes(value) || 'Config name already exists'} + /> + )} + + {isAgentsStep && ( + + )} + + {isEvaluatorsStep && ( + + )} + + {isSamplingRateStep && ( + { + const rate = parseFloat(value); + if (isNaN(rate) || rate < 0.01 || rate > 100) return; + wizard.setSamplingRate(rate); + }} + onCancel={() => wizard.goBack()} + customValidation={value => { + const rate = parseFloat(value); + if (isNaN(rate)) return 'Must be a number'; + if (rate < 0.01 || rate > 100) return 'Must be between 0.01 and 100'; + return true; + }} + /> + )} + + {isConfirmStep && ( + + )} + + + ); +} diff --git a/src/cli/tui/screens/online-eval/index.ts b/src/cli/tui/screens/online-eval/index.ts new file mode 100644 index 00000000..fcd0d5f4 --- /dev/null +++ b/src/cli/tui/screens/online-eval/index.ts @@ -0,0 +1,2 @@ +export { AddOnlineEvalFlow } from './AddOnlineEvalFlow'; +export { AddOnlineEvalScreen } from './AddOnlineEvalScreen'; diff --git a/src/cli/tui/screens/online-eval/types.ts b/src/cli/tui/screens/online-eval/types.ts new file mode 100644 index 00000000..0c2d70b7 --- /dev/null +++ b/src/cli/tui/screens/online-eval/types.ts @@ -0,0 +1,41 @@ +// ───────────────────────────────────────────────────────────────────────────── +// Online Eval Config Flow Types +// ───────────────────────────────────────────────────────────────────────────── + +export type AddOnlineEvalStep = 'name' | 'agents' | 'evaluators' | 'samplingRate' | 'confirm'; + +export interface AddOnlineEvalConfig { + name: string; + agents: string[]; + evaluators: string[]; + samplingRate: number; + enableOnCreate: boolean; +} + +export const ONLINE_EVAL_STEP_LABELS: Record = { + name: 'Name', + agents: 'Agents', + evaluators: 'Evaluators', + samplingRate: 'Rate', + confirm: 'Confirm', +}; + +// ───────────────────────────────────────────────────────────────────────────── +// Built-in Evaluators +// ───────────────────────────────────────────────────────────────────────────── + +export const BUILTIN_EVALUATORS = [ + { id: 'Builtin.Helpfulness', title: 'Builtin.Helpfulness', description: 'Measures how helpful agent responses are' }, + { + id: 'Builtin.GoalSuccessRate', + title: 'Builtin.GoalSuccessRate', + description: 'Measures whether the agent achieved the user goal', + }, + { + id: 'Builtin.Faithfulness', + title: 'Builtin.Faithfulness', + description: 'Measures factual consistency with source material', + }, +] as const; + +export const DEFAULT_SAMPLING_RATE = 10; diff --git a/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts new file mode 100644 index 00000000..a4743cb3 --- /dev/null +++ b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts @@ -0,0 +1,86 @@ +import type { AddOnlineEvalConfig, AddOnlineEvalStep } from './types'; +import { DEFAULT_SAMPLING_RATE } from './types'; +import { useCallback, useState } from 'react'; + +const ALL_STEPS: AddOnlineEvalStep[] = ['name', 'agents', 'evaluators', 'samplingRate', 'confirm']; + +function getDefaultConfig(): AddOnlineEvalConfig { + return { + name: '', + agents: [], + evaluators: [], + samplingRate: DEFAULT_SAMPLING_RATE, + enableOnCreate: true, + }; +} + +export function useAddOnlineEvalWizard() { + const [config, setConfig] = useState(getDefaultConfig); + const [step, setStep] = useState('name'); + + const currentIndex = ALL_STEPS.indexOf(step); + + const goBack = useCallback(() => { + const prevStep = ALL_STEPS[currentIndex - 1]; + if (prevStep) setStep(prevStep); + }, [currentIndex]); + + const nextStep = useCallback((currentStep: AddOnlineEvalStep): AddOnlineEvalStep | undefined => { + const idx = ALL_STEPS.indexOf(currentStep); + return ALL_STEPS[idx + 1]; + }, []); + + const setName = useCallback( + (name: string) => { + setConfig(c => ({ ...c, name })); + const next = nextStep('name'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setAgents = useCallback( + (agents: string[]) => { + setConfig(c => ({ ...c, agents })); + const next = nextStep('agents'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setEvaluators = useCallback( + (evaluators: string[]) => { + setConfig(c => ({ ...c, evaluators })); + const next = nextStep('evaluators'); + if (next) setStep(next); + }, + [nextStep] + ); + + const setSamplingRate = useCallback( + (samplingRate: number) => { + setConfig(c => ({ ...c, samplingRate })); + const next = nextStep('samplingRate'); + if (next) setStep(next); + }, + [nextStep] + ); + + const reset = useCallback(() => { + setConfig(getDefaultConfig()); + setStep('name'); + }, []); + + return { + config, + step, + steps: ALL_STEPS, + currentIndex, + goBack, + setName, + setAgents, + setEvaluators, + setSamplingRate, + reset, + }; +} diff --git a/src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx b/src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx new file mode 100644 index 00000000..a0d5f0fa --- /dev/null +++ b/src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx @@ -0,0 +1,26 @@ +import type { RemovableEvaluator } from '../../../primitives/EvaluatorPrimitive'; +import { SelectScreen } from '../../components'; +import React from 'react'; + +interface RemoveEvaluatorScreenProps { + evaluators: RemovableEvaluator[]; + onSelect: (evaluatorName: string) => void; + onExit: () => void; +} + +export function RemoveEvaluatorScreen({ evaluators, onSelect, onExit }: RemoveEvaluatorScreenProps) { + const items = evaluators.map(evaluator => ({ + id: evaluator.name, + title: evaluator.name, + description: 'Custom Evaluator', + })); + + return ( + onSelect(item.id)} + onExit={onExit} + /> + ); +} diff --git a/src/cli/tui/screens/remove/RemoveFlow.tsx b/src/cli/tui/screens/remove/RemoveFlow.tsx index 066874bb..635f18cb 100644 --- a/src/cli/tui/screens/remove/RemoveFlow.tsx +++ b/src/cli/tui/screens/remove/RemoveFlow.tsx @@ -2,24 +2,30 @@ import type { RemovableGatewayTarget, RemovalPreview } from '../../../operations import { ErrorPrompt, Panel, Screen } from '../../components'; import { useRemovableAgents, + useRemovableEvaluators, useRemovableGatewayTargets, useRemovableGateways, useRemovableIdentities, useRemovableMemories, + useRemovableOnlineEvalConfigs, useRemovalPreview, useRemoveAgent, + useRemoveEvaluator, useRemoveGateway, useRemoveGatewayTarget, useRemoveIdentity, useRemoveMemory, + useRemoveOnlineEvalConfig, } from '../../hooks/useRemove'; import { RemoveAgentScreen } from './RemoveAgentScreen'; import { RemoveAllScreen } from './RemoveAllScreen'; import { RemoveConfirmScreen } from './RemoveConfirmScreen'; +import { RemoveEvaluatorScreen } from './RemoveEvaluatorScreen'; import { RemoveGatewayScreen } from './RemoveGatewayScreen'; import { RemoveGatewayTargetScreen } from './RemoveGatewayTargetScreen'; import { RemoveIdentityScreen } from './RemoveIdentityScreen'; import { RemoveMemoryScreen } from './RemoveMemoryScreen'; +import { RemoveOnlineEvalScreen } from './RemoveOnlineEvalScreen'; import type { RemoveResourceType } from './RemoveScreen'; import { RemoveScreen } from './RemoveScreen'; import { RemoveSuccessScreen } from './RemoveSuccessScreen'; @@ -34,17 +40,23 @@ type FlowState = | { name: 'select-gateway-target' } | { name: 'select-memory' } | { name: 'select-identity' } + | { name: 'select-evaluator' } + | { name: 'select-online-eval' } | { name: 'confirm-agent'; agentName: string; preview: RemovalPreview } | { name: 'confirm-gateway'; gatewayName: string; preview: RemovalPreview } | { name: 'confirm-gateway-target'; tool: RemovableGatewayTarget; preview: RemovalPreview } | { name: 'confirm-memory'; memoryName: string; preview: RemovalPreview } | { name: 'confirm-identity'; identityName: string; preview: RemovalPreview } + | { name: 'confirm-evaluator'; evaluatorName: string; preview: RemovalPreview } + | { name: 'confirm-online-eval'; configName: string; preview: RemovalPreview } | { name: 'loading'; message: string } | { name: 'agent-success'; agentName: string; logFilePath?: string } | { name: 'gateway-success'; gatewayName: string; logFilePath?: string } | { name: 'tool-success'; toolName: string; logFilePath?: string } | { name: 'memory-success'; memoryName: string; logFilePath?: string } | { name: 'identity-success'; identityName: string; logFilePath?: string } + | { name: 'evaluator-success'; evaluatorName: string; logFilePath?: string } + | { name: 'online-eval-success'; configName: string; logFilePath?: string } | { name: 'remove-all' } | { name: 'error'; message: string }; @@ -57,7 +69,7 @@ interface RemoveFlowProps { /** Force mode - skip confirmation */ force?: boolean; /** Initial resource type to start at (for CLI subcommands) */ - initialResourceType?: 'agent' | 'gateway' | 'gateway-target' | 'memory' | 'identity'; + initialResourceType?: 'agent' | 'gateway' | 'gateway-target' | 'memory' | 'identity' | 'evaluator' | 'online-eval'; /** Initial resource name to auto-select (for CLI --name flag) */ initialResourceName?: string; } @@ -83,6 +95,10 @@ export function RemoveFlow({ return { name: 'select-memory' }; case 'identity': return { name: 'select-identity' }; + case 'evaluator': + return { name: 'select-evaluator' }; + case 'online-eval': + return { name: 'select-online-eval' }; default: return { name: 'select' }; } @@ -95,9 +111,22 @@ export function RemoveFlow({ const { tools: mcpTools, isLoading: isLoadingTools, refresh: refreshTools } = useRemovableGatewayTargets(); const { memories, isLoading: isLoadingMemories, refresh: refreshMemories } = useRemovableMemories(); const { identities, isLoading: isLoadingIdentities, refresh: refreshIdentities } = useRemovableIdentities(); + const { evaluators, isLoading: isLoadingEvaluators, refresh: refreshEvaluators } = useRemovableEvaluators(); + const { + onlineEvalConfigs, + isLoading: isLoadingOnlineEvals, + refresh: refreshOnlineEvals, + } = useRemovableOnlineEvalConfigs(); // Check if any data is still loading - const isLoading = isLoadingAgents || isLoadingGateways || isLoadingTools || isLoadingMemories || isLoadingIdentities; + const isLoading = + isLoadingAgents || + isLoadingGateways || + isLoadingTools || + isLoadingMemories || + isLoadingIdentities || + isLoadingEvaluators || + isLoadingOnlineEvals; // Preview hook const { @@ -106,6 +135,8 @@ export function RemoveFlow({ loadGatewayTargetPreview, loadMemoryPreview, loadIdentityPreview, + loadEvaluatorPreview, + loadOnlineEvalPreview, reset: resetPreview, } = useRemovalPreview(); @@ -115,6 +146,8 @@ export function RemoveFlow({ const { remove: removeGatewayTargetOp, reset: resetRemoveGatewayTarget } = useRemoveGatewayTarget(); const { remove: removeMemoryOp, reset: resetRemoveMemory } = useRemoveMemory(); const { remove: removeIdentityOp, reset: resetRemoveIdentity } = useRemoveIdentity(); + const { remove: removeEvaluatorOp, reset: resetRemoveEvaluator } = useRemoveEvaluator(); + const { remove: removeOnlineEvalOp, reset: resetRemoveOnlineEval } = useRemoveOnlineEvalConfig(); // Track pending result state const pendingResultRef = useRef(null); @@ -135,7 +168,15 @@ export function RemoveFlow({ // In non-interactive mode, exit after success useEffect(() => { if (!isInteractive) { - const successStates = ['agent-success', 'gateway-success', 'tool-success', 'memory-success', 'identity-success']; + const successStates = [ + 'agent-success', + 'gateway-success', + 'tool-success', + 'memory-success', + 'identity-success', + 'evaluator-success', + 'online-eval-success', + ]; if (successStates.includes(flow.name)) { onExit(); } @@ -162,6 +203,12 @@ export function RemoveFlow({ case 'identity': setFlow({ name: 'select-identity' }); break; + case 'evaluator': + setFlow({ name: 'select-evaluator' }); + break; + case 'online-eval': + setFlow({ name: 'select-online-eval' }); + break; case 'all': setFlow({ name: 'remove-all' }); break; @@ -281,6 +328,50 @@ export function RemoveFlow({ [loadIdentityPreview, force, removeIdentityOp] ); + const handleSelectEvaluator = useCallback( + async (evaluatorName: string) => { + const result = await loadEvaluatorPreview(evaluatorName); + if (result.ok) { + if (force) { + setFlow({ name: 'loading', message: `Removing evaluator ${evaluatorName}...` }); + const removeResult = await removeEvaluatorOp(evaluatorName, result.preview); + if (removeResult.success) { + setFlow({ name: 'evaluator-success', evaluatorName }); + } else { + setFlow({ name: 'error', message: removeResult.error }); + } + } else { + setFlow({ name: 'confirm-evaluator', evaluatorName, preview: result.preview }); + } + } else { + setFlow({ name: 'error', message: result.error }); + } + }, + [loadEvaluatorPreview, force, removeEvaluatorOp] + ); + + const handleSelectOnlineEval = useCallback( + async (configName: string) => { + const result = await loadOnlineEvalPreview(configName); + if (result.ok) { + if (force) { + setFlow({ name: 'loading', message: `Removing online eval config ${configName}...` }); + const removeResult = await removeOnlineEvalOp(configName, result.preview); + if (removeResult.success) { + setFlow({ name: 'online-eval-success', configName }); + } else { + setFlow({ name: 'error', message: removeResult.error }); + } + } else { + setFlow({ name: 'confirm-online-eval', configName, preview: result.preview }); + } + } else { + setFlow({ name: 'error', message: result.error }); + } + }, + [loadOnlineEvalPreview, force, removeOnlineEvalOp] + ); + // Auto-select resource when initialResourceName is provided and data is loaded useEffect(() => { if (!initialResourceName || isLoading || hasTriggeredInitialSelection.current) { @@ -305,6 +396,12 @@ export function RemoveFlow({ case 'identity': void handleSelectIdentity(initialResourceName); break; + case 'evaluator': + void handleSelectEvaluator(initialResourceName); + break; + case 'online-eval': + void handleSelectOnlineEval(initialResourceName); + break; } }, 0); }, [ @@ -315,6 +412,8 @@ export function RemoveFlow({ handleSelectGateway, handleSelectMemory, handleSelectIdentity, + handleSelectEvaluator, + handleSelectOnlineEval, ]); // Confirm handlers - pass preview for logging @@ -398,6 +497,38 @@ export function RemoveFlow({ [removeIdentityOp] ); + const handleConfirmEvaluator = useCallback( + async (evaluatorName: string, preview: RemovalPreview) => { + pendingResultRef.current = null; + setResultReady(false); + setFlow({ name: 'loading', message: `Removing evaluator ${evaluatorName}...` }); + const result = await removeEvaluatorOp(evaluatorName, preview); + if (result.success) { + pendingResultRef.current = { name: 'evaluator-success', evaluatorName, logFilePath: result.logFilePath }; + } else { + pendingResultRef.current = { name: 'error', message: result.error }; + } + setResultReady(true); + }, + [removeEvaluatorOp] + ); + + const handleConfirmOnlineEval = useCallback( + async (configName: string, preview: RemovalPreview) => { + pendingResultRef.current = null; + setResultReady(false); + setFlow({ name: 'loading', message: `Removing online eval config ${configName}...` }); + const result = await removeOnlineEvalOp(configName, preview); + if (result.success) { + pendingResultRef.current = { name: 'online-eval-success', configName, logFilePath: result.logFilePath }; + } else { + pendingResultRef.current = { name: 'error', message: result.error }; + } + setResultReady(true); + }, + [removeOnlineEvalOp] + ); + const resetAll = useCallback(() => { resetPreview(); resetRemoveAgent(); @@ -405,6 +536,8 @@ export function RemoveFlow({ resetRemoveGatewayTarget(); resetRemoveMemory(); resetRemoveIdentity(); + resetRemoveEvaluator(); + resetRemoveOnlineEval(); }, [ resetPreview, resetRemoveAgent, @@ -412,11 +545,29 @@ export function RemoveFlow({ resetRemoveGatewayTarget, resetRemoveMemory, resetRemoveIdentity, + resetRemoveEvaluator, + resetRemoveOnlineEval, ]); const refreshAll = useCallback(async () => { - await Promise.all([refreshAgents(), refreshGateways(), refreshTools(), refreshMemories(), refreshIdentities()]); - }, [refreshAgents, refreshGateways, refreshTools, refreshMemories, refreshIdentities]); + await Promise.all([ + refreshAgents(), + refreshGateways(), + refreshTools(), + refreshMemories(), + refreshIdentities(), + refreshEvaluators(), + refreshOnlineEvals(), + ]); + }, [ + refreshAgents, + refreshGateways, + refreshTools, + refreshMemories, + refreshIdentities, + refreshEvaluators, + refreshOnlineEvals, + ]); // Select screen - wait for data to load to avoid arrow position issues if (flow.name === 'select') { @@ -432,6 +583,8 @@ export function RemoveFlow({ mcpToolCount={mcpTools.length} memoryCount={memories.length} identityCount={identities.length} + evaluatorCount={evaluators.length} + onlineEvalCount={onlineEvalConfigs.length} /> ); } @@ -514,6 +667,32 @@ export function RemoveFlow({ ); } + if (flow.name === 'select-evaluator') { + if (initialResourceName && isLoading) { + return null; + } + return ( + void handleSelectEvaluator(name)} + onExit={() => setFlow({ name: 'select' })} + /> + ); + } + + if (flow.name === 'select-online-eval') { + if (initialResourceName && isLoading) { + return null; + } + return ( + void handleSelectOnlineEval(name)} + onExit={() => setFlow({ name: 'select' })} + /> + ); + } + // Confirmation screens if (flow.name === 'confirm-agent') { return ( @@ -570,6 +749,28 @@ export function RemoveFlow({ ); } + if (flow.name === 'confirm-evaluator') { + return ( + void handleConfirmEvaluator(flow.evaluatorName, flow.preview)} + onCancel={() => setFlow({ name: 'select-evaluator' })} + /> + ); + } + + if (flow.name === 'confirm-online-eval') { + return ( + void handleConfirmOnlineEval(flow.configName, flow.preview)} + onCancel={() => setFlow({ name: 'select-online-eval' })} + /> + ); + } + // Success screens if (flow.name === 'agent-success') { return ( @@ -651,6 +852,38 @@ export function RemoveFlow({ ); } + if (flow.name === 'evaluator-success') { + return ( + { + resetAll(); + void refreshAll().then(() => setFlow({ name: 'select' })); + }} + onExit={onExit} + /> + ); + } + + if (flow.name === 'online-eval-success') { + return ( + { + resetAll(); + void refreshAll().then(() => setFlow({ name: 'select' })); + }} + onExit={onExit} + /> + ); + } + // Remove all screen if (flow.name === 'remove-all') { return ; diff --git a/src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx b/src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx new file mode 100644 index 00000000..faab02f4 --- /dev/null +++ b/src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx @@ -0,0 +1,26 @@ +import type { RemovableOnlineEvalConfig } from '../../../primitives/OnlineEvalConfigPrimitive'; +import { SelectScreen } from '../../components'; +import React from 'react'; + +interface RemoveOnlineEvalScreenProps { + configs: RemovableOnlineEvalConfig[]; + onSelect: (configName: string) => void; + onExit: () => void; +} + +export function RemoveOnlineEvalScreen({ configs, onSelect, onExit }: RemoveOnlineEvalScreenProps) { + const items = configs.map(config => ({ + id: config.name, + title: config.name, + description: 'Online Eval Config', + })); + + return ( + onSelect(item.id)} + onExit={onExit} + /> + ); +} diff --git a/src/cli/tui/screens/remove/RemoveScreen.tsx b/src/cli/tui/screens/remove/RemoveScreen.tsx index bcb7307c..59441d0e 100644 --- a/src/cli/tui/screens/remove/RemoveScreen.tsx +++ b/src/cli/tui/screens/remove/RemoveScreen.tsx @@ -6,6 +6,8 @@ const REMOVE_RESOURCES = [ { id: 'agent', title: 'Agent', description: 'Remove an agent from the project' }, { id: 'memory', title: 'Memory', description: 'Remove a memory provider' }, { id: 'identity', title: 'Identity', description: 'Remove an identity provider' }, + { id: 'evaluator', title: 'Evaluator', description: 'Remove a custom evaluator' }, + { id: 'online-eval', title: 'Online Eval Config', description: 'Remove an online eval config' }, { id: 'gateway', title: 'Gateway', description: 'Remove a gateway' }, { id: 'gateway-target', title: 'Gateway Target', description: 'Remove a gateway target' }, { id: 'all', title: 'All', description: 'Reset entire agentcore project' }, @@ -26,6 +28,10 @@ interface RemoveScreenProps { memoryCount: number; /** Number of identities available for removal */ identityCount: number; + /** Number of evaluators available for removal */ + evaluatorCount: number; + /** Number of online eval configs available for removal */ + onlineEvalCount: number; } export function RemoveScreen({ @@ -36,6 +42,8 @@ export function RemoveScreen({ mcpToolCount, memoryCount, identityCount, + evaluatorCount, + onlineEvalCount, }: RemoveScreenProps) { const items: SelectableItem[] = useMemo(() => { return REMOVE_RESOURCES.map(r => { @@ -73,6 +81,18 @@ export function RemoveScreen({ description = 'No identities to remove'; } break; + case 'evaluator': + if (evaluatorCount === 0) { + disabled = true; + description = 'No evaluators to remove'; + } + break; + case 'online-eval': + if (onlineEvalCount === 0) { + disabled = true; + description = 'No online eval configs to remove'; + } + break; case 'all': // 'all' is always available break; @@ -80,7 +100,7 @@ export function RemoveScreen({ return { ...r, disabled, description }; }); - }, [agentCount, gatewayCount, mcpToolCount, memoryCount, identityCount]); + }, [agentCount, gatewayCount, mcpToolCount, memoryCount, identityCount, evaluatorCount, onlineEvalCount]); const isDisabled = (item: SelectableItem) => item.disabled ?? false; diff --git a/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx b/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx index e1e32e05..4d52e68c 100644 --- a/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx +++ b/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx @@ -17,6 +17,8 @@ describe('RemoveScreen', () => { mcpToolCount={1} memoryCount={1} identityCount={1} + evaluatorCount={1} + onlineEvalCount={1} /> ); @@ -39,6 +41,8 @@ describe('RemoveScreen', () => { mcpToolCount={0} memoryCount={0} identityCount={0} + evaluatorCount={0} + onlineEvalCount={0} /> ); diff --git a/src/cli/tui/screens/remove/index.ts b/src/cli/tui/screens/remove/index.ts index 71d78c30..4a470fff 100644 --- a/src/cli/tui/screens/remove/index.ts +++ b/src/cli/tui/screens/remove/index.ts @@ -1,10 +1,12 @@ export { RemoveAgentScreen } from './RemoveAgentScreen'; export { RemoveAllScreen } from './RemoveAllScreen'; export { RemoveConfirmScreen } from './RemoveConfirmScreen'; +export { RemoveEvaluatorScreen } from './RemoveEvaluatorScreen'; export { RemoveFlow } from './RemoveFlow'; export { RemoveGatewayScreen } from './RemoveGatewayScreen'; export { RemoveIdentityScreen } from './RemoveIdentityScreen'; export { RemoveGatewayTargetScreen } from './RemoveGatewayTargetScreen'; export { RemoveMemoryScreen } from './RemoveMemoryScreen'; +export { RemoveOnlineEvalScreen } from './RemoveOnlineEvalScreen'; export { RemoveScreen, type RemoveResourceType } from './RemoveScreen'; export { RemoveSuccessScreen } from './RemoveSuccessScreen'; diff --git a/src/cli/tui/screens/remove/useRemoveFlow.ts b/src/cli/tui/screens/remove/useRemoveFlow.ts index 2c8fea13..114fab96 100644 --- a/src/cli/tui/screens/remove/useRemoveFlow.ts +++ b/src/cli/tui/screens/remove/useRemoveFlow.ts @@ -34,6 +34,8 @@ function createDefaultProjectSpec(projectName: string): AgentCoreProjectSpec { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; } diff --git a/src/schema/schemas/agentcore-project.ts b/src/schema/schemas/agentcore-project.ts index fda34160..d367ffb8 100644 --- a/src/schema/schemas/agentcore-project.ts +++ b/src/schema/schemas/agentcore-project.ts @@ -8,13 +8,20 @@ */ import { isReservedProjectName } from '../constants'; import { AgentEnvSpecSchema } from './agent-env'; +import { EvaluationLevelSchema, EvaluatorConfigSchema, EvaluatorNameSchema } from './primitives/evaluator'; import { DEFAULT_STRATEGY_NAMESPACES, MemoryStrategySchema, MemoryStrategyTypeSchema } from './primitives/memory'; +import { OnlineEvalConfigSchema } from './primitives/online-eval-config'; import { uniqueBy } from './zod-util'; import { z } from 'zod'; // Re-export for convenience export { DEFAULT_STRATEGY_NAMESPACES, MemoryStrategySchema, MemoryStrategyTypeSchema }; +export { EvaluationLevelSchema }; export type { MemoryStrategy, MemoryStrategyType } from './primitives/memory'; +export type { OnlineEvalConfig } from './primitives/online-eval-config'; +export { OnlineEvalConfigSchema, OnlineEvalConfigNameSchema } from './primitives/online-eval-config'; +export type { EvaluationLevel, EvaluatorConfig, LlmAsAJudgeConfig, RatingScale } from './primitives/evaluator'; +export { EvaluatorNameSchema } from './primitives/evaluator'; // ============================================================================ // Project Name Schema @@ -112,42 +119,107 @@ export const CredentialSchema = z.discriminatedUnion('type', [ApiKeyCredentialSc export type Credential = z.infer; // ============================================================================ -// Project Schema (Top Level) +// Evaluator Schema // ============================================================================ -export const AgentCoreProjectSpecSchema = z.object({ - name: ProjectNameSchema, - version: z.number().int(), +export const EvaluatorTypeSchema = z.literal('CustomEvaluator'); +export type EvaluatorType = z.infer; - agents: z - .array(AgentEnvSpecSchema) - .default([]) - .superRefine( - uniqueBy( - agent => agent.name, - name => `Duplicate agent name: ${name}` - ) - ), +export const EvaluatorSchema = z.object({ + type: EvaluatorTypeSchema, + name: EvaluatorNameSchema, + level: EvaluationLevelSchema, + description: z.string().optional(), + config: EvaluatorConfigSchema, +}); - memories: z - .array(MemorySchema) - .default([]) - .superRefine( - uniqueBy( - memory => memory.name, - name => `Duplicate memory name: ${name}` - ) - ), +export type Evaluator = z.infer; - credentials: z - .array(CredentialSchema) - .default([]) - .superRefine( - uniqueBy( - credential => credential.name, - name => `Duplicate credential name: ${name}` - ) - ), -}); +// ============================================================================ +// Project Schema (Top Level) +// ============================================================================ + +const BUILTIN_EVALUATOR_PREFIX = 'Builtin.'; + +export const AgentCoreProjectSpecSchema = z + .object({ + name: ProjectNameSchema, + version: z.number().int(), + + agents: z + .array(AgentEnvSpecSchema) + .default([]) + .superRefine( + uniqueBy( + agent => agent.name, + name => `Duplicate agent name: ${name}` + ) + ), + + memories: z + .array(MemorySchema) + .default([]) + .superRefine( + uniqueBy( + memory => memory.name, + name => `Duplicate memory name: ${name}` + ) + ), + + credentials: z + .array(CredentialSchema) + .default([]) + .superRefine( + uniqueBy( + credential => credential.name, + name => `Duplicate credential name: ${name}` + ) + ), + + evaluators: z + .array(EvaluatorSchema) + .default([]) + .superRefine( + uniqueBy( + evaluator => evaluator.name, + name => `Duplicate evaluator name: ${name}` + ) + ), + + onlineEvalConfigs: z + .array(OnlineEvalConfigSchema) + .default([]) + .superRefine( + uniqueBy( + config => config.name, + name => `Duplicate online eval config name: ${name}` + ) + ), + }) + .superRefine((spec, ctx) => { + // Cross-field validation: onlineEvalConfigs reference valid agents and evaluators + const agentNames = new Set(spec.agents.map(a => a.name)); + const evaluatorNames = new Set(spec.evaluators.map(e => e.name)); + + for (const config of spec.onlineEvalConfigs) { + for (const agentName of config.agents) { + if (!agentNames.has(agentName)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Online eval config "${config.name}" references unknown agent "${agentName}"`, + }); + } + } + + for (const evalName of config.evaluators) { + if (!evalName.startsWith(BUILTIN_EVALUATOR_PREFIX) && !evaluatorNames.has(evalName)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Online eval config "${config.name}" references unknown evaluator "${evalName}"`, + }); + } + } + } + }); export type AgentCoreProjectSpec = z.infer; diff --git a/src/schema/schemas/primitives/evaluator.ts b/src/schema/schemas/primitives/evaluator.ts new file mode 100644 index 00000000..632cfd85 --- /dev/null +++ b/src/schema/schemas/primitives/evaluator.ts @@ -0,0 +1,74 @@ +import { z } from 'zod'; + +// ============================================================================ +// Evaluator Types +// ============================================================================ + +export const EvaluationLevelSchema = z.enum(['SESSION', 'TRACE', 'TOOL_CALL']); +export type EvaluationLevel = z.infer; + +export const EvaluatorNameSchema = z + .string() + .min(1, 'Name is required') + .max(48) + .regex( + /^[a-zA-Z][a-zA-Z0-9_]{0,47}$/, + 'Must begin with a letter and contain only alphanumeric characters and underscores (max 48 chars)' + ); + +// ============================================================================ +// Rating Scale +// ============================================================================ + +export const NumericalRatingSchema = z.object({ + value: z.number().int(), + label: z.string().min(1), + definition: z.string().min(1), +}); + +export type NumericalRating = z.infer; + +export const CategoricalRatingSchema = z.object({ + label: z.string().min(1), + definition: z.string().min(1), +}); + +export type CategoricalRating = z.infer; + +export const RatingScaleSchema = z + .object({ + numerical: z.array(NumericalRatingSchema).optional(), + categorical: z.array(CategoricalRatingSchema).optional(), + }) + .refine( + scale => { + const hasNumerical = Boolean(scale.numerical); + const hasCategorical = Boolean(scale.categorical); + return hasNumerical !== hasCategorical; + }, + { message: 'Rating scale must have either numerical or categorical, not both' } + ); + +export type RatingScale = z.infer; + +// ============================================================================ +// LLM-as-a-Judge Config +// ============================================================================ + +export const LlmAsAJudgeConfigSchema = z.object({ + model: z.string().min(1, 'Model ID is required'), + instructions: z.string().min(1, 'Evaluation instructions are required'), + ratingScale: RatingScaleSchema, +}); + +export type LlmAsAJudgeConfig = z.infer; + +// ============================================================================ +// Evaluator Config +// ============================================================================ + +export const EvaluatorConfigSchema = z.object({ + llmAsAJudge: LlmAsAJudgeConfigSchema, +}); + +export type EvaluatorConfig = z.infer; diff --git a/src/schema/schemas/primitives/index.ts b/src/schema/schemas/primitives/index.ts index e7f572e8..7b29a435 100644 --- a/src/schema/schemas/primitives/index.ts +++ b/src/schema/schemas/primitives/index.ts @@ -5,3 +5,24 @@ export { MemoryStrategySchema, MemoryStrategyTypeSchema, } from './memory'; + +export type { + EvaluationLevel, + EvaluatorConfig, + LlmAsAJudgeConfig, + RatingScale, + NumericalRating, + CategoricalRating, +} from './evaluator'; +export { + EvaluationLevelSchema, + EvaluatorConfigSchema, + EvaluatorNameSchema, + LlmAsAJudgeConfigSchema, + RatingScaleSchema, + NumericalRatingSchema, + CategoricalRatingSchema, +} from './evaluator'; + +export type { OnlineEvalConfig } from './online-eval-config'; +export { OnlineEvalConfigSchema, OnlineEvalConfigNameSchema } from './online-eval-config'; diff --git a/src/schema/schemas/primitives/online-eval-config.ts b/src/schema/schemas/primitives/online-eval-config.ts new file mode 100644 index 00000000..ea6ef95c --- /dev/null +++ b/src/schema/schemas/primitives/online-eval-config.ts @@ -0,0 +1,29 @@ +import { z } from 'zod'; + +// ============================================================================ +// Online Eval Config Types +// ============================================================================ + +export const OnlineEvalConfigNameSchema = z + .string() + .min(1, 'Name is required') + .max(48) + .regex( + /^[a-zA-Z][a-zA-Z0-9_]{0,47}$/, + 'Must begin with a letter and contain only alphanumeric characters and underscores (max 48 chars)' + ); + +export const OnlineEvalConfigSchema = z.object({ + type: z.literal('OnlineEvalConfig'), + name: OnlineEvalConfigNameSchema, + /** Agent names this online eval config monitors */ + agents: z.array(z.string().min(1)).min(1, 'At least one agent is required'), + /** Evaluator names (custom) or Builtin.* IDs */ + evaluators: z.array(z.string().min(1)).min(1, 'At least one evaluator is required'), + /** Sampling rate as a percentage (0.01 to 100) */ + samplingRate: z.number().min(0.01).max(100), + /** Whether to start the pipeline immediately on deploy */ + enableOnCreate: z.boolean().default(true), +}); + +export type OnlineEvalConfig = z.infer; From 674c6b1088b88b18a97e3a00e71342c22ae0fb32 Mon Sep 17 00:00:00 2001 From: notgitika Date: Wed, 11 Mar 2026 20:25:40 -0400 Subject: [PATCH 2/9] feat: add functionality to run evaluation and online evals --- src/cli/aws/agentcore-control.ts | 49 ++- src/cli/aws/agentcore.ts | 104 +++++ src/cli/cli.ts | 8 + .../__tests__/outputs-extended.test.ts | 168 +++++++- src/cli/cloudformation/outputs.ts | 96 ++++- src/cli/commands/deploy/actions.ts | 12 + src/cli/commands/eval/command.tsx | 138 +++++++ src/cli/commands/eval/index.ts | 1 + src/cli/commands/index.ts | 4 + src/cli/commands/logs/command.tsx | 36 +- src/cli/commands/pause/command.tsx | 41 ++ src/cli/commands/pause/index.ts | 1 + src/cli/commands/resume/command.tsx | 41 ++ src/cli/commands/resume/index.ts | 1 + src/cli/commands/run/command.tsx | 94 +++++ src/cli/commands/run/index.ts | 1 + .../commands/status/__tests__/action.test.ts | 110 +++++ src/cli/commands/status/action.ts | 23 +- src/cli/operations/eval/get-eval-run.ts | 17 + src/cli/operations/eval/index.ts | 18 + src/cli/operations/eval/list-eval-runs.ts | 26 ++ src/cli/operations/eval/logs-eval.ts | 140 +++++++ src/cli/operations/eval/pause-resume.ts | 72 ++++ src/cli/operations/eval/run-eval.ts | 387 ++++++++++++++++++ src/cli/operations/eval/storage.ts | 54 +++ src/cli/operations/eval/types.ts | 63 +++ .../primitives/OnlineEvalConfigPrimitive.ts | 4 +- src/cli/tui/App.tsx | 8 + src/cli/tui/copy.ts | 4 + src/cli/tui/hooks/useCreateOnlineEval.ts | 2 - .../tui/screens/cli-only/CliOnlyScreen.tsx | 28 ++ src/cli/tui/screens/cli-only/index.ts | 1 + src/cli/tui/screens/deploy/useDeployFlow.ts | 12 + src/cli/tui/screens/eval/EvalScreen.tsx | 88 ++++ src/cli/tui/screens/eval/index.ts | 1 + .../screens/evaluator/AddEvaluatorScreen.tsx | 24 +- src/cli/tui/screens/evaluator/types.ts | 4 +- src/cli/tui/screens/online-eval/types.ts | 1 - .../online-eval/useAddOnlineEvalWizard.ts | 1 - src/cli/tui/utils/commands.ts | 2 +- src/schema/schemas/deployed-state.ts | 25 ++ .../schemas/primitives/online-eval-config.ts | 4 +- 42 files changed, 1878 insertions(+), 36 deletions(-) create mode 100644 src/cli/commands/eval/command.tsx create mode 100644 src/cli/commands/eval/index.ts create mode 100644 src/cli/commands/pause/command.tsx create mode 100644 src/cli/commands/pause/index.ts create mode 100644 src/cli/commands/resume/command.tsx create mode 100644 src/cli/commands/resume/index.ts create mode 100644 src/cli/commands/run/command.tsx create mode 100644 src/cli/commands/run/index.ts create mode 100644 src/cli/operations/eval/get-eval-run.ts create mode 100644 src/cli/operations/eval/index.ts create mode 100644 src/cli/operations/eval/list-eval-runs.ts create mode 100644 src/cli/operations/eval/logs-eval.ts create mode 100644 src/cli/operations/eval/pause-resume.ts create mode 100644 src/cli/operations/eval/run-eval.ts create mode 100644 src/cli/operations/eval/storage.ts create mode 100644 src/cli/operations/eval/types.ts create mode 100644 src/cli/tui/screens/cli-only/CliOnlyScreen.tsx create mode 100644 src/cli/tui/screens/cli-only/index.ts create mode 100644 src/cli/tui/screens/eval/EvalScreen.tsx create mode 100644 src/cli/tui/screens/eval/index.ts diff --git a/src/cli/aws/agentcore-control.ts b/src/cli/aws/agentcore-control.ts index 84ba4766..4ef9ff64 100644 --- a/src/cli/aws/agentcore-control.ts +++ b/src/cli/aws/agentcore-control.ts @@ -1,5 +1,9 @@ import { getCredentialProvider } from './account'; -import { BedrockAgentCoreControlClient, GetAgentRuntimeCommand } from '@aws-sdk/client-bedrock-agentcore-control'; +import { + BedrockAgentCoreControlClient, + GetAgentRuntimeCommand, + UpdateOnlineEvaluationConfigCommand, +} from '@aws-sdk/client-bedrock-agentcore-control'; export interface GetAgentRuntimeStatusOptions { region: string; @@ -35,3 +39,46 @@ export async function getAgentRuntimeStatus(options: GetAgentRuntimeStatusOption status: response.status, }; } + +// ============================================================================ +// Online Eval Config +// ============================================================================ + +export type OnlineEvalExecutionStatus = 'ENABLED' | 'DISABLED'; + +export interface UpdateOnlineEvalStatusOptions { + region: string; + onlineEvaluationConfigId: string; + executionStatus: OnlineEvalExecutionStatus; +} + +export interface UpdateOnlineEvalStatusResult { + configId: string; + executionStatus: string; + status: string; +} + +/** + * Update the execution status of an online evaluation config (pause/resume). + */ +export async function updateOnlineEvalExecutionStatus( + options: UpdateOnlineEvalStatusOptions +): Promise { + const client = new BedrockAgentCoreControlClient({ + region: options.region, + credentials: getCredentialProvider(), + }); + + const command = new UpdateOnlineEvaluationConfigCommand({ + onlineEvaluationConfigId: options.onlineEvaluationConfigId, + executionStatus: options.executionStatus, + }); + + const response = await client.send(command); + + return { + configId: response.onlineEvaluationConfigId ?? options.onlineEvaluationConfigId, + executionStatus: response.executionStatus ?? options.executionStatus, + status: response.status ?? 'UNKNOWN', + }; +} diff --git a/src/cli/aws/agentcore.ts b/src/cli/aws/agentcore.ts index 8baf9f72..e84bd6a1 100644 --- a/src/cli/aws/agentcore.ts +++ b/src/cli/aws/agentcore.ts @@ -1,9 +1,11 @@ import { getCredentialProvider } from './account'; import { BedrockAgentCoreClient, + EvaluateCommand, InvokeAgentRuntimeCommand, StopRuntimeSessionCommand, } from '@aws-sdk/client-bedrock-agentcore'; +import type { DocumentType } from '@smithy/types'; /** Logger interface for SSE events */ export interface SSELogger { @@ -234,6 +236,108 @@ export async function invokeAgentRuntime(options: InvokeAgentRuntimeOptions): Pr }; } +// ============================================================================ +// Evaluate +// ============================================================================ + +export interface EvaluateOptions { + region: string; + evaluatorId: string; + sessionSpans: DocumentType[]; + targetSpanIds?: string[]; + targetTraceIds?: string[]; +} + +export interface EvaluationResultContext { + sessionId: string | undefined; + traceId: string | undefined; + spanId: string | undefined; +} + +export interface EvaluationResultTokenUsage { + inputTokens: number; + outputTokens: number; + totalTokens: number; +} + +export interface EvaluationResult { + evaluatorArn: string | undefined; + evaluatorId: string | undefined; + evaluatorName: string | undefined; + explanation: string | undefined; + value: number | undefined; + label: string | undefined; + errorMessage: string | undefined; + errorCode: string | undefined; + context: EvaluationResultContext | undefined; + tokenUsage: EvaluationResultTokenUsage | undefined; +} + +export interface EvaluateResult { + evaluationResults: EvaluationResult[]; +} + +/** + * Run on-demand evaluation of agent traces using a specified evaluator. + */ +export async function evaluate(options: EvaluateOptions): Promise { + const client = new BedrockAgentCoreClient({ + region: options.region, + credentials: getCredentialProvider(), + }); + + const evaluationTarget = options.targetSpanIds + ? { spanIds: options.targetSpanIds } + : options.targetTraceIds + ? { traceIds: options.targetTraceIds } + : undefined; + + const command = new EvaluateCommand({ + evaluatorId: options.evaluatorId, + evaluationInput: { + sessionSpans: options.sessionSpans, + }, + ...(evaluationTarget ? { evaluationTarget } : {}), + }); + + const response = await client.send(command); + + if (!response.evaluationResults) { + throw new Error('No evaluation results returned'); + } + + return { + evaluationResults: response.evaluationResults.map(r => { + const spanContext = r.context && 'spanContext' in r.context ? r.context.spanContext : undefined; + + return { + evaluatorArn: r.evaluatorArn, + evaluatorId: r.evaluatorId, + evaluatorName: r.evaluatorName, + explanation: r.explanation, + value: r.value, + label: r.label, + errorMessage: r.errorMessage, + errorCode: r.errorCode, + context: spanContext + ? { + sessionId: spanContext.sessionId, + traceId: spanContext.traceId, + spanId: spanContext.spanId, + } + : undefined, + tokenUsage: r.tokenUsage + ? { + inputTokens: r.tokenUsage.inputTokens ?? 0, + outputTokens: r.tokenUsage.outputTokens ?? 0, + totalTokens: r.tokenUsage.totalTokens ?? 0, + } + : undefined, + }; + }), + }; +} + /** * Stop a runtime session. */ diff --git a/src/cli/cli.ts b/src/cli/cli.ts index 4d992ad7..621e0ef4 100644 --- a/src/cli/cli.ts +++ b/src/cli/cli.ts @@ -2,11 +2,15 @@ import { registerAdd } from './commands/add'; import { registerCreate } from './commands/create'; import { registerDeploy } from './commands/deploy'; import { registerDev } from './commands/dev'; +import { registerEval } from './commands/eval'; import { registerHelp } from './commands/help'; import { registerInvoke } from './commands/invoke'; import { registerLogs } from './commands/logs'; import { registerPackage } from './commands/package'; +import { registerPause } from './commands/pause'; import { registerRemove } from './commands/remove'; +import { registerResume } from './commands/resume'; +import { registerRun } from './commands/run'; import { registerStatus } from './commands/status'; import { registerTraces } from './commands/traces'; import { registerUpdate } from './commands/update'; @@ -130,11 +134,15 @@ export function registerCommands(program: Command) { registerDev(program); registerDeploy(program); registerCreate(program); + registerEval(program); registerHelp(program); registerInvoke(program); registerLogs(program); registerPackage(program); + registerPause(program); const removeCmd = registerRemove(program); + registerResume(program); + registerRun(program); registerStatus(program); registerTraces(program); registerUpdate(program); diff --git a/src/cli/cloudformation/__tests__/outputs-extended.test.ts b/src/cli/cloudformation/__tests__/outputs-extended.test.ts index 85aab1c8..16112c58 100644 --- a/src/cli/cloudformation/__tests__/outputs-extended.test.ts +++ b/src/cli/cloudformation/__tests__/outputs-extended.test.ts @@ -1,4 +1,4 @@ -import { buildDeployedState, parseAgentOutputs } from '../outputs.js'; +import { buildDeployedState, parseAgentOutputs, parseEvaluatorOutputs, parseOnlineEvalOutputs } from '../outputs.js'; import type { StackOutputs } from '../outputs.js'; import { describe, expect, it } from 'vitest'; @@ -233,4 +233,170 @@ describe('buildDeployedState', () => { const state = buildDeployedState({ targetName: 'default', stackName: 'Stack', agents: {}, gateways: {} }); expect(state.targets.default!.resources?.agents).toBeUndefined(); }); + + it('includes evaluators in deployed state when provided', () => { + const evaluators = { + MyEval: { + evaluatorId: 'proj_MyEval-abc', + evaluatorArn: 'arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc', + }, + }; + + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + evaluators, + }); + expect(state.targets.default!.resources?.evaluators).toEqual(evaluators); + }); + + it('omits evaluators from deployed state when empty', () => { + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + evaluators: {}, + }); + expect(state.targets.default!.resources?.evaluators).toBeUndefined(); + }); + + it('includes onlineEvalConfigs in deployed state when provided', () => { + const onlineEvalConfigs = { + TestConfig: { + onlineEvaluationConfigId: 'proj_TestConfig-xyz', + onlineEvaluationConfigArn: 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz', + }, + }; + + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + onlineEvalConfigs, + }); + expect(state.targets.default!.resources?.onlineEvalConfigs).toEqual(onlineEvalConfigs); + }); + + it('omits onlineEvalConfigs from deployed state when empty', () => { + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + onlineEvalConfigs: {}, + }); + expect(state.targets.default!.resources?.onlineEvalConfigs).toBeUndefined(); + }); +}); + +describe('parseEvaluatorOutputs', () => { + it('parses evaluator Id and Arn from stack outputs', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalIdOutputABC123: 'proj_MyEval-abc', + ApplicationEvaluatorMyEvalArnOutputDEF456: 'arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc', + }; + + const result = parseEvaluatorOutputs(outputs, ['MyEval']); + expect(result.MyEval).toBeDefined(); + expect(result.MyEval!.evaluatorId).toBe('proj_MyEval-abc'); + expect(result.MyEval!.evaluatorArn).toBe('arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc'); + }); + + it('parses multiple evaluators', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorEvalAIdOutputA: 'id-a', + ApplicationEvaluatorEvalAArnOutputB: 'arn:a', + ApplicationEvaluatorEvalBIdOutputC: 'id-b', + ApplicationEvaluatorEvalBArnOutputD: 'arn:b', + }; + + const result = parseEvaluatorOutputs(outputs, ['EvalA', 'EvalB']); + expect(Object.keys(result)).toHaveLength(2); + expect(result.EvalA!.evaluatorId).toBe('id-a'); + expect(result.EvalB!.evaluatorId).toBe('id-b'); + }); + + it('skips evaluator when Id output is missing', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalArnOutputDEF456: 'arn:eval', + }; + + const result = parseEvaluatorOutputs(outputs, ['MyEval']); + expect(result.MyEval).toBeUndefined(); + }); + + it('skips evaluator when Arn output is missing', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalIdOutputABC123: 'eval-id', + }; + + const result = parseEvaluatorOutputs(outputs, ['MyEval']); + expect(result.MyEval).toBeUndefined(); + }); + + it('returns empty record for no matching outputs', () => { + const result = parseEvaluatorOutputs({ UnrelatedOutput: 'value' }, ['MyEval']); + expect(result).toEqual({}); + }); + + it('maps PascalCase output keys back to original underscore names', () => { + // Evaluator name "my_eval" becomes "MyEval" in PascalCase + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalIdOutputA: 'id-1', + ApplicationEvaluatorMyEvalArnOutputB: 'arn:1', + }; + + const result = parseEvaluatorOutputs(outputs, ['my_eval']); + expect(result.my_eval).toBeDefined(); + expect(result.my_eval!.evaluatorId).toBe('id-1'); + }); +}); + +describe('parseOnlineEvalOutputs', () => { + it('parses online eval config Id and Arn from stack outputs', () => { + const outputs: StackOutputs = { + ApplicationOnlineEvalTestConfigIdOutputABC: 'proj_TestConfig-xyz', + ApplicationOnlineEvalTestConfigArnOutputDEF: + 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz', + }; + + const result = parseOnlineEvalOutputs(outputs, ['TestConfig']); + expect(result.TestConfig).toBeDefined(); + expect(result.TestConfig!.onlineEvaluationConfigId).toBe('proj_TestConfig-xyz'); + expect(result.TestConfig!.onlineEvaluationConfigArn).toBe( + 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz' + ); + }); + + it('parses multiple online eval configs', () => { + const outputs: StackOutputs = { + ApplicationOnlineEvalConfigAIdOutputA: 'id-a', + ApplicationOnlineEvalConfigAArnOutputB: 'arn:a', + ApplicationOnlineEvalConfigBIdOutputC: 'id-b', + ApplicationOnlineEvalConfigBArnOutputD: 'arn:b', + }; + + const result = parseOnlineEvalOutputs(outputs, ['ConfigA', 'ConfigB']); + expect(Object.keys(result)).toHaveLength(2); + expect(result.ConfigA!.onlineEvaluationConfigId).toBe('id-a'); + expect(result.ConfigB!.onlineEvaluationConfigId).toBe('id-b'); + }); + + it('skips config when Id output is missing', () => { + const outputs: StackOutputs = { + ApplicationOnlineEvalTestConfigArnOutputDEF: 'arn:config', + }; + + const result = parseOnlineEvalOutputs(outputs, ['TestConfig']); + expect(result.TestConfig).toBeUndefined(); + }); + + it('returns empty record for empty outputs', () => { + const result = parseOnlineEvalOutputs({}, ['TestConfig']); + expect(result).toEqual({}); + }); }); diff --git a/src/cli/cloudformation/outputs.ts b/src/cli/cloudformation/outputs.ts index 86ec368f..073fc05a 100644 --- a/src/cli/cloudformation/outputs.ts +++ b/src/cli/cloudformation/outputs.ts @@ -1,4 +1,11 @@ -import type { AgentCoreDeployedState, DeployedState, MemoryDeployedState, TargetDeployedState } from '../../schema'; +import type { + AgentCoreDeployedState, + DeployedState, + EvaluatorDeployedState, + MemoryDeployedState, + OnlineEvalDeployedState, + TargetDeployedState, +} from '../../schema'; import { getCredentialProvider } from '../aws'; import { toPascalId } from './logical-ids'; import { getStackName } from './stack-discovery'; @@ -202,6 +209,68 @@ export function parseMemoryOutputs(outputs: StackOutputs, memoryNames: string[]) return memories; } +/** + * Parse stack outputs into deployed state for evaluators. + * + * Output key pattern: ApplicationEvaluator{PascalName}(Id|Arn)Output{Hash} + */ +export function parseEvaluatorOutputs( + outputs: StackOutputs, + evaluatorNames: string[] +): Record { + const evaluators: Record = {}; + const outputKeys = Object.keys(outputs); + + for (const evalName of evaluatorNames) { + const pascal = toPascalId('Evaluator', evalName); + const idPrefix = `Application${pascal}IdOutput`; + const arnPrefix = `Application${pascal}ArnOutput`; + + const idKey = outputKeys.find(k => k.startsWith(idPrefix)); + const arnKey = outputKeys.find(k => k.startsWith(arnPrefix)); + + if (idKey && arnKey) { + evaluators[evalName] = { + evaluatorId: outputs[idKey]!, + evaluatorArn: outputs[arnKey]!, + }; + } + } + + return evaluators; +} + +/** + * Parse stack outputs into deployed state for online evaluation configs. + * + * Output key pattern: ApplicationOnlineEval{PascalName}(Id|Arn)Output{Hash} + */ +export function parseOnlineEvalOutputs( + outputs: StackOutputs, + onlineEvalNames: string[] +): Record { + const configs: Record = {}; + const outputKeys = Object.keys(outputs); + + for (const configName of onlineEvalNames) { + const pascal = toPascalId('OnlineEval', configName); + const idPrefix = `Application${pascal}IdOutput`; + const arnPrefix = `Application${pascal}ArnOutput`; + + const idKey = outputKeys.find(k => k.startsWith(idPrefix)); + const arnKey = outputKeys.find(k => k.startsWith(arnPrefix)); + + if (idKey && arnKey) { + configs[configName] = { + onlineEvaluationConfigId: outputs[idKey]!, + onlineEvaluationConfigArn: outputs[arnKey]!, + }; + } + } + + return configs; +} + export interface BuildDeployedStateOptions { targetName: string; stackName: string; @@ -211,13 +280,26 @@ export interface BuildDeployedStateOptions { identityKmsKeyArn?: string; credentials?: Record; memories?: Record; + evaluators?: Record; + onlineEvalConfigs?: Record; } /** * Build deployed state from stack outputs. */ export function buildDeployedState(opts: BuildDeployedStateOptions): DeployedState { - const { targetName, stackName, agents, gateways, existingState, identityKmsKeyArn, credentials, memories } = opts; + const { + targetName, + stackName, + agents, + gateways, + existingState, + identityKmsKeyArn, + credentials, + memories, + evaluators, + onlineEvalConfigs, + } = opts; const targetState: TargetDeployedState = { resources: { agents: Object.keys(agents).length > 0 ? agents : undefined, @@ -239,6 +321,16 @@ export function buildDeployedState(opts: BuildDeployedStateOptions): DeployedSta targetState.resources!.credentials = credentials; } + // Add evaluator state if evaluators exist + if (evaluators && Object.keys(evaluators).length > 0) { + targetState.resources!.evaluators = evaluators; + } + + // Add online eval config state if configs exist + if (onlineEvalConfigs && Object.keys(onlineEvalConfigs).length > 0) { + targetState.resources!.onlineEvalConfigs = onlineEvalConfigs; + } + return { targets: { ...existingState?.targets, diff --git a/src/cli/commands/deploy/actions.ts b/src/cli/commands/deploy/actions.ts index 721a050a..6289d2d2 100644 --- a/src/cli/commands/deploy/actions.ts +++ b/src/cli/commands/deploy/actions.ts @@ -6,8 +6,10 @@ import { buildDeployedState, getStackOutputs, parseAgentOutputs, + parseEvaluatorOutputs, parseGatewayOutputs, parseMemoryOutputs, + parseOnlineEvalOutputs, } from '../../cloudformation'; import { getErrorMessage } from '../../errors'; import { ExecLogger } from '../../logging'; @@ -374,6 +376,14 @@ export async function handleDeploy(options: ValidatedDeployOptions): Promise e.name); + const evaluators = parseEvaluatorOutputs(outputs, evaluatorNames); + + // Parse online eval config outputs + const onlineEvalNames = (context.projectSpec.onlineEvalConfigs ?? []).map(c => c.name); + const onlineEvalConfigs = parseOnlineEvalOutputs(outputs, onlineEvalNames); + // Parse gateway outputs const gatewaySpecs = mcpSpec?.agentCoreGateways?.reduce( @@ -395,6 +405,8 @@ export async function handleDeploy(options: ValidatedDeployOptions): Promise { + const evalCmd = program.command('eval').description(COMMAND_DESCRIPTIONS.eval); + + evalCmd + .command('list') + .description('List past eval runs') + .option('-a, --agent ', 'Filter by agent name') + .option('-n, --limit ', 'Maximum number of runs to show') + .option('--json', 'Output as JSON') + .action((cliOptions: { agent?: string; limit?: string; json?: boolean }) => { + requireProject(); + + try { + const result = handleListEvalRuns({ + agent: cliOptions.agent, + limit: cliOptions.limit ? parseInt(cliOptions.limit, 10) : undefined, + json: cliOptions.json, + }); + + if (cliOptions.json) { + console.log(JSON.stringify(result)); + process.exit(result.success ? 0 : 1); + return; + } + + if (!result.success) { + render({result.error}); + process.exit(1); + } + + const runs = result.runs ?? []; + if (runs.length === 0) { + console.log('No eval runs found. Run `agentcore run eval` to create one.'); + return; + } + + console.log( + `\n${'Run ID'.padEnd(42)} ${'Agent'.padEnd(20)} ${'Evaluators'.padEnd(30)} ${'Sessions'.padEnd(10)} Date` + ); + console.log('─'.repeat(120)); + + for (const run of runs) { + const scores = run.results.map(r => `${r.evaluator}=${r.aggregateScore.toFixed(2)}`).join(', '); + const date = new Date(run.timestamp).toLocaleDateString(); + console.log( + `${run.runId.padEnd(42)} ${run.agent.padEnd(20)} ${scores.padEnd(30)} ${String(run.sessionCount).padEnd(10)} ${date}` + ); + } + console.log(''); + } catch (error) { + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error: getErrorMessage(error) })); + } else { + render(Error: {getErrorMessage(error)}); + } + process.exit(1); + } + }); + + evalCmd + .command('get') + .description('Get details of a specific eval run') + .argument('', 'Eval run ID') + .option('--sessions', 'Show per-session score breakdown') + .option('--json', 'Output as JSON') + .action( + ( + runId: string, + cliOptions: { + sessions?: boolean; + json?: boolean; + } + ) => { + requireProject(); + + try { + const result = handleGetEvalRun({ runId, sessions: cliOptions.sessions, json: cliOptions.json }); + + if (cliOptions.json) { + console.log(JSON.stringify(result)); + process.exit(result.success ? 0 : 1); + return; + } + + if (!result.success) { + render({result.error}); + process.exit(1); + } + + const run = result.run!; + console.log(`\nEval Run: ${run.runId}`); + console.log(`Agent: ${run.agent}`); + console.log(`Date: ${new Date(run.timestamp).toISOString()}`); + console.log(`Sessions: ${run.sessionCount} | Lookback: ${run.lookbackDays}d\n`); + + for (const r of run.results) { + const errors = r.sessionScores.filter(s => s.errorMessage).length; + console.log(` ${r.evaluator}: ${r.aggregateScore.toFixed(2)}${errors > 0 ? ` (${errors} errors)` : ''}`); + + if (r.tokenUsage) { + console.log( + ` Tokens: ${r.tokenUsage.totalTokens} (in: ${r.tokenUsage.inputTokens}, out: ${r.tokenUsage.outputTokens})` + ); + } + + if (cliOptions.sessions) { + console.log(''); + for (const s of r.sessionScores) { + const status = s.errorMessage + ? `ERROR: ${s.errorMessage}` + : `${s.value.toFixed(2)}${s.label ? ` (${s.label})` : ''}`; + console.log(` session=${s.sessionId} ${status}`); + if (s.explanation) { + console.log(` ${s.explanation}`); + } + } + } + console.log(''); + } + } catch (error) { + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error: getErrorMessage(error) })); + } else { + render(Error: {getErrorMessage(error)}); + } + process.exit(1); + } + } + ); +}; diff --git a/src/cli/commands/eval/index.ts b/src/cli/commands/eval/index.ts new file mode 100644 index 00000000..5a761e17 --- /dev/null +++ b/src/cli/commands/eval/index.ts @@ -0,0 +1 @@ +export { registerEval } from './command'; diff --git a/src/cli/commands/index.ts b/src/cli/commands/index.ts index 3e1fd854..3dac1c82 100644 --- a/src/cli/commands/index.ts +++ b/src/cli/commands/index.ts @@ -3,9 +3,13 @@ export { registerAdd } from './add'; export { registerDeploy } from './deploy'; export { registerDev } from './dev'; export { registerCreate } from './create'; +export { registerEval } from './eval'; export { registerInvoke } from './invoke'; export { registerPackage } from './package'; +export { registerPause } from './pause'; export { registerRemove } from './remove'; +export { registerResume } from './resume'; +export { registerRun } from './run'; export { registerStatus } from './status'; export { registerTraces } from './traces'; export { registerUpdate } from './update'; diff --git a/src/cli/commands/logs/command.tsx b/src/cli/commands/logs/command.tsx index 977042cd..282aed81 100644 --- a/src/cli/commands/logs/command.tsx +++ b/src/cli/commands/logs/command.tsx @@ -1,15 +1,24 @@ import { getErrorMessage } from '../../errors'; +import { handleLogsEval } from '../../operations/eval'; +import type { LogsEvalOptions } from '../../operations/eval'; import { COMMAND_DESCRIPTIONS } from '../../tui/copy'; import { requireProject } from '../../tui/guards'; import { handleLogs } from './action'; import type { LogsOptions } from './types'; import type { Command } from '@commander-js/extra-typings'; import { Text, render } from 'ink'; +import React from 'react'; export const registerLogs = (program: Command) => { - program + // enablePositionalOptions + passThroughOptions ensure options like --since and --agent + // are passed to the 'eval' subcommand rather than being consumed by the parent 'logs' command. + program.enablePositionalOptions(); + + const logsCmd = program .command('logs') .alias('l') + .enablePositionalOptions() + .passThroughOptions() .description(COMMAND_DESCRIPTIONS.logs) .option('--agent ', 'Select specific agent') .option('--since