diff --git a/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap b/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap
index 0e2f5950..52c7d853 100644
--- a/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap
+++ b/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap
@@ -372,6 +372,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     },
   });
   const template = Template.fromStack(stack);
diff --git a/src/assets/cdk/test/cdk.test.ts b/src/assets/cdk/test/cdk.test.ts
index 5ff491d1..40021c58 100644
--- a/src/assets/cdk/test/cdk.test.ts
+++ b/src/assets/cdk/test/cdk.test.ts
@@ -11,6 +11,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     },
   });
   const template = Template.fromStack(stack);
diff --git a/src/cli/aws/__tests__/agentcore-control.test.ts b/src/cli/aws/__tests__/agentcore-control.test.ts
index 9ec6bae3..3683eb08 100644
--- a/src/cli/aws/__tests__/agentcore-control.test.ts
+++ b/src/cli/aws/__tests__/agentcore-control.test.ts
@@ -1,4 +1,10 @@
-import { getAgentRuntimeStatus } from '../agentcore-control.js';
+import {
+  getAgentRuntimeStatus,
+  getEvaluator,
+  getOnlineEvaluationConfig,
+  listEvaluators,
+  updateOnlineEvalExecutionStatus,
+} from '../agentcore-control.js';
 import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 const { mockSend } = vi.hoisted(() => ({
@@ -12,6 +18,18 @@ vi.mock('@aws-sdk/client-bedrock-agentcore-control', () => ({
   GetAgentRuntimeCommand: class {
     constructor(public input: unknown) {}
   },
+  GetEvaluatorCommand: class {
+    constructor(public input: unknown) {}
+  },
+  GetOnlineEvaluationConfigCommand: class {
+    constructor(public input: unknown) {}
+  },
+  ListEvaluatorsCommand: class {
+    constructor(public input: unknown) {}
+  },
+  UpdateOnlineEvaluationConfigCommand: class {
+    constructor(public input: unknown) {}
+  },
 }));
 
 vi.mock('../account', () => ({
@@ -56,3 +74,292 @@ describe('getAgentRuntimeStatus', () => {
     );
   });
 });
+// getEvaluator: response mapping, missing-ID error, request shape, level default, SDK error propagation.
+describe('getEvaluator', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns evaluator details', async () => {
+    mockSend.mockResolvedValue({
+      evaluatorId: 'eval-123',
+      evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-123',
+      evaluatorName: 'my-evaluator',
+      level: 'SESSION',
+      status: 'ACTIVE',
+      description: 'A test evaluator',
+    });
+
+    const result = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-123' });
+    expect(result.evaluatorId).toBe('eval-123');
+    expect(result.evaluatorName).toBe('my-evaluator');
+    expect(result.level).toBe('SESSION');
+    expect(result.status).toBe('ACTIVE');
+    expect(result.description).toBe('A test evaluator');
+  });
+
+  it('throws when no evaluatorId in response', async () => {
+    mockSend.mockResolvedValue({ evaluatorId: undefined });
+
+    await expect(getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-missing' })).rejects.toThrow(
+      'No evaluator found for ID eval-missing'
+    );
+  });
+
+  it('passes correct evaluatorId in command', async () => {
+    mockSend.mockResolvedValue({
+      evaluatorId: 'eval-abc',
+      evaluatorName: 'test',
+      level: 'TRACE',
+      status: 'ACTIVE',
+    });
+
+    await getEvaluator({ region: 'us-west-2', evaluatorId: 'eval-abc' });
+
+    const command = mockSend.mock.calls[0]![0]; // the stubbed command instance exposes its constructor argument as .input
+    expect(command.input.evaluatorId).toBe('eval-abc');
+  });
+
+  it('defaults level to SESSION when undefined', async () => {
+    mockSend.mockResolvedValue({
+      evaluatorId: 'eval-no-level',
+      level: undefined,
+      status: 'ACTIVE',
+    });
+
+    const result = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-no-level' });
+    expect(result.level).toBe('SESSION');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('AccessDenied'));
+
+    await expect(getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-err' })).rejects.toThrow('AccessDenied');
+  });
+});
+// updateOnlineEvalExecutionStatus: pause/resume round trips, request shape, fallback to input values, SDK errors.
+describe('updateOnlineEvalExecutionStatus', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('sends DISABLED to pause and returns result', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'cfg-123',
+      executionStatus: 'DISABLED',
+      status: 'ACTIVE',
+    });
+
+    const result = await updateOnlineEvalExecutionStatus({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-123',
+      executionStatus: 'DISABLED',
+    });
+
+    expect(result.configId).toBe('cfg-123');
+    expect(result.executionStatus).toBe('DISABLED');
+    expect(result.status).toBe('ACTIVE');
+  });
+
+  it('sends ENABLED to resume', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'cfg-456',
+      executionStatus: 'ENABLED',
+      status: 'ACTIVE',
+    });
+
+    const result = await updateOnlineEvalExecutionStatus({
+      region: 'us-west-2',
+      onlineEvaluationConfigId: 'cfg-456',
+      executionStatus: 'ENABLED',
+    });
+
+    expect(result.configId).toBe('cfg-456');
+    expect(result.executionStatus).toBe('ENABLED');
+  });
+
+  it('passes correct params in command', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'cfg-789',
+      executionStatus: 'DISABLED',
+      status: 'ACTIVE',
+    });
+
+    await updateOnlineEvalExecutionStatus({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-789',
+      executionStatus: 'DISABLED',
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.onlineEvaluationConfigId).toBe('cfg-789');
+    expect(command.input.executionStatus).toBe('DISABLED');
+  });
+
+  it('falls back to input values when response fields are undefined', async () => {
+    mockSend.mockResolvedValue({});
+
+    const result = await updateOnlineEvalExecutionStatus({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-fallback',
+      executionStatus: 'ENABLED',
+    });
+
+    expect(result.configId).toBe('cfg-fallback');
+    expect(result.executionStatus).toBe('ENABLED');
+    expect(result.status).toBe('UNKNOWN');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('Throttling'));
+
+    await expect(
+      updateOnlineEvalExecutionStatus({
+        region: 'us-east-1',
+        onlineEvaluationConfigId: 'cfg-err',
+        executionStatus: 'DISABLED',
+      })
+    ).rejects.toThrow('Throttling');
+  });
+});
+// getOnlineEvaluationConfig: field mapping (incl. CloudWatch log group), missing-ID error, failureReason, SDK errors.
+describe('getOnlineEvaluationConfig', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns config details with output log group', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-123',
+      onlineEvaluationConfigArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:online-eval/oec-123',
+      onlineEvaluationConfigName: 'my-online-eval',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+      description: 'Production eval',
+      outputConfig: {
+        cloudWatchConfig: { logGroupName: '/aws/bedrock-agentcore/evaluations/oec-123' },
+      },
+    });
+
+    const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-123' });
+    expect(result.configId).toBe('oec-123');
+    expect(result.configName).toBe('my-online-eval');
+    expect(result.status).toBe('ACTIVE');
+    expect(result.executionStatus).toBe('ENABLED');
+    expect(result.description).toBe('Production eval');
+    expect(result.outputLogGroupName).toBe('/aws/bedrock-agentcore/evaluations/oec-123');
+  });
+
+  it('throws when no configId in response', async () => {
+    mockSend.mockResolvedValue({ onlineEvaluationConfigId: undefined });
+
+    await expect(getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-missing' })).rejects.toThrow(
+      'No online evaluation config found for ID oec-missing'
+    );
+  });
+
+  it('returns failureReason when present', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-fail',
+      onlineEvaluationConfigName: 'broken-eval',
+      status: 'CREATE_FAILED',
+      executionStatus: 'DISABLED',
+      failureReason: 'IAM role not found',
+    });
+
+    const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-fail' });
+    expect(result.status).toBe('CREATE_FAILED');
+    expect(result.failureReason).toBe('IAM role not found');
+  });
+
+  it('handles missing outputConfig', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-no-output',
+      status: 'CREATING',
+      executionStatus: 'DISABLED',
+    });
+
+    const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-no-output' });
+    expect(result.outputLogGroupName).toBeUndefined();
+  });
+
+  it('passes correct configId in command', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-abc',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+    });
+
+    await getOnlineEvaluationConfig({ region: 'us-west-2', configId: 'oec-abc' });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.onlineEvaluationConfigId).toBe('oec-abc');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('ResourceNotFoundException'));
+
+    await expect(getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-err' })).rejects.toThrow(
+      'ResourceNotFoundException'
+    );
+  });
+});
+// listEvaluators: summaries, empty list, paging params. NOTE(review): deleteOnlineEvalConfig is not imported or tested here — confirm coverage elsewhere.
+describe('listEvaluators', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns evaluator summaries', async () => {
+    mockSend.mockResolvedValue({
+      evaluators: [
+        {
+          evaluatorId: 'eval-1',
+          evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-1',
+          evaluatorName: 'Faithfulness',
+          evaluatorType: 'Builtin',
+          status: 'ACTIVE',
+        },
+        {
+          evaluatorId: 'eval-2',
+          evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-2',
+          evaluatorName: 'my-custom',
+          evaluatorType: 'Custom',
+          status: 'ACTIVE',
+          description: 'A custom evaluator',
+        },
+      ],
+    });
+
+    const result = await listEvaluators({ region: 'us-east-1' });
+    expect(result.evaluators).toHaveLength(2);
+    expect(result.evaluators[0]!.evaluatorName).toBe('Faithfulness');
+    expect(result.evaluators[0]!.evaluatorType).toBe('Builtin');
+    expect(result.evaluators[1]!.evaluatorName).toBe('my-custom');
+    expect(result.evaluators[1]!.description).toBe('A custom evaluator');
+  });
+
+  it('returns empty array when no evaluators', async () => {
+    mockSend.mockResolvedValue({ evaluators: undefined });
+
+    const result = await listEvaluators({ region: 'us-east-1' });
+    expect(result.evaluators).toEqual([]);
+  });
+
+  it('passes maxResults and nextToken', async () => {
+    mockSend.mockResolvedValue({ evaluators: [], nextToken: 'token-2' });
+
+    const result = await listEvaluators({ region: 'us-east-1', maxResults: 5, nextToken: 'token-1' });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.maxResults).toBe(5);
+    expect(command.input.nextToken).toBe('token-1');
+    expect(result.nextToken).toBe('token-2');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('AccessDeniedException'));
+
+    await expect(listEvaluators({ region: 'us-east-1' })).rejects.toThrow('AccessDeniedException');
+  });
+});
diff --git a/src/cli/aws/__tests__/agentcore-evaluate.test.ts b/src/cli/aws/__tests__/agentcore-evaluate.test.ts
new file mode 100644
index 00000000..30eafffd
--- /dev/null
+++ b/src/cli/aws/__tests__/agentcore-evaluate.test.ts
@@ -0,0 +1,235 @@
+import { evaluate } from '../agentcore.js';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+// mockSend is created via vi.hoisted so the vi.mock factories below can reference it.
+const { mockSend } = vi.hoisted(() => ({
+  mockSend: vi.fn(),
+}));
+// Stub the SDK: the client's send is mockSend, and the command class only records its input for assertions.
+vi.mock('@aws-sdk/client-bedrock-agentcore', () => ({
+  BedrockAgentCoreClient: class {
+    send = mockSend;
+  },
+  EvaluateCommand: class {
+    constructor(public input: unknown) {}
+  },
+}));
+// Stub credential lookup so the tests never resolve real credentials.
+vi.mock('../account', () => ({
+  getCredentialProvider: vi.fn().mockReturnValue({}),
+}));
+// evaluate: request construction (target selection), response mapping, defaults, and error propagation.
+describe('evaluate', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('sends evaluatorId and sessionSpans in the command', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 4.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [{ traceId: 't1', spanId: 's1' }],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluatorId).toBe('eval-123');
+    expect(command.input.evaluationInput.sessionSpans).toEqual([{ traceId: 't1', spanId: 's1' }]);
+  });
+
+  it('includes spanIds target when targetSpanIds is provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+      targetSpanIds: ['span-1', 'span-2'],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toEqual({ spanIds: ['span-1', 'span-2'] });
+  });
+
+  it('includes traceIds target when targetTraceIds is provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+      targetTraceIds: ['trace-1'],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toEqual({ traceIds: ['trace-1'] });
+  });
+
+  it('prefers spanIds over traceIds when both are provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+      targetSpanIds: ['span-1'],
+      targetTraceIds: ['trace-1'],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toEqual({ spanIds: ['span-1'] });
+  });
+
+  it('omits evaluationTarget when neither targetSpanIds nor targetTraceIds provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toBeUndefined();
+  });
+
+  it('throws when evaluationResults is undefined', async () => {
+    mockSend.mockResolvedValue({ evaluationResults: undefined });
+
+    await expect(evaluate({ region: 'us-east-1', evaluatorId: 'eval-123', sessionSpans: [] })).rejects.toThrow(
+      'No evaluation results returned'
+    );
+  });
+
+  it('maps response with spanContext correctly', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          evaluatorArn: 'arn:aws:evaluator/eval-123',
+          evaluatorId: 'eval-123',
+          evaluatorName: 'MyEval',
+          explanation: 'Good quality',
+          value: 4.5,
+          label: 'Excellent',
+          errorMessage: undefined,
+          errorCode: undefined,
+          context: {
+            spanContext: {
+              sessionId: 'sess-1',
+              traceId: 'trace-1',
+              spanId: 'span-1',
+            },
+          },
+          tokenUsage: {
+            inputTokens: 100,
+            outputTokens: 50,
+            totalTokens: 150,
+          },
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    expect(result.evaluationResults).toHaveLength(1);
+    const r = result.evaluationResults[0]!;
+    expect(r.evaluatorArn).toBe('arn:aws:evaluator/eval-123');
+    expect(r.value).toBe(4.5);
+    expect(r.explanation).toBe('Good quality');
+    expect(r.context).toEqual({ sessionId: 'sess-1', traceId: 'trace-1', spanId: 'span-1' }); // spanContext is flattened into context
+    expect(r.tokenUsage).toEqual({ inputTokens: 100, outputTokens: 50, totalTokens: 150 });
+  });
+
+  it('handles response without spanContext', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          value: 3.0,
+          context: undefined,
+          tokenUsage: undefined,
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    const r = result.evaluationResults[0]!;
+    expect(r.context).toBeUndefined();
+    expect(r.tokenUsage).toBeUndefined();
+  });
+
+  it('defaults token usage values to 0 when partially undefined', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          value: 3.0,
+          tokenUsage: {
+            inputTokens: undefined,
+            outputTokens: 25,
+            totalTokens: undefined,
+          },
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    expect(result.evaluationResults[0]!.tokenUsage).toEqual({
+      inputTokens: 0,
+      outputTokens: 25,
+      totalTokens: 0,
+    });
+  });
+
+  it('maps error results correctly', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          value: 0,
+          errorMessage: 'Prompt template missing required field',
+          errorCode: 'TEMPLATE_ERROR',
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    const r = result.evaluationResults[0]!;
+    expect(r.errorMessage).toBe('Prompt template missing required field');
+    expect(r.errorCode).toBe('TEMPLATE_ERROR');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('AccessDeniedException'));
+
+    await expect(evaluate({ region: 'us-east-1', evaluatorId: 'eval-123', sessionSpans: [] })).rejects.toThrow(
+      'AccessDeniedException'
+    );
+  });
+});
diff --git a/src/cli/aws/agentcore-control.ts b/src/cli/aws/agentcore-control.ts
index 84ba4766..40db8e48 100644
--- a/src/cli/aws/agentcore-control.ts
+++ b/src/cli/aws/agentcore-control.ts
@@ -1,5 +1,13 @@
 import { getCredentialProvider } from './account';
-import { BedrockAgentCoreControlClient, GetAgentRuntimeCommand } from '@aws-sdk/client-bedrock-agentcore-control';
+import {
+  BedrockAgentCoreControlClient,
+  DeleteOnlineEvaluationConfigCommand,
+  GetAgentRuntimeCommand,
+  GetEvaluatorCommand,
+  GetOnlineEvaluationConfigCommand,
+  ListEvaluatorsCommand,
+  UpdateOnlineEvaluationConfigCommand,
+} from '@aws-sdk/client-bedrock-agentcore-control';
 
 export interface GetAgentRuntimeStatusOptions {
   region: string;
@@ -35,3 +43,237 @@ export async function getAgentRuntimeStatus(options: GetAgentRuntimeStatusOption
     status: response.status,
   };
 }
+
+// ============================================================================
+// Evaluator
+// ============================================================================
+
+export interface GetEvaluatorOptions {
+  region: string;
+  evaluatorId: string;
+}
+
+export interface GetEvaluatorResult {
+  evaluatorId: string;
+  evaluatorArn: string;
+  evaluatorName: string;
+  level: string;
+  status: string;
+  description?: string;
+}
+/** Fetches one evaluator by ID. Throws if the response has no evaluatorId; other missing fields get defaults. */
+export async function getEvaluator(options: GetEvaluatorOptions): Promise<GetEvaluatorResult> {
+  const client = new BedrockAgentCoreControlClient({
+    region: options.region,
+    credentials: getCredentialProvider(),
+  });
+
+  const command = new GetEvaluatorCommand({
+    evaluatorId: options.evaluatorId,
+  });
+
+  const response = await client.send(command);
+
+  if (!response.evaluatorId) {
+    throw new Error(`No evaluator found for ID ${options.evaluatorId}`);
+  }
+
+  return {
+    evaluatorId: response.evaluatorId,
+    evaluatorArn: response.evaluatorArn ?? '',
+    evaluatorName: response.evaluatorName ?? '',
+    level: response.level ?? 'SESSION',
+    status: response.status ?? 'UNKNOWN',
+    description: response.description,
+  };
+}
+
+export interface ListEvaluatorsOptions {
+  region: string;
+  maxResults?: number;
+  nextToken?: string;
+}
+
+export interface EvaluatorSummary {
+  evaluatorId: string;
+  evaluatorArn: string;
+  evaluatorName: string;
+  evaluatorType: string;
+  level?: string;
+  status: string;
+  description?: string;
+}
+
+export interface ListEvaluatorsResult {
+  evaluators: EvaluatorSummary[];
+  nextToken?: string;
+}
+/** Issues one ListEvaluators call, forwarding maxResults/nextToken; returns the summaries and the response's nextToken. */
+export async function listEvaluators(options: ListEvaluatorsOptions): Promise<ListEvaluatorsResult> {
+  const client = new BedrockAgentCoreControlClient({
+    region: options.region,
+    credentials: getCredentialProvider(),
+  });
+
+  const command = new ListEvaluatorsCommand({
+    maxResults: options.maxResults,
+    nextToken: options.nextToken,
+  });
+
+  const response = await client.send(command);
+
+  return {
+    evaluators: (response.evaluators ?? []).map(e => ({
+      evaluatorId: e.evaluatorId ?? '',
+      evaluatorArn: e.evaluatorArn ?? '',
+      evaluatorName: e.evaluatorName ?? '',
+      evaluatorType: e.evaluatorType ?? 'Custom',
+      level: e.level,
+      status: e.status ?? 'UNKNOWN',
+      description: e.description,
+    })),
+    nextToken: response.nextToken,
+  };
+}
+
+// ============================================================================
+// Online Eval Config
+// ============================================================================
+
+export type OnlineEvalExecutionStatus = 'ENABLED' | 'DISABLED';
+
+export interface UpdateOnlineEvalStatusOptions {
+  region: string;
+  onlineEvaluationConfigId: string;
+  executionStatus: OnlineEvalExecutionStatus;
+}
+
+export interface UpdateOnlineEvalOptions {
+  region: string;
+  onlineEvaluationConfigId: string;
+  executionStatus?: OnlineEvalExecutionStatus;
+  description?: string;
+}
+
+export interface UpdateOnlineEvalStatusResult {
+  configId: string;
+  executionStatus: string;
+  status: string;
+}
+
+/**
+ * Update the execution status of an online evaluation config (pause/resume). Delegates to updateOnlineEvalConfig.
+ */
+export async function updateOnlineEvalExecutionStatus(
+  options: UpdateOnlineEvalStatusOptions
+): Promise<UpdateOnlineEvalStatusResult> {
+  return updateOnlineEvalConfig(options);
+}
+
+/**
+ * Update an online evaluation config with any supported fields. Result fields absent from the response fall back to the input or 'UNKNOWN'.
+ */
+export async function updateOnlineEvalConfig(options: UpdateOnlineEvalOptions): Promise<UpdateOnlineEvalStatusResult> {
+  const client = new BedrockAgentCoreControlClient({
+    region: options.region,
+    credentials: getCredentialProvider(),
+  });
+
+  const command = new UpdateOnlineEvaluationConfigCommand({
+    onlineEvaluationConfigId: options.onlineEvaluationConfigId,
+    ...(options.executionStatus && { executionStatus: options.executionStatus }), // omitted from the request when unset
+    ...(options.description !== undefined && { description: options.description }), // '' is still sent; only undefined is skipped
+  });
+
+  const response = await client.send(command);
+
+  return {
+    configId: response.onlineEvaluationConfigId ?? options.onlineEvaluationConfigId,
+    executionStatus: response.executionStatus ?? options.executionStatus ?? 'UNKNOWN',
+    status: response.status ?? 'UNKNOWN',
+  };
+}
+
+export interface GetOnlineEvalConfigOptions {
+  region: string;
+  configId: string;
+}
+
+export interface GetOnlineEvalConfigResult {
+  configId: string;
+  configArn: string;
+  configName: string;
+  status: string;
+  executionStatus: string;
+  description?: string;
+  failureReason?: string;
+  outputLogGroupName?: string;
+}
+/** Fetches one online evaluation config by ID. Throws if the response has no onlineEvaluationConfigId. */
+export async function getOnlineEvaluationConfig(
+  options: GetOnlineEvalConfigOptions
+): Promise<GetOnlineEvalConfigResult> {
+  const client = new BedrockAgentCoreControlClient({
+    region: options.region,
+    credentials: getCredentialProvider(),
+  });
+
+  const command = new GetOnlineEvaluationConfigCommand({
+    onlineEvaluationConfigId: options.configId,
+  });
+
+  const response = await client.send(command);
+
+  if (!response.onlineEvaluationConfigId) {
+    throw new Error(`No online evaluation config found for ID ${options.configId}`);
+  }
+
+  const logGroupName = response.outputConfig?.cloudWatchConfig?.logGroupName; // undefined when no CloudWatch output is present
+
+  return {
+    configId: response.onlineEvaluationConfigId,
+    configArn: response.onlineEvaluationConfigArn ?? '',
+    configName: response.onlineEvaluationConfigName ?? '',
+    status: response.status ?? 'UNKNOWN',
+    executionStatus: response.executionStatus ?? 'UNKNOWN',
+    description: response.description,
+    failureReason: response.failureReason,
+    outputLogGroupName: logGroupName,
+  };
+}
+
+// ============================================================================
+// Delete Online Eval Config
+// ============================================================================
+
+export interface DeleteOnlineEvalConfigOptions {
+  region: string;
+  onlineEvaluationConfigId: string;
+}
+
+export interface DeleteOnlineEvalConfigResult {
+  configId: string;
+  configArn: string;
+  status: string;
+}
+/** Deletes an online evaluation config. Fields absent from the response fall back to the input ID, '' and 'DELETING'. */
+export async function deleteOnlineEvalConfig(
+  options: DeleteOnlineEvalConfigOptions
+): Promise<DeleteOnlineEvalConfigResult> {
+  const client = new BedrockAgentCoreControlClient({
+    region: options.region,
+    credentials: getCredentialProvider(),
+  });
+
+  const command = new DeleteOnlineEvaluationConfigCommand({
+    onlineEvaluationConfigId: options.onlineEvaluationConfigId,
+  });
+
+  const response = await client.send(command);
+
+  return {
+    configId: response.onlineEvaluationConfigId ?? options.onlineEvaluationConfigId,
+    configArn: response.onlineEvaluationConfigArn ?? '',
+    status: response.status ?? 'DELETING',
+  };
+}
diff --git a/src/cli/aws/agentcore.ts b/src/cli/aws/agentcore.ts
index 8baf9f72..e84bd6a1 100644
--- a/src/cli/aws/agentcore.ts
+++ b/src/cli/aws/agentcore.ts
@@ -1,9 +1,11 @@
 import { getCredentialProvider } from './account';
 import {
   BedrockAgentCoreClient,
+  EvaluateCommand,
   InvokeAgentRuntimeCommand,
   StopRuntimeSessionCommand,
 } from '@aws-sdk/client-bedrock-agentcore';
+import type { DocumentType } from '@smithy/types';
 
 /** Logger interface for SSE events */
 export interface SSELogger {
@@ -234,6 +236,108 @@ export async function invokeAgentRuntime(options: InvokeAgentRuntimeOptions): Pr
   };
 }
 
+// ============================================================================
+// Evaluate
+// ============================================================================
+
+export interface EvaluateOptions {
+  region: string;
+  evaluatorId: string;
+  sessionSpans: DocumentType[];
+  targetSpanIds?: string[];
+  targetTraceIds?: string[];
+}
+
+export interface EvaluationResultContext {
+  sessionId: string | undefined;
+  traceId: string | undefined;
+  spanId: string | undefined;
+}
+
+export interface EvaluationResultTokenUsage {
+  inputTokens: number;
+  outputTokens: number;
+  totalTokens: number;
+}
+
+export interface EvaluationResult {
+  evaluatorArn: string | undefined;
+  evaluatorId: string | undefined;
+  evaluatorName: string | undefined;
+  explanation: string | undefined;
+  value: number | undefined;
+  label: string | undefined;
+  errorMessage: string | undefined;
+  errorCode: string | undefined;
+  context: EvaluationResultContext | undefined;
+  tokenUsage: EvaluationResultTokenUsage | undefined;
+}
+
+export interface EvaluateResult {
+  evaluationResults: EvaluationResult[];
+}
+
+/**
+ * Run on-demand evaluation of agent traces using a specified evaluator. Throws if the response carries no evaluationResults.
+ */
+export async function evaluate(options: EvaluateOptions): Promise<EvaluateResult> {
+  const client = new BedrockAgentCoreClient({
+    region: options.region,
+    credentials: getCredentialProvider(),
+  });
+
+  const evaluationTarget = options.targetSpanIds // span IDs take precedence when both targets are supplied
+    ? { spanIds: options.targetSpanIds }
+    : options.targetTraceIds
+      ? { traceIds: options.targetTraceIds }
+      : undefined;
+
+  const command = new EvaluateCommand({
+    evaluatorId: options.evaluatorId,
+    evaluationInput: {
+      sessionSpans: options.sessionSpans,
+    },
+    ...(evaluationTarget ? { evaluationTarget } : {}), // key is left out entirely when no target was requested
+  });
+
+  const response = await client.send(command);
+
+  if (!response.evaluationResults) {
+    throw new Error('No evaluation results returned');
+  }
+
+  return {
+    evaluationResults: response.evaluationResults.map(r => {
+      const spanContext = r.context && 'spanContext' in r.context ? r.context.spanContext : undefined;
+
+      return {
+        evaluatorArn: r.evaluatorArn,
+        evaluatorId: r.evaluatorId,
+        evaluatorName: r.evaluatorName,
+        explanation: r.explanation,
+        value: r.value,
+        label: r.label,
+        errorMessage: r.errorMessage,
+        errorCode: r.errorCode,
+        context: spanContext // flatten spanContext into the result's context
+          ? {
+              sessionId: spanContext.sessionId,
+              traceId: spanContext.traceId,
+              spanId: spanContext.spanId,
+            }
+          : undefined,
+        tokenUsage: r.tokenUsage // missing counters default to 0 once a usage object exists
+          ? {
+              inputTokens: r.tokenUsage.inputTokens ?? 0,
+              outputTokens: r.tokenUsage.outputTokens ?? 0,
+              totalTokens: r.tokenUsage.totalTokens ?? 0,
+            }
+          : undefined,
+      };
+    }),
+  };
+}
+
 /**
  * Stop a runtime session.
  */
diff --git a/src/cli/cli.ts b/src/cli/cli.ts
index 4d992ad7..dc3b22e3 100644
--- a/src/cli/cli.ts
+++ b/src/cli/cli.ts
@@ -2,11 +2,15 @@ import { registerAdd } from './commands/add';
 import { registerCreate } from './commands/create';
 import { registerDeploy } from './commands/deploy';
 import { registerDev } from './commands/dev';
+import { registerEval } from './commands/eval';
 import { registerHelp } from './commands/help';
 import { registerInvoke } from './commands/invoke';
 import { registerLogs } from './commands/logs';
 import { registerPackage } from './commands/package';
+import { registerPause, registerStop } from './commands/pause';
 import { registerRemove } from './commands/remove';
+import { registerResume } from './commands/resume';
+import { registerRun } from './commands/run';
 import { registerStatus } from './commands/status';
 import { registerTraces } from './commands/traces';
 import { registerUpdate } from './commands/update';
@@ -130,11 +134,16 @@ export function registerCommands(program: Command) {
   registerDev(program);
   registerDeploy(program);
   registerCreate(program);
+  registerEval(program);
   registerHelp(program);
   registerInvoke(program);
   registerLogs(program);
   registerPackage(program);
+  registerPause(program);
   const removeCmd = registerRemove(program);
+  registerResume(program);
+  registerRun(program);
+  registerStop(program);
   registerStatus(program);
   registerTraces(program);
   registerUpdate(program);
diff --git a/src/cli/cloudformation/__tests__/outputs-extended.test.ts b/src/cli/cloudformation/__tests__/outputs-extended.test.ts
index 85aab1c8..16112c58 100644
--- a/src/cli/cloudformation/__tests__/outputs-extended.test.ts
+++ b/src/cli/cloudformation/__tests__/outputs-extended.test.ts
@@ -1,4 +1,4 @@
-import { buildDeployedState, parseAgentOutputs } from '../outputs.js';
+import { buildDeployedState, parseAgentOutputs, parseEvaluatorOutputs, parseOnlineEvalOutputs } from '../outputs.js';
 import type { StackOutputs } from '../outputs.js';
import { describe, expect, it } from 'vitest'; @@ -233,4 +233,170 @@ describe('buildDeployedState', () => { const state = buildDeployedState({ targetName: 'default', stackName: 'Stack', agents: {}, gateways: {} }); expect(state.targets.default!.resources?.agents).toBeUndefined(); }); + + it('includes evaluators in deployed state when provided', () => { + const evaluators = { + MyEval: { + evaluatorId: 'proj_MyEval-abc', + evaluatorArn: 'arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc', + }, + }; + + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + evaluators, + }); + expect(state.targets.default!.resources?.evaluators).toEqual(evaluators); + }); + + it('omits evaluators from deployed state when empty', () => { + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + evaluators: {}, + }); + expect(state.targets.default!.resources?.evaluators).toBeUndefined(); + }); + + it('includes onlineEvalConfigs in deployed state when provided', () => { + const onlineEvalConfigs = { + TestConfig: { + onlineEvaluationConfigId: 'proj_TestConfig-xyz', + onlineEvaluationConfigArn: 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz', + }, + }; + + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + onlineEvalConfigs, + }); + expect(state.targets.default!.resources?.onlineEvalConfigs).toEqual(onlineEvalConfigs); + }); + + it('omits onlineEvalConfigs from deployed state when empty', () => { + const state = buildDeployedState({ + targetName: 'default', + stackName: 'Stack', + agents: {}, + gateways: {}, + onlineEvalConfigs: {}, + }); + expect(state.targets.default!.resources?.onlineEvalConfigs).toBeUndefined(); + }); +}); + +describe('parseEvaluatorOutputs', () => { + it('parses evaluator Id and Arn from stack outputs', () => { + const outputs: StackOutputs = { + 
ApplicationEvaluatorMyEvalIdOutputABC123: 'proj_MyEval-abc', + ApplicationEvaluatorMyEvalArnOutputDEF456: 'arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc', + }; + + const result = parseEvaluatorOutputs(outputs, ['MyEval']); + expect(result.MyEval).toBeDefined(); + expect(result.MyEval!.evaluatorId).toBe('proj_MyEval-abc'); + expect(result.MyEval!.evaluatorArn).toBe('arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc'); + }); + + it('parses multiple evaluators', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorEvalAIdOutputA: 'id-a', + ApplicationEvaluatorEvalAArnOutputB: 'arn:a', + ApplicationEvaluatorEvalBIdOutputC: 'id-b', + ApplicationEvaluatorEvalBArnOutputD: 'arn:b', + }; + + const result = parseEvaluatorOutputs(outputs, ['EvalA', 'EvalB']); + expect(Object.keys(result)).toHaveLength(2); + expect(result.EvalA!.evaluatorId).toBe('id-a'); + expect(result.EvalB!.evaluatorId).toBe('id-b'); + }); + + it('skips evaluator when Id output is missing', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalArnOutputDEF456: 'arn:eval', + }; + + const result = parseEvaluatorOutputs(outputs, ['MyEval']); + expect(result.MyEval).toBeUndefined(); + }); + + it('skips evaluator when Arn output is missing', () => { + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalIdOutputABC123: 'eval-id', + }; + + const result = parseEvaluatorOutputs(outputs, ['MyEval']); + expect(result.MyEval).toBeUndefined(); + }); + + it('returns empty record for no matching outputs', () => { + const result = parseEvaluatorOutputs({ UnrelatedOutput: 'value' }, ['MyEval']); + expect(result).toEqual({}); + }); + + it('maps PascalCase output keys back to original underscore names', () => { + // Evaluator name "my_eval" becomes "MyEval" in PascalCase + const outputs: StackOutputs = { + ApplicationEvaluatorMyEvalIdOutputA: 'id-1', + ApplicationEvaluatorMyEvalArnOutputB: 'arn:1', + }; + + const result = parseEvaluatorOutputs(outputs, ['my_eval']); + 
expect(result.my_eval).toBeDefined(); + expect(result.my_eval!.evaluatorId).toBe('id-1'); + }); +}); + +describe('parseOnlineEvalOutputs', () => { + it('parses online eval config Id and Arn from stack outputs', () => { + const outputs: StackOutputs = { + ApplicationOnlineEvalTestConfigIdOutputABC: 'proj_TestConfig-xyz', + ApplicationOnlineEvalTestConfigArnOutputDEF: + 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz', + }; + + const result = parseOnlineEvalOutputs(outputs, ['TestConfig']); + expect(result.TestConfig).toBeDefined(); + expect(result.TestConfig!.onlineEvaluationConfigId).toBe('proj_TestConfig-xyz'); + expect(result.TestConfig!.onlineEvaluationConfigArn).toBe( + 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz' + ); + }); + + it('parses multiple online eval configs', () => { + const outputs: StackOutputs = { + ApplicationOnlineEvalConfigAIdOutputA: 'id-a', + ApplicationOnlineEvalConfigAArnOutputB: 'arn:a', + ApplicationOnlineEvalConfigBIdOutputC: 'id-b', + ApplicationOnlineEvalConfigBArnOutputD: 'arn:b', + }; + + const result = parseOnlineEvalOutputs(outputs, ['ConfigA', 'ConfigB']); + expect(Object.keys(result)).toHaveLength(2); + expect(result.ConfigA!.onlineEvaluationConfigId).toBe('id-a'); + expect(result.ConfigB!.onlineEvaluationConfigId).toBe('id-b'); + }); + + it('skips config when Id output is missing', () => { + const outputs: StackOutputs = { + ApplicationOnlineEvalTestConfigArnOutputDEF: 'arn:config', + }; + + const result = parseOnlineEvalOutputs(outputs, ['TestConfig']); + expect(result.TestConfig).toBeUndefined(); + }); + + it('returns empty record for empty outputs', () => { + const result = parseOnlineEvalOutputs({}, ['TestConfig']); + expect(result).toEqual({}); + }); }); diff --git a/src/cli/cloudformation/outputs.ts b/src/cli/cloudformation/outputs.ts index 86ec368f..073fc05a 100644 --- a/src/cli/cloudformation/outputs.ts +++ b/src/cli/cloudformation/outputs.ts @@ -1,4 +1,11 @@ 
-import type { AgentCoreDeployedState, DeployedState, MemoryDeployedState, TargetDeployedState } from '../../schema'; +import type { + AgentCoreDeployedState, + DeployedState, + EvaluatorDeployedState, + MemoryDeployedState, + OnlineEvalDeployedState, + TargetDeployedState, +} from '../../schema'; import { getCredentialProvider } from '../aws'; import { toPascalId } from './logical-ids'; import { getStackName } from './stack-discovery'; @@ -202,6 +209,68 @@ export function parseMemoryOutputs(outputs: StackOutputs, memoryNames: string[]) return memories; } +/** + * Parse stack outputs into deployed state for evaluators. + * + * Output key pattern: ApplicationEvaluator{PascalName}(Id|Arn)Output{Hash} + */ +export function parseEvaluatorOutputs( + outputs: StackOutputs, + evaluatorNames: string[] +): Record { + const evaluators: Record = {}; + const outputKeys = Object.keys(outputs); + + for (const evalName of evaluatorNames) { + const pascal = toPascalId('Evaluator', evalName); + const idPrefix = `Application${pascal}IdOutput`; + const arnPrefix = `Application${pascal}ArnOutput`; + + const idKey = outputKeys.find(k => k.startsWith(idPrefix)); + const arnKey = outputKeys.find(k => k.startsWith(arnPrefix)); + + if (idKey && arnKey) { + evaluators[evalName] = { + evaluatorId: outputs[idKey]!, + evaluatorArn: outputs[arnKey]!, + }; + } + } + + return evaluators; +} + +/** + * Parse stack outputs into deployed state for online evaluation configs. 
+ * + * Output key pattern: ApplicationOnlineEval{PascalName}(Id|Arn)Output{Hash} + */ +export function parseOnlineEvalOutputs( + outputs: StackOutputs, + onlineEvalNames: string[] +): Record { + const configs: Record = {}; + const outputKeys = Object.keys(outputs); + + for (const configName of onlineEvalNames) { + const pascal = toPascalId('OnlineEval', configName); + const idPrefix = `Application${pascal}IdOutput`; + const arnPrefix = `Application${pascal}ArnOutput`; + + const idKey = outputKeys.find(k => k.startsWith(idPrefix)); + const arnKey = outputKeys.find(k => k.startsWith(arnPrefix)); + + if (idKey && arnKey) { + configs[configName] = { + onlineEvaluationConfigId: outputs[idKey]!, + onlineEvaluationConfigArn: outputs[arnKey]!, + }; + } + } + + return configs; +} + export interface BuildDeployedStateOptions { targetName: string; stackName: string; @@ -211,13 +280,26 @@ export interface BuildDeployedStateOptions { identityKmsKeyArn?: string; credentials?: Record; memories?: Record; + evaluators?: Record; + onlineEvalConfigs?: Record; } /** * Build deployed state from stack outputs. */ export function buildDeployedState(opts: BuildDeployedStateOptions): DeployedState { - const { targetName, stackName, agents, gateways, existingState, identityKmsKeyArn, credentials, memories } = opts; + const { + targetName, + stackName, + agents, + gateways, + existingState, + identityKmsKeyArn, + credentials, + memories, + evaluators, + onlineEvalConfigs, + } = opts; const targetState: TargetDeployedState = { resources: { agents: Object.keys(agents).length > 0 ? 
agents : undefined, @@ -239,6 +321,16 @@ export function buildDeployedState(opts: BuildDeployedStateOptions): DeployedSta targetState.resources!.credentials = credentials; } + // Add evaluator state if evaluators exist + if (evaluators && Object.keys(evaluators).length > 0) { + targetState.resources!.evaluators = evaluators; + } + + // Add online eval config state if configs exist + if (onlineEvalConfigs && Object.keys(onlineEvalConfigs).length > 0) { + targetState.resources!.onlineEvalConfigs = onlineEvalConfigs; + } + return { targets: { ...existingState?.targets, diff --git a/src/cli/commands/create/action.ts b/src/cli/commands/create/action.ts index c99f69dc..eba7385b 100644 --- a/src/cli/commands/create/action.ts +++ b/src/cli/commands/create/action.ts @@ -28,6 +28,8 @@ function createDefaultProjectSpec(projectName: string): AgentCoreProjectSpec { agents: [], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }; } diff --git a/src/cli/commands/deploy/actions.ts b/src/cli/commands/deploy/actions.ts index 721a050a..6289d2d2 100644 --- a/src/cli/commands/deploy/actions.ts +++ b/src/cli/commands/deploy/actions.ts @@ -6,8 +6,10 @@ import { buildDeployedState, getStackOutputs, parseAgentOutputs, + parseEvaluatorOutputs, parseGatewayOutputs, parseMemoryOutputs, + parseOnlineEvalOutputs, } from '../../cloudformation'; import { getErrorMessage } from '../../errors'; import { ExecLogger } from '../../logging'; @@ -374,6 +376,14 @@ export async function handleDeploy(options: ValidatedDeployOptions): Promise e.name); + const evaluators = parseEvaluatorOutputs(outputs, evaluatorNames); + + // Parse online eval config outputs + const onlineEvalNames = (context.projectSpec.onlineEvalConfigs ?? 
[]).map(c => c.name); + const onlineEvalConfigs = parseOnlineEvalOutputs(outputs, onlineEvalNames); + // Parse gateway outputs const gatewaySpecs = mcpSpec?.agentCoreGateways?.reduce( @@ -395,6 +405,8 @@ export async function handleDeploy(options: ValidatedDeployOptions): Promise { + const evalCmd = program.command('eval').description(COMMAND_DESCRIPTIONS.eval); + + evalCmd + .command('history') + .description('Show past eval run results') + .option('-a, --agent ', 'Filter by agent name') + .option('-n, --limit ', 'Maximum number of runs to show') + .option('--json', 'Output as JSON') + .action((cliOptions: { agent?: string; limit?: string; json?: boolean }) => { + requireProject(); + + try { + const result = handleListEvalRuns({ + agent: cliOptions.agent, + limit: cliOptions.limit ? parseInt(cliOptions.limit, 10) : undefined, + json: cliOptions.json, + }); + + if (cliOptions.json) { + console.log(JSON.stringify(result)); + process.exit(result.success ? 0 : 1); + return; + } + + if (!result.success) { + render({result.error}); + process.exit(1); + } + + const runs = result.runs ?? []; + if (runs.length === 0) { + console.log('No eval runs found. 
Run `agentcore run eval` to create one.'); + return; + } + + console.log(`\n${'Date'.padEnd(22)} ${'Agent'.padEnd(20)} ${'Evaluators'.padEnd(30)} Sessions`); + console.log('─'.repeat(90)); + + for (const run of runs) { + const scores = run.results.map(r => `${r.evaluator}=${r.aggregateScore.toFixed(2)}`).join(', '); + const date = new Date(run.timestamp).toLocaleString([], { + year: 'numeric', + month: 'short', + day: 'numeric', + hour: '2-digit', + minute: '2-digit', + }); + console.log(`${date.padEnd(22)} ${run.agent.padEnd(20)} ${scores.padEnd(30)} ${run.sessionCount}`); + } + + try { + console.log(`\nResults saved in: ${getResultsPath()}`); + } catch { + // ignore — no project context + } + console.log(''); + } catch (error) { + if (cliOptions.json) { + console.log(JSON.stringify({ success: false, error: getErrorMessage(error) })); + } else { + render(Error: {getErrorMessage(error)}); + } + process.exit(1); + } + }); +}; diff --git a/src/cli/commands/eval/index.ts b/src/cli/commands/eval/index.ts new file mode 100644 index 00000000..5a761e17 --- /dev/null +++ b/src/cli/commands/eval/index.ts @@ -0,0 +1 @@ +export { registerEval } from './command'; diff --git a/src/cli/commands/index.ts b/src/cli/commands/index.ts index 3e1fd854..3dac1c82 100644 --- a/src/cli/commands/index.ts +++ b/src/cli/commands/index.ts @@ -3,9 +3,13 @@ export { registerAdd } from './add'; export { registerDeploy } from './deploy'; export { registerDev } from './dev'; export { registerCreate } from './create'; +export { registerEval } from './eval'; export { registerInvoke } from './invoke'; export { registerPackage } from './package'; +export { registerPause } from './pause'; export { registerRemove } from './remove'; +export { registerResume } from './resume'; +export { registerRun } from './run'; export { registerStatus } from './status'; export { registerTraces } from './traces'; export { registerUpdate } from './update'; diff --git a/src/cli/commands/logs/__tests__/action.test.ts 
b/src/cli/commands/logs/__tests__/action.test.ts index 81e1f39f..9f41b66f 100644 --- a/src/cli/commands/logs/__tests__/action.test.ts +++ b/src/cli/commands/logs/__tests__/action.test.ts @@ -55,6 +55,8 @@ describe('resolveAgentContext', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, deployedState: { targets: { @@ -111,6 +113,8 @@ describe('resolveAgentContext', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, }); const result = resolveAgentContext(context, {}); @@ -147,6 +151,8 @@ describe('resolveAgentContext', () => { ], memories: [], credentials: [], + evaluators: [], + onlineEvalConfigs: [], }, deployedState: { targets: { @@ -187,7 +193,15 @@ describe('resolveAgentContext', () => { it('errors when no agents defined', () => { const context = makeContext({ - project: { name: 'TestProject', version: 1, agents: [], memories: [], credentials: [] }, + project: { + name: 'TestProject', + version: 1, + agents: [], + memories: [], + credentials: [], + evaluators: [], + onlineEvalConfigs: [], + }, }); const result = resolveAgentContext(context, {}); expect(result.success).toBe(false); diff --git a/src/cli/commands/logs/command.tsx b/src/cli/commands/logs/command.tsx index 977042cd..282aed81 100644 --- a/src/cli/commands/logs/command.tsx +++ b/src/cli/commands/logs/command.tsx @@ -1,15 +1,24 @@ import { getErrorMessage } from '../../errors'; +import { handleLogsEval } from '../../operations/eval'; +import type { LogsEvalOptions } from '../../operations/eval'; import { COMMAND_DESCRIPTIONS } from '../../tui/copy'; import { requireProject } from '../../tui/guards'; import { handleLogs } from './action'; import type { LogsOptions } from './types'; import type { Command } from '@commander-js/extra-typings'; import { Text, render } from 'ink'; +import React from 'react'; export const registerLogs = (program: Command) => { - program + // enablePositionalOptions + passThroughOptions ensure options 
like --since and --agent + // are passed to the 'eval' subcommand rather than being consumed by the parent 'logs' command. + program.enablePositionalOptions(); + + const logsCmd = program .command('logs') .alias('l') + .enablePositionalOptions() + .passThroughOptions() .description(COMMAND_DESCRIPTIONS.logs) .option('--agent ', 'Select specific agent') .option('--since