From 076a794bc2101784b7fc2944ff988fa7f274c46e Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Sun, 8 Mar 2026 00:14:10 -0500
Subject: [PATCH 1/9] feat: add evals control plane operations

---
 .../assets.snapshot.test.ts.snap              |   2 +
 src/assets/cdk/test/cdk.test.ts               |   2 +
 src/cli/commands/create/action.ts             |   2 +
 .../commands/logs/__tests__/action.test.ts    |  16 +-
 src/cli/commands/remove/command.tsx           |   2 +
 src/cli/commands/remove/types.ts              |   2 +-
 src/cli/commands/status/action.ts             |  18 +-
 src/cli/commands/status/command.tsx           |  22 +-
 .../__tests__/checks-extended.test.ts         |  20 ++
 src/cli/logging/remove-logger.ts              |   2 +-
 .../agent/generate/write-agent-to-project.ts  |   2 +
 .../deploy/__tests__/preflight.test.ts        |   2 +-
 src/cli/operations/deploy/preflight.ts        |   5 +-
 .../operations/dev/__tests__/config.test.ts   |  34 +++
 src/cli/primitives/EvaluatorPrimitive.ts      | 222 ++++++++++++++++
 .../primitives/OnlineEvalConfigPrimitive.ts   | 219 ++++++++++++++++
 src/cli/primitives/index.ts                   |   4 +
 src/cli/primitives/registry.ts                |   6 +
 src/cli/tui/components/ResourceGraph.tsx      |   2 +
 src/cli/tui/hooks/useCreateEvaluator.ts       |  56 ++++
 src/cli/tui/hooks/useCreateOnlineEval.ts      |  59 +++++
 src/cli/tui/hooks/useRemove.ts                |  38 +++
 src/cli/tui/screens/add/AddFlow.tsx           |  36 +++
 src/cli/tui/screens/add/AddScreen.tsx         |   2 +
 src/cli/tui/screens/create/useCreateFlow.ts   |   2 +
 .../screens/evaluator/AddEvaluatorFlow.tsx    |  76 ++++++
 .../screens/evaluator/AddEvaluatorScreen.tsx  | 164 ++++++++++++
 src/cli/tui/screens/evaluator/index.ts        |   2 +
 src/cli/tui/screens/evaluator/types.ts        | 131 ++++++++++
 .../evaluator/useAddEvaluatorWizard.ts        | 121 +++++++++
 .../screens/online-eval/AddOnlineEvalFlow.tsx |  86 +++++++
 .../online-eval/AddOnlineEvalScreen.tsx       | 151 +++++++++++
 src/cli/tui/screens/online-eval/index.ts      |   2 +
 src/cli/tui/screens/online-eval/types.ts      |  41 +++
 .../online-eval/useAddOnlineEvalWizard.ts     |  86 +++++++
 .../screens/remove/RemoveEvaluatorScreen.tsx  |  26 ++
 src/cli/tui/screens/remove/RemoveFlow.tsx     | 243 +++++++++++++++++-
 .../screens/remove/RemoveOnlineEvalScreen.tsx |  26 ++
 src/cli/tui/screens/remove/RemoveScreen.tsx   |  22 +-
 .../remove/__tests__/RemoveScreen.test.tsx    |   4 +
 src/cli/tui/screens/remove/index.ts           |   2 +
 src/cli/tui/screens/remove/useRemoveFlow.ts   |   2 +
 src/schema/schemas/agentcore-project.ts       | 136 +++++++---
 src/schema/schemas/primitives/evaluator.ts    |  74 ++++++
 src/schema/schemas/primitives/index.ts        |  21 ++
 .../schemas/primitives/online-eval-config.ts  |  29 +++
 46 files changed, 2175 insertions(+), 47 deletions(-)
 create mode 100644 src/cli/primitives/EvaluatorPrimitive.ts
 create mode 100644 src/cli/primitives/OnlineEvalConfigPrimitive.ts
 create mode 100644 src/cli/tui/hooks/useCreateEvaluator.ts
 create mode 100644 src/cli/tui/hooks/useCreateOnlineEval.ts
 create mode 100644 src/cli/tui/screens/evaluator/AddEvaluatorFlow.tsx
 create mode 100644 src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
 create mode 100644 src/cli/tui/screens/evaluator/index.ts
 create mode 100644 src/cli/tui/screens/evaluator/types.ts
 create mode 100644 src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts
 create mode 100644 src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx
 create mode 100644 src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx
 create mode 100644 src/cli/tui/screens/online-eval/index.ts
 create mode 100644 src/cli/tui/screens/online-eval/types.ts
 create mode 100644 src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
 create mode 100644 src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx
 create mode 100644 src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx
 create mode 100644 src/schema/schemas/primitives/evaluator.ts
 create mode 100644 src/schema/schemas/primitives/online-eval-config.ts

diff --git a/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap b/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap
index 0e2f5950..52c7d853 100644
--- a/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap
+++ b/src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap
@@ -372,6 +372,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     },
   });
   const template = Template.fromStack(stack);
diff --git a/src/assets/cdk/test/cdk.test.ts b/src/assets/cdk/test/cdk.test.ts
index 5ff491d1..40021c58 100644
--- a/src/assets/cdk/test/cdk.test.ts
+++ b/src/assets/cdk/test/cdk.test.ts
@@ -11,6 +11,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     },
   });
   const template = Template.fromStack(stack);
diff --git a/src/cli/commands/create/action.ts b/src/cli/commands/create/action.ts
index c99f69dc..eba7385b 100644
--- a/src/cli/commands/create/action.ts
+++ b/src/cli/commands/create/action.ts
@@ -28,6 +28,8 @@ function createDefaultProjectSpec(projectName: string): AgentCoreProjectSpec {
     agents: [],
     memories: [],
     credentials: [],
+    evaluators: [],
+    onlineEvalConfigs: [],
   };
 }
 
diff --git a/src/cli/commands/logs/__tests__/action.test.ts b/src/cli/commands/logs/__tests__/action.test.ts
index 81e1f39f..9f41b66f 100644
--- a/src/cli/commands/logs/__tests__/action.test.ts
+++ b/src/cli/commands/logs/__tests__/action.test.ts
@@ -55,6 +55,8 @@ describe('resolveAgentContext', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     },
     deployedState: {
       targets: {
@@ -111,6 +113,8 @@ describe('resolveAgentContext', () => {
         ],
         memories: [],
         credentials: [],
+        evaluators: [],
+        onlineEvalConfigs: [],
       },
     });
     const result = resolveAgentContext(context, {});
@@ -147,6 +151,8 @@ describe('resolveAgentContext', () => {
         ],
         memories: [],
         credentials: [],
+        evaluators: [],
+        onlineEvalConfigs: [],
       },
       deployedState: {
         targets: {
@@ -187,7 +193,15 @@ describe('resolveAgentContext', () => {
 
   it('errors when no agents defined', () => {
     const context = makeContext({
-      project: { name: 'TestProject', version: 1, agents: [], memories: [], credentials: [] },
+      project: {
+        name: 'TestProject',
+        version: 1,
+        agents: [],
+        memories: [],
+        credentials: [],
+        evaluators: [],
+        onlineEvalConfigs: [],
+      },
     });
     const result = resolveAgentContext(context, {});
     expect(result.success).toBe(false);
diff --git a/src/cli/commands/remove/command.tsx b/src/cli/commands/remove/command.tsx
index 8ada29c0..e0a45f07 100644
--- a/src/cli/commands/remove/command.tsx
+++ b/src/cli/commands/remove/command.tsx
@@ -29,6 +29,8 @@ async function handleRemoveAll(_options: RemoveAllOptions): Promise<RemoveResult
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     });
 
     // Reset mcp.json gateways if it exists
diff --git a/src/cli/commands/remove/types.ts b/src/cli/commands/remove/types.ts
index d4dbe99b..2144cc4a 100644
--- a/src/cli/commands/remove/types.ts
+++ b/src/cli/commands/remove/types.ts
@@ -1,4 +1,4 @@
-export type ResourceType = 'agent' | 'gateway' | 'gateway-target' | 'memory' | 'identity';
+export type ResourceType = 'agent' | 'gateway' | 'gateway-target' | 'memory' | 'identity' | 'evaluator' | 'online-eval';
 
 export interface RemoveOptions {
   resourceType: ResourceType;
diff --git a/src/cli/commands/status/action.ts b/src/cli/commands/status/action.ts
index 7eab20de..f14f6bbf 100644
--- a/src/cli/commands/status/action.ts
+++ b/src/cli/commands/status/action.ts
@@ -14,7 +14,7 @@ import type { ResourceDeploymentState } from './constants';
 export type { ResourceDeploymentState };
 
 export interface ResourceStatusEntry {
-  resourceType: 'agent' | 'memory' | 'credential' | 'gateway';
+  resourceType: 'agent' | 'memory' | 'credential' | 'gateway' | 'evaluator' | 'online-eval';
   name: string;
   deploymentState: ResourceDeploymentState;
   identifier?: string;
@@ -152,7 +152,21 @@ export function computeResourceStatuses(
     },
   });
 
-  return [...agents, ...credentials, ...memories, ...gateways];
+  const evaluators: ResourceStatusEntry[] = (project.evaluators ?? []).map(e => ({
+    resourceType: 'evaluator',
+    name: e.name,
+    deploymentState: 'local-only' as ResourceDeploymentState,
+    detail: `${e.level} — LLM-as-a-Judge`,
+  }));
+
+  const onlineEvalConfigs: ResourceStatusEntry[] = (project.onlineEvalConfigs ?? []).map(c => ({
+    resourceType: 'online-eval',
+    name: c.name,
+    deploymentState: 'local-only' as ResourceDeploymentState,
+    detail: `${c.agents.length} agent${c.agents.length !== 1 ? 's' : ''}, ${c.evaluators.length} evaluator${c.evaluators.length !== 1 ? 's' : ''}`,
+  }));
+
+  return [...agents, ...credentials, ...memories, ...gateways, ...evaluators, ...onlineEvalConfigs];
 }
 
 export async function handleProjectStatus(
diff --git a/src/cli/commands/status/command.tsx b/src/cli/commands/status/command.tsx
index 09279fd6..1a80e4af 100644
--- a/src/cli/commands/status/command.tsx
+++ b/src/cli/commands/status/command.tsx
@@ -7,7 +7,7 @@ import { DEPLOYMENT_STATE_COLORS, DEPLOYMENT_STATE_LABELS } from './constants';
 import type { Command } from '@commander-js/extra-typings';
 import { Box, Text, render } from 'ink';
 
-const VALID_RESOURCE_TYPES = ['agent', 'memory', 'credential', 'gateway'] as const;
+const VALID_RESOURCE_TYPES = ['agent', 'memory', 'credential', 'gateway', 'evaluator', 'online-eval'] as const;
 const VALID_STATES = ['deployed', 'local-only', 'pending-removal'] as const;
 
 interface StatusCliOptions {
@@ -126,6 +126,8 @@ export const registerStatus = (program: Command) => {
         const credentials = filtered.filter(r => r.resourceType === 'credential');
         const memories = filtered.filter(r => r.resourceType === 'memory');
         const gateways = filtered.filter(r => r.resourceType === 'gateway');
+        const evaluators = filtered.filter(r => r.resourceType === 'evaluator');
+        const onlineEvals = filtered.filter(r => r.resourceType === 'online-eval');
 
         render(
           <Box flexDirection="column">
@@ -170,6 +172,24 @@ export const registerStatus = (program: Command) => {
               </Box>
             )}
 
+            {evaluators.length > 0 && (
+              <Box flexDirection="column" marginTop={1}>
+                <Text bold>Evaluators</Text>
+                {evaluators.map(entry => (
+                  <ResourceEntry key={`${entry.resourceType}-${entry.name}`} entry={entry} />
+                ))}
+              </Box>
+            )}
+
+            {onlineEvals.length > 0 && (
+              <Box flexDirection="column" marginTop={1}>
+                <Text bold>Online Eval Configs</Text>
+                {onlineEvals.map(entry => (
+                  <ResourceEntry key={`${entry.resourceType}-${entry.name}`} entry={entry} />
+                ))}
+              </Box>
+            )}
+
             {filtered.length === 0 && <Text dimColor>No resources match the given filters.</Text>}
           </Box>
         );
diff --git a/src/cli/external-requirements/__tests__/checks-extended.test.ts b/src/cli/external-requirements/__tests__/checks-extended.test.ts
index 30384086..42e44152 100644
--- a/src/cli/external-requirements/__tests__/checks-extended.test.ts
+++ b/src/cli/external-requirements/__tests__/checks-extended.test.ts
@@ -48,6 +48,8 @@ describe('requiresUv', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresUv(project)).toBe(true);
   });
@@ -68,6 +70,8 @@ describe('requiresUv', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresUv(project)).toBe(false);
   });
@@ -79,6 +83,8 @@ describe('requiresUv', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresUv(project)).toBe(false);
   });
@@ -101,6 +107,8 @@ describe('requiresContainerRuntime', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresContainerRuntime(project)).toBe(true);
   });
@@ -121,6 +129,8 @@ describe('requiresContainerRuntime', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresContainerRuntime(project)).toBe(false);
   });
@@ -132,6 +142,8 @@ describe('requiresContainerRuntime', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresContainerRuntime(project)).toBe(false);
   });
@@ -160,6 +172,8 @@ describe('requiresContainerRuntime', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
     expect(requiresContainerRuntime(project)).toBe(true);
   });
@@ -222,6 +236,8 @@ describe('checkDependencyVersions', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const result = await checkDependencyVersions(project);
@@ -237,6 +253,8 @@ describe('checkDependencyVersions', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const result = await checkDependencyVersions(project);
@@ -260,6 +278,8 @@ describe('checkDependencyVersions', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const result = await checkDependencyVersions(project);
diff --git a/src/cli/logging/remove-logger.ts b/src/cli/logging/remove-logger.ts
index a21201ff..f40ace6c 100644
--- a/src/cli/logging/remove-logger.ts
+++ b/src/cli/logging/remove-logger.ts
@@ -7,7 +7,7 @@ const REMOVE_LOGS_SUBDIR = 'remove';
 
 export interface RemoveLoggerOptions {
   /** Type of resource being removed */
-  resourceType: 'agent' | 'memory' | 'identity' | 'gateway' | 'gateway-target';
+  resourceType: 'agent' | 'memory' | 'identity' | 'gateway' | 'gateway-target' | 'evaluator' | 'online-eval';
   /** Name of the resource being removed */
   resourceName: string;
 }
diff --git a/src/cli/operations/agent/generate/write-agent-to-project.ts b/src/cli/operations/agent/generate/write-agent-to-project.ts
index 85819835..37b001ea 100644
--- a/src/cli/operations/agent/generate/write-agent-to-project.ts
+++ b/src/cli/operations/agent/generate/write-agent-to-project.ts
@@ -67,6 +67,8 @@ export async function writeAgentToProject(config: GenerateConfig, options?: Writ
       agents: [agent],
       memories,
       credentials,
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     await configIO.writeProjectSpec(project);
diff --git a/src/cli/operations/deploy/__tests__/preflight.test.ts b/src/cli/operations/deploy/__tests__/preflight.test.ts
index dd148df4..0818acf7 100644
--- a/src/cli/operations/deploy/__tests__/preflight.test.ts
+++ b/src/cli/operations/deploy/__tests__/preflight.test.ts
@@ -81,7 +81,7 @@ describe('validateProject', () => {
     mockReadDeployedState.mockRejectedValue(new Error('No deployed state'));
 
     await expect(validateProject()).rejects.toThrow(
-      'No resources defined in project. Add an agent with "agentcore add agent", a memory with "agentcore add memory", or a gateway with "agentcore add gateway" before deploying.'
+      'No resources defined in project. Add at least one resource (agent, memory, evaluator, or gateway) before deploying.'
     );
   });
 
diff --git a/src/cli/operations/deploy/preflight.ts b/src/cli/operations/deploy/preflight.ts
index 9c5025a5..fe522053 100644
--- a/src/cli/operations/deploy/preflight.ts
+++ b/src/cli/operations/deploy/preflight.ts
@@ -82,6 +82,7 @@ export async function validateProject(): Promise<PreflightContext> {
   let isTeardownDeploy = false;
   const hasAgents = projectSpec.agents && projectSpec.agents.length > 0;
   const hasMemories = projectSpec.memories && projectSpec.memories.length > 0;
+  const hasEvaluators = projectSpec.evaluators && projectSpec.evaluators.length > 0;
 
   // Check for gateways in mcp.json
   let hasGateways = false;
@@ -92,7 +93,7 @@ export async function validateProject(): Promise<PreflightContext> {
     // No mcp.json or invalid — no gateways
   }
 
-  if (!hasAgents && !hasGateways && !hasMemories) {
+  if (!hasAgents && !hasGateways && !hasMemories && !hasEvaluators) {
     let hasExistingStack = false;
     try {
       const deployedState = await configIO.readDeployedState();
@@ -102,7 +103,7 @@ export async function validateProject(): Promise<PreflightContext> {
     }
     if (!hasExistingStack) {
       throw new Error(
-        'No resources defined in project. Add an agent with "agentcore add agent", a memory with "agentcore add memory", or a gateway with "agentcore add gateway" before deploying.'
+        'No resources defined in project. Add at least one resource (agent, memory, evaluator, or gateway) before deploying.'
       );
     }
     isTeardownDeploy = true;
diff --git a/src/cli/operations/dev/__tests__/config.test.ts b/src/cli/operations/dev/__tests__/config.test.ts
index c6e04210..3751e6c6 100644
--- a/src/cli/operations/dev/__tests__/config.test.ts
+++ b/src/cli/operations/dev/__tests__/config.test.ts
@@ -16,6 +16,8 @@ describe('getDevConfig', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project);
@@ -38,6 +40,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project);
@@ -60,6 +64,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project, '/test/project/agentcore');
@@ -88,6 +94,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     expect(() => getDevConfig(workingDir, project, undefined, 'NonExistentAgent')).toThrow(
@@ -111,6 +119,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     expect(() => getDevConfig(workingDir, project, undefined, 'NodeAgent')).toThrow('Dev mode only supports Python');
@@ -132,6 +142,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project, '/test/project/agentcore');
@@ -156,6 +168,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     // No configRoot provided
@@ -180,6 +194,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project, '/test/project/agentcore');
@@ -204,6 +220,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project, '/test/project/agentcore');
@@ -228,6 +246,8 @@ describe('getDevConfig', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const config = getDevConfig(workingDir, project, '/test/project/agentcore');
@@ -265,6 +285,8 @@ describe('getAgentPort', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     expect(getAgentPort(project, 'Agent1', 8080)).toBe(8080);
@@ -278,6 +300,8 @@ describe('getAgentPort', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     expect(getAgentPort(project, 'NonExistent', 9000)).toBe(9000);
@@ -296,6 +320,8 @@ describe('getDevSupportedAgents', () => {
       agents: [],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     expect(getDevSupportedAgents(project)).toEqual([]);
@@ -317,6 +343,8 @@ describe('getDevSupportedAgents', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     expect(getDevSupportedAgents(project)).toEqual([]);
@@ -346,6 +374,8 @@ describe('getDevSupportedAgents', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const supported = getDevSupportedAgents(project);
@@ -369,6 +399,8 @@ describe('getDevSupportedAgents', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const supported = getDevSupportedAgents(project);
@@ -400,6 +432,8 @@ describe('getDevSupportedAgents', () => {
       ],
       memories: [],
       credentials: [],
+      evaluators: [],
+      onlineEvalConfigs: [],
     };
 
     const supported = getDevSupportedAgents(project);
diff --git a/src/cli/primitives/EvaluatorPrimitive.ts b/src/cli/primitives/EvaluatorPrimitive.ts
new file mode 100644
index 00000000..bf0cb7d8
--- /dev/null
+++ b/src/cli/primitives/EvaluatorPrimitive.ts
@@ -0,0 +1,266 @@
+import { findConfigRoot } from '../../lib';
+import type { EvaluationLevel, Evaluator, EvaluatorConfig } from '../../schema';
+import { EvaluationLevelSchema, EvaluatorSchema } from '../../schema';
+import { getErrorMessage } from '../errors';
+import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types';
+import { BasePrimitive } from './BasePrimitive';
+import type { AddResult, AddScreenComponent, RemovableResource } from './types';
+import type { Command } from '@commander-js/extra-typings';
+
+/** Inputs required to add a custom evaluator to the project spec. */
+export interface AddEvaluatorOptions {
+  name: string;
+  level: EvaluationLevel;
+  description?: string;
+  config: EvaluatorConfig;
+}
+
+export type RemovableEvaluator = RemovableResource;
+
+/** Flags accepted by the `add eval` subcommand. */
+interface AddEvaluatorCliOptions {
+  name?: string;
+  level?: string;
+  config?: string;
+  json?: boolean;
+}
+
+/**
+ * EvaluatorPrimitive handles all evaluator add/remove operations.
+ *
+ * Evaluators are stored in the `evaluators` array of the project spec. An evaluator that an
+ * online eval config still lists cannot be removed until that reference is dropped.
+ */
+export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, RemovableEvaluator> {
+  readonly kind = 'evaluator' as const;
+  readonly label = 'Evaluator';
+  override readonly article = 'an';
+  readonly primitiveSchema = EvaluatorSchema;
+
+  /** Adds an evaluator to the project spec. Failures are returned as `success: false`, not thrown. */
+  async add(options: AddEvaluatorOptions): Promise<AddResult<{ evaluatorName: string }>> {
+    try {
+      const evaluator = await this.createEvaluator(options);
+      return { success: true, evaluatorName: evaluator.name };
+    } catch (err) {
+      return { success: false, error: getErrorMessage(err) };
+    }
+  }
+
+  /**
+   * Removes the named evaluator from the project spec.
+   * Nothing is written if the evaluator is missing or an online eval config still references it.
+   */
+  async remove(evaluatorName: string): Promise<RemovalResult> {
+    try {
+      const project = await this.readProjectSpec();
+
+      const index = project.evaluators.findIndex(e => e.name === evaluatorName);
+      if (index === -1) {
+        return { success: false, error: `Evaluator "${evaluatorName}" not found.` };
+      }
+
+      // Block removal while any online eval config still references this evaluator
+      const referencingConfigs = project.onlineEvalConfigs.filter(c => c.evaluators.includes(evaluatorName));
+      if (referencingConfigs.length > 0) {
+        const configNames = referencingConfigs.map(c => c.name).join(', ');
+        return {
+          success: false,
+          error: `Evaluator "${evaluatorName}" is referenced by online eval config(s): ${configNames}. Remove those references first.`,
+        };
+      }
+
+      project.evaluators.splice(index, 1);
+      await this.writeProjectSpec(project);
+
+      return { success: true };
+    } catch (err) {
+      return { success: false, error: getErrorMessage(err) };
+    }
+  }
+
+  /**
+   * Describes what removing the named evaluator would change, without writing anything.
+   * @throws Error if no evaluator with that name exists.
+   */
+  async previewRemove(evaluatorName: string): Promise<RemovalPreview> {
+    const project = await this.readProjectSpec();
+
+    const evaluator = project.evaluators.find(e => e.name === evaluatorName);
+    if (!evaluator) {
+      throw new Error(`Evaluator "${evaluatorName}" not found.`);
+    }
+
+    const summary: string[] = [`Removing evaluator: ${evaluatorName}`];
+    const schemaChanges: SchemaChange[] = [];
+
+    const referencingConfigs = project.onlineEvalConfigs.filter(c => c.evaluators.includes(evaluatorName));
+    if (referencingConfigs.length > 0) {
+      summary.push(
+        `Blocked: Referenced by online eval config(s): ${referencingConfigs.map(c => c.name).join(', ')}. Remove those references first.`
+      );
+    }
+
+    // The schema diff is listed even when the summary says "Blocked"; remove() enforces the block.
+    const afterSpec = {
+      ...project,
+      evaluators: project.evaluators.filter(e => e.name !== evaluatorName),
+    };
+
+    schemaChanges.push({
+      file: 'agentcore/agentcore.json',
+      before: project,
+      after: afterSpec,
+    });
+
+    return { summary, directoriesToDelete: [], schemaChanges };
+  }
+
+  /** Lists the evaluators that can be removed; returns an empty list if the project spec cannot be read. */
+  async getRemovable(): Promise<RemovableEvaluator[]> {
+    try {
+      const project = await this.readProjectSpec();
+      return project.evaluators.map(e => ({ name: e.name }));
+    } catch {
+      return [];
+    }
+  }
+
+  /** Lists every evaluator name; returns an empty list if the project spec cannot be read. */
+  async getAllNames(): Promise<string[]> {
+    try {
+      const project = await this.readProjectSpec();
+      return project.evaluators.map(e => e.name);
+    } catch {
+      return [];
+    }
+  }
+
+  /**
+   * Registers the `add eval` subcommand and the remove subcommand for evaluators.
+   * `add eval` runs non-interactively when any flag is given and otherwise opens the TUI add flow.
+   */
+  registerCommands(addCmd: Command, removeCmd: Command): void {
+    addCmd
+      .command('eval')
+      .description('Add a custom evaluator to the project')
+      .option('--name <name>', 'Evaluator name [non-interactive]')
+      .option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL [non-interactive]')
+      .option('--config <path>', 'Path to evaluator config JSON file [non-interactive]')
+      .option('--json', 'Output as JSON [non-interactive]')
+      .action(async (cliOptions: AddEvaluatorCliOptions) => {
+        const { name, level, config, json } = cliOptions;
+
+        // Errors go to stdout as JSON when --json is set, otherwise to stderr as plain text.
+        const reportError = (error: string): void => {
+          if (json) {
+            console.log(JSON.stringify({ success: false, error }));
+          } else {
+            console.error(error);
+          }
+        };
+
+        try {
+          if (!findConfigRoot()) {
+            reportError('No agentcore project found. Run `agentcore create` first.');
+            process.exit(1);
+          }
+
+          // Any flag selects the non-interactive path, so a partial set of flags is reported as
+          // an error instead of being silently discarded in favour of the TUI.
+          if (name || level || config || json) {
+            if (!name || !level || !config) {
+              reportError('--name, --level, and --config are all required in non-interactive mode');
+              process.exit(1);
+            }
+
+            const levelResult = EvaluationLevelSchema.safeParse(level);
+            if (!levelResult.success) {
+              reportError(`Invalid --level "${level}". Must be one of: SESSION, TRACE, TOOL_CALL`);
+              process.exit(1);
+            }
+
+            const configJson = await this.loadEvaluatorConfig(config);
+
+            const result = await this.add({
+              name,
+              level: levelResult.data,
+              config: configJson,
+            });
+
+            if (json) {
+              console.log(JSON.stringify(result));
+            } else if (result.success) {
+              console.log(`Added evaluator '${result.evaluatorName}'`);
+            } else {
+              console.error(result.error);
+            }
+            process.exit(result.success ? 0 : 1);
+          } else {
+            // TUI fallback
+            const [{ render }, { default: React }, { AddFlow }] = await Promise.all([
+              import('ink'),
+              import('react'),
+              import('../tui/screens/add/AddFlow'),
+            ]);
+            const { clear, unmount } = render(
+              React.createElement(AddFlow, {
+                isInteractive: false,
+                onExit: () => {
+                  clear();
+                  unmount();
+                  process.exit(0);
+                },
+              })
+            );
+          }
+        } catch (error) {
+          reportError(getErrorMessage(error));
+          process.exit(1);
+        }
+      });
+
+    this.registerRemoveSubcommand(removeCmd);
+  }
+
+  /** Returns null: this primitive does not supply an add-screen component. */
+  addScreen(): AddScreenComponent {
+    return null;
+  }
+
+  /**
+   * Reads and parses the JSON file passed to `--config`. The parsed value is not validated here.
+   * @throws Error naming the file when it cannot be read or does not contain valid JSON.
+   */
+  private async loadEvaluatorConfig(configPath: string): Promise<EvaluatorConfig> {
+    const { readFileSync } = await import('fs');
+    try {
+      return JSON.parse(readFileSync(configPath, 'utf-8')) as EvaluatorConfig;
+    } catch (err) {
+      throw new Error(`Failed to load evaluator config from "${configPath}": ${getErrorMessage(err)}`);
+    }
+  }
+
+  /**
+   * Builds a `CustomEvaluator` entry from the options, appends it to the project spec and writes it.
+   * Duplicate names are checked through the base class's `checkDuplicate` before anything is written.
+   */
+  private async createEvaluator(options: AddEvaluatorOptions): Promise<Evaluator> {
+    const project = await this.readProjectSpec();
+
+    this.checkDuplicate(project.evaluators, options.name);
+
+    const evaluator: Evaluator = {
+      type: 'CustomEvaluator',
+      name: options.name,
+      level: options.level,
+      ...(options.description && { description: options.description }),
+      config: options.config,
+    };
+
+    project.evaluators.push(evaluator);
+    await this.writeProjectSpec(project);
+
+    return evaluator;
+  }
+}
diff --git a/src/cli/primitives/OnlineEvalConfigPrimitive.ts b/src/cli/primitives/OnlineEvalConfigPrimitive.ts
new file mode 100644
index 00000000..36e66069
--- /dev/null
+++ b/src/cli/primitives/OnlineEvalConfigPrimitive.ts
@@ -0,0 +1,219 @@
+import { findConfigRoot } from '../../lib';
+import type { OnlineEvalConfig } from '../../schema';
+import { OnlineEvalConfigSchema } from '../../schema';
+import { getErrorMessage } from '../errors';
+import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types';
+import { BasePrimitive } from './BasePrimitive';
+import type { AddResult, AddScreenComponent, RemovableResource } from './types';
+import type { Command } from '@commander-js/extra-typings';
+
+export interface AddOnlineEvalConfigOptions {
+  name: string; // checked for duplicates against the existing online eval configs
+  agents: string[]; // agent names to monitor
+  evaluators: string[]; // evaluator names or Builtin.* IDs
+  samplingRate: number; // percentage; the CLI accepts 0.01-100
+  enableOnCreate?: boolean; // stored as true when omitted
+}
+
+export type RemovableOnlineEvalConfig = RemovableResource; // plain alias, no extra fields
+
+/**
+ * Manages the project's online eval configs: add, remove, removal preview, listing, and the `online-eval` CLI commands.
+ */
+export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfigOptions, RemovableOnlineEvalConfig> {
+  readonly kind = 'online-eval' as const;
+  readonly label = 'Online Eval Config';
+  override readonly article = 'an';
+  readonly primitiveSchema = OnlineEvalConfigSchema;
+  /** Adds a config built from `options`; failures come back as `{ success: false, error }` instead of a rejection. */
+  async add(options: AddOnlineEvalConfigOptions): Promise<AddResult<{ configName: string }>> {
+    try {
+      const config = await this.createOnlineEvalConfig(options);
+      return { success: true, configName: config.name };
+    } catch (err) {
+      return { success: false, error: getErrorMessage(err) };
+    }
+  }
+  /** Removes the config named `configName` and persists the spec; unknown names and thrown errors yield `success: false`. */
+  async remove(configName: string): Promise<RemovalResult> {
+    try {
+      const project = await this.readProjectSpec();
+
+      const index = project.onlineEvalConfigs.findIndex(c => c.name === configName);
+      if (index === -1) {
+        return { success: false, error: `Online eval config "${configName}" not found.` };
+      }
+
+      project.onlineEvalConfigs.splice(index, 1);
+      await this.writeProjectSpec(project);
+
+      return { success: true };
+    } catch (err) {
+      return { success: false, error: getErrorMessage(err) };
+    }
+  }
+  /** Builds a preview (summary plus before/after spec) for removing `configName` without writing; throws if it is unknown. */
+  async previewRemove(configName: string): Promise<RemovalPreview> {
+    const project = await this.readProjectSpec();
+
+    const config = project.onlineEvalConfigs.find(c => c.name === configName);
+    if (!config) {
+      throw new Error(`Online eval config "${configName}" not found.`);
+    }
+
+    const summary: string[] = [
+      `Removing online eval config: ${configName}`,
+      `Monitors agents: ${config.agents.join(', ')}`,
+      `Uses evaluators: ${config.evaluators.join(', ')}`,
+    ];
+    const schemaChanges: SchemaChange[] = [];
+
+    const afterSpec = {
+      ...project,
+      onlineEvalConfigs: project.onlineEvalConfigs.filter(c => c.name !== configName),
+    };
+
+    schemaChanges.push({
+      file: 'agentcore/agentcore.json',
+      before: project,
+      after: afterSpec,
+    });
+
+    return { summary, directoriesToDelete: [], schemaChanges };
+  }
+  /** Lists the removable configs by name; resolves to an empty list when reading the project spec throws. */
+  async getRemovable(): Promise<RemovableOnlineEvalConfig[]> {
+    try {
+      const project = await this.readProjectSpec();
+      return project.onlineEvalConfigs.map(c => ({ name: c.name }));
+    } catch {
+      return [];
+    }
+  }
+  /** Names of all online eval configs in the project; resolves to an empty list when reading the project spec throws. */
+  async getAllNames(): Promise<string[]> {
+    try {
+      const project = await this.readProjectSpec();
+      return project.onlineEvalConfigs.map(c => c.name);
+    } catch {
+      return [];
+    }
+  }
+  /** Registers `online-eval` under `addCmd` (flag-driven when --name or --json is given, otherwise the TUI) and the remove subcommand. */
+  registerCommands(addCmd: Command, removeCmd: Command): void {
+    addCmd
+      .command('online-eval')
+      .description('Add an online eval config to the project')
+      .option('--name <name>', 'Config name [non-interactive]')
+      .option('-a, --agent <agents...>', 'Agent name(s) to monitor [non-interactive]')
+      .option('-e, --evaluator <evaluators...>', 'Evaluator name(s) or Builtin.* IDs [non-interactive]')
+      .option('--sampling-rate <rate>', 'Sampling percentage (0.01-100) [non-interactive]')
+      .option('--json', 'Output as JSON [non-interactive]')
+      .action(
+        async (cliOptions: {
+          name?: string;
+          agent?: string[];
+          evaluator?: string[];
+          samplingRate?: string;
+          json?: boolean;
+        }) => {
+          try {
+            if (!findConfigRoot()) {
+              console.error('No agentcore project found. Run `agentcore create` first.');
+              process.exit(1);
+            }
+
+            if (cliOptions.name || cliOptions.json) {
+              if (!cliOptions.name || !cliOptions.agent || !cliOptions.evaluator || !cliOptions.samplingRate) {
+                const error =
+                  '--name, --agent, --evaluator, and --sampling-rate are all required in non-interactive mode';
+                if (cliOptions.json) {
+                  console.log(JSON.stringify({ success: false, error }));
+                } else {
+                  console.error(error);
+                }
+                process.exit(1);
+              }
+              // Number() rejects trailing junk such as "50abc" or "5,5", which parseFloat would silently truncate.
+              const samplingRate = Number(cliOptions.samplingRate);
+              if (Number.isNaN(samplingRate) || samplingRate < 0.01 || samplingRate > 100) {
+                const error = `Invalid --sampling-rate "${cliOptions.samplingRate}". Must be a number between 0.01 and 100`;
+                if (cliOptions.json) {
+                  console.log(JSON.stringify({ success: false, error }));
+                } else {
+                  console.error(error);
+                }
+                process.exit(1);
+              }
+
+              const result = await this.add({
+                name: cliOptions.name,
+                agents: cliOptions.agent,
+                evaluators: cliOptions.evaluator,
+                samplingRate,
+              });
+
+              if (cliOptions.json) {
+                console.log(JSON.stringify(result));
+              } else if (result.success) {
+                console.log(`Added online eval config '${result.configName}'`);
+              } else {
+                console.error(result.error);
+              }
+              process.exit(result.success ? 0 : 1);
+            } else {
+              // TUI fallback
+              const [{ render }, { default: React }, { AddFlow }] = await Promise.all([
+                import('ink'),
+                import('react'),
+                import('../tui/screens/add/AddFlow'),
+              ]);
+              const { clear, unmount } = render(
+                React.createElement(AddFlow, {
+                  isInteractive: false,
+                  onExit: () => {
+                    clear();
+                    unmount();
+                    process.exit(0);
+                  },
+                })
+              );
+            }
+          } catch (error) {
+            if (cliOptions.json) {
+              console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+            } else {
+              console.error(getErrorMessage(error));
+            }
+            process.exit(1);
+          }
+        }
+      );
+
+    this.registerRemoveSubcommand(removeCmd);
+  }
+  /** No screen component is supplied by this primitive. */
+  addScreen(): AddScreenComponent {
+    return null;
+  }
+  /** Appends a new config to the project spec after the duplicate-name check and persists it; `enableOnCreate` defaults to true. */
+  private async createOnlineEvalConfig(options: AddOnlineEvalConfigOptions): Promise<OnlineEvalConfig> {
+    const project = await this.readProjectSpec();
+
+    this.checkDuplicate(project.onlineEvalConfigs, options.name, 'Online eval config');
+
+    const config: OnlineEvalConfig = {
+      type: 'OnlineEvalConfig',
+      name: options.name,
+      agents: options.agents,
+      evaluators: options.evaluators,
+      samplingRate: options.samplingRate,
+      enableOnCreate: options.enableOnCreate ?? true,
+    };
+
+    project.onlineEvalConfigs.push(config);
+    await this.writeProjectSpec(project);
+
+    return config;
+  }
+}
diff --git a/src/cli/primitives/index.ts b/src/cli/primitives/index.ts
index 0c995da6..2ef948e5 100644
--- a/src/cli/primitives/index.ts
+++ b/src/cli/primitives/index.ts
@@ -2,6 +2,8 @@ export { BasePrimitive } from './BasePrimitive';
 export { MemoryPrimitive } from './MemoryPrimitive';
 export { CredentialPrimitive } from './CredentialPrimitive';
 export { AgentPrimitive } from './AgentPrimitive';
+export { EvaluatorPrimitive } from './EvaluatorPrimitive';
+export { OnlineEvalConfigPrimitive } from './OnlineEvalConfigPrimitive';
 export { GatewayPrimitive } from './GatewayPrimitive';
 export { GatewayTargetPrimitive } from './GatewayTargetPrimitive';
 export {
@@ -9,6 +11,8 @@ export {
   agentPrimitive,
   memoryPrimitive,
   credentialPrimitive,
+  evaluatorPrimitive,
+  onlineEvalConfigPrimitive,
   gatewayPrimitive,
   gatewayTargetPrimitive,
   getPrimitive,
diff --git a/src/cli/primitives/registry.ts b/src/cli/primitives/registry.ts
index 4dd33b4d..290e0d60 100644
--- a/src/cli/primitives/registry.ts
+++ b/src/cli/primitives/registry.ts
@@ -1,9 +1,11 @@
 import { AgentPrimitive } from './AgentPrimitive';
 import type { BasePrimitive } from './BasePrimitive';
 import { CredentialPrimitive } from './CredentialPrimitive';
+import { EvaluatorPrimitive } from './EvaluatorPrimitive';
 import { GatewayPrimitive } from './GatewayPrimitive';
 import { GatewayTargetPrimitive } from './GatewayTargetPrimitive';
 import { MemoryPrimitive } from './MemoryPrimitive';
+import { OnlineEvalConfigPrimitive } from './OnlineEvalConfigPrimitive';
 import type { RemovableResource } from './types';
 
 /**
@@ -12,6 +14,8 @@ import type { RemovableResource } from './types';
 export const agentPrimitive = new AgentPrimitive();
 export const memoryPrimitive = new MemoryPrimitive();
 export const credentialPrimitive = new CredentialPrimitive();
+export const evaluatorPrimitive = new EvaluatorPrimitive();
+export const onlineEvalConfigPrimitive = new OnlineEvalConfigPrimitive();
 export const gatewayPrimitive = new GatewayPrimitive();
 export const gatewayTargetPrimitive = new GatewayTargetPrimitive();
 
@@ -22,6 +26,8 @@ export const ALL_PRIMITIVES: BasePrimitive<unknown, RemovableResource>[] = [
   agentPrimitive,
   memoryPrimitive,
   credentialPrimitive,
+  evaluatorPrimitive,
+  onlineEvalConfigPrimitive,
   gatewayPrimitive,
   gatewayTargetPrimitive,
 ];
diff --git a/src/cli/tui/components/ResourceGraph.tsx b/src/cli/tui/components/ResourceGraph.tsx
index 816a96cc..cbea8692 100644
--- a/src/cli/tui/components/ResourceGraph.tsx
+++ b/src/cli/tui/components/ResourceGraph.tsx
@@ -16,6 +16,8 @@ const ICONS = {
   gateway: '◆',
   tool: '⚙',
   runtime: '▶',
+  evaluator: '✦',
+  'online-eval': '↻',
 } as const;
 
 interface ResourceGraphProps {
diff --git a/src/cli/tui/hooks/useCreateEvaluator.ts b/src/cli/tui/hooks/useCreateEvaluator.ts
new file mode 100644
index 00000000..bf3015bd
--- /dev/null
+++ b/src/cli/tui/hooks/useCreateEvaluator.ts
@@ -0,0 +1,56 @@
+import type { EvaluatorConfig } from '../../../schema';
+import { evaluatorPrimitive } from '../../primitives/registry';
+import { useCallback, useEffect, useState } from 'react';
+
+interface CreateEvaluatorConfig {
+  name: string; // also returned as `evaluatorName` on success
+  level: string; // cast to 'SESSION' | 'TRACE' | 'TOOL_CALL' before reaching the primitive
+  config: EvaluatorConfig; // forwarded to the primitive unchanged
+}
+
+/** Hook exposing `createEvaluator`, a `status` tracking the latest call, and `reset`. */
+export function useCreateEvaluator() {
+  const [status, setStatus] = useState<{ state: 'idle' | 'loading' | 'success' | 'error'; error?: string }>({
+    state: 'idle',
+  });
+
+  // Runs the primitive's add() and mirrors the outcome into `status`; failures are returned, not thrown.
+  const create = useCallback(async (config: CreateEvaluatorConfig) => {
+    // Shared failure path: record the message and hand it back to the caller.
+    const fail = (error: string) => {
+      setStatus({ state: 'error', error });
+      return { ok: false as const, error };
+    };
+    setStatus({ state: 'loading' });
+    try {
+      const level = config.level as 'SESSION' | 'TRACE' | 'TOOL_CALL';
+      const outcome = await evaluatorPrimitive.add({ name: config.name, level, config: config.config });
+      if (!outcome.success) {
+        return fail(outcome.error ?? 'Failed to create evaluator');
+      }
+      setStatus({ state: 'success' });
+      return { ok: true as const, evaluatorName: config.name };
+    } catch (err) {
+      return fail(err instanceof Error ? err.message : 'Failed to create evaluator.');
+    }
+  }, []);
+
+  const reset = useCallback(() => setStatus({ state: 'idle' }), []);
+
+  return { status, createEvaluator: create, reset };
+}
+
+export function useExistingEvaluatorNames() {
+  const [names, setNames] = useState<string[]>([]);
+
+  // Re-reads the evaluator names from the primitive; also used for the initial load.
+  const refresh = useCallback(async () => {
+    setNames(await evaluatorPrimitive.getAllNames());
+  }, []);
+
+  useEffect(() => {
+    void refresh();
+  }, [refresh]);
+
+  return { names, refresh };
+}
diff --git a/src/cli/tui/hooks/useCreateOnlineEval.ts b/src/cli/tui/hooks/useCreateOnlineEval.ts
new file mode 100644
index 00000000..ab4ed1c4
--- /dev/null
+++ b/src/cli/tui/hooks/useCreateOnlineEval.ts
@@ -0,0 +1,59 @@
+import { onlineEvalConfigPrimitive } from '../../primitives/registry';
+import { useCallback, useEffect, useState } from 'react';
+
+interface CreateOnlineEvalConfig {
+  name: string; // also returned as `configName` on success
+  agents: string[]; // forwarded to the primitive unchanged
+  evaluators: string[]; // forwarded to the primitive unchanged
+  samplingRate: number; // forwarded to the primitive unchanged
+  enableOnCreate?: boolean; // optional; forwarded as-is, including undefined
+}
+
+/** Hook exposing `createOnlineEval`, a `status` tracking the latest call, and `reset`. */
+export function useCreateOnlineEval() {
+  const [status, setStatus] = useState<{ state: 'idle' | 'loading' | 'success' | 'error'; error?: string }>({
+    state: 'idle',
+  });
+
+  // Runs the primitive's add() and mirrors the outcome into `status`; failures are returned, not thrown.
+  const create = useCallback(async (config: CreateOnlineEvalConfig) => {
+    // Shared failure path: record the message and hand it back to the caller.
+    const fail = (error: string) => {
+      setStatus({ state: 'error', error });
+      return { ok: false as const, error };
+    };
+    setStatus({ state: 'loading' });
+    try {
+      // Only the known fields are forwarded, never the whole `config` object.
+      const { name, agents, evaluators, samplingRate, enableOnCreate } = config;
+      const payload = { name, agents, evaluators, samplingRate, enableOnCreate };
+      const outcome = await onlineEvalConfigPrimitive.add(payload);
+      if (!outcome.success) {
+        return fail(outcome.error ?? 'Failed to create online eval config');
+      }
+      setStatus({ state: 'success' });
+      return { ok: true as const, configName: name };
+    } catch (err) {
+      return fail(err instanceof Error ? err.message : 'Failed to create online eval config.');
+    }
+  }, []);
+
+  const reset = useCallback(() => setStatus({ state: 'idle' }), []);
+
+  return { status, createOnlineEval: create, reset };
+}
+
+export function useExistingOnlineEvalNames() {
+  const [names, setNames] = useState<string[]>([]);
+
+  // Re-reads the config names from the primitive; also used for the initial load.
+  const refresh = useCallback(async () => {
+    setNames(await onlineEvalConfigPrimitive.getAllNames());
+  }, []);
+
+  useEffect(() => {
+    void refresh();
+  }, [refresh]);
+
+  return { names, refresh };
+}
diff --git a/src/cli/tui/hooks/useRemove.ts b/src/cli/tui/hooks/useRemove.ts
index dd6b5468..31a7519f 100644
--- a/src/cli/tui/hooks/useRemove.ts
+++ b/src/cli/tui/hooks/useRemove.ts
@@ -6,9 +6,11 @@ import type { RemovableMemory } from '../../primitives/MemoryPrimitive';
 import {
   agentPrimitive,
   credentialPrimitive,
+  evaluatorPrimitive,
   gatewayPrimitive,
   gatewayTargetPrimitive,
   memoryPrimitive,
+  onlineEvalConfigPrimitive,
 } from '../../primitives/registry';
 import { useCallback, useEffect, useRef, useState } from 'react';
 
@@ -117,6 +119,16 @@ export function useRemovableIdentities() {
   return { identities, ...rest };
 }
 
+export function useRemovableEvaluators() {
+  const { items, ...others } = useRemovableResources(() => evaluatorPrimitive.getRemovable());
+  return { evaluators: items, ...others }; // same payload, with `items` exposed as `evaluators`
+}
+
+export function useRemovableOnlineEvalConfigs() {
+  const { items, ...others } = useRemovableResources(() => onlineEvalConfigPrimitive.getRemovable());
+  return { onlineEvalConfigs: items, ...others }; // same payload, with `items` exposed as `onlineEvalConfigs`
+}
+
 // ============================================================================
 // Preview Hook
 // ============================================================================
@@ -172,6 +184,14 @@ export function useRemovalPreview() {
     (name: string) => loadPreview(n => credentialPrimitive.previewRemove(n), name),
     [loadPreview]
   );
+  const loadEvaluatorPreview = useCallback(
+    (name: string) => loadPreview(n => evaluatorPrimitive.previewRemove(n), name),
+    [loadPreview]
+  );
+  const loadOnlineEvalPreview = useCallback(
+    (name: string) => loadPreview(n => onlineEvalConfigPrimitive.previewRemove(n), name),
+    [loadPreview]
+  );
 
   const reset = useCallback(() => {
     setState({ isLoading: false, preview: null, error: null });
@@ -184,6 +204,8 @@ export function useRemovalPreview() {
     loadGatewayTargetPreview,
     loadMemoryPreview,
     loadIdentityPreview,
+    loadEvaluatorPreview,
+    loadOnlineEvalPreview,
     reset,
   };
 }
@@ -238,3 +260,19 @@ export function useRemoveIdentity() {
     name => name
   );
 }
+
+export function useRemoveEvaluator() {
+  // Delegates to the shared removal hook; the delete itself is done by the
+  // evaluator primitive, looked up by the evaluator's name.
+  const removeByName = (name: string) => evaluatorPrimitive.remove(name);
+
+  return useRemoveResource(removeByName, 'evaluator', name => name);
+}
+
+export function useRemoveOnlineEvalConfig() {
+  // Delegates to the shared removal hook; the delete itself is done by the
+  // online eval config primitive, looked up by the config's name.
+  const removeByName = (name: string) => onlineEvalConfigPrimitive.remove(name);
+
+  return useRemoveResource(removeByName, 'online-eval', name => name);
+}
diff --git a/src/cli/tui/screens/add/AddFlow.tsx b/src/cli/tui/screens/add/AddFlow.tsx
index 690d25af..35926ad9 100644
--- a/src/cli/tui/screens/add/AddFlow.tsx
+++ b/src/cli/tui/screens/add/AddFlow.tsx
@@ -6,9 +6,11 @@ import { AddAgentFlow } from '../agent/AddAgentFlow';
 import type { AddAgentConfig } from '../agent/types';
 import { FRAMEWORK_OPTIONS } from '../agent/types';
 import { useAddAgent } from '../agent/useAddAgent';
+import { AddEvaluatorFlow } from '../evaluator';
 import { AddIdentityFlow } from '../identity';
 import { AddGatewayFlow, AddGatewayTargetFlow } from '../mcp';
 import { AddMemoryFlow } from '../memory/AddMemoryFlow';
+import { AddOnlineEvalFlow } from '../online-eval';
 import type { AddResourceType } from './AddScreen';
 import { AddScreen } from './AddScreen';
 import { AddSuccessScreen } from './AddSuccessScreen';
@@ -23,6 +25,8 @@ type FlowState =
   | { name: 'tool-wizard' }
   | { name: 'memory-wizard' }
   | { name: 'identity-wizard' }
+  | { name: 'evaluator-wizard' }
+  | { name: 'online-eval-wizard' }
   | {
       name: 'agent-create-success';
       agentName: string;
@@ -172,6 +176,12 @@ export function AddFlow(props: AddFlowProps) {
       case 'identity':
         setFlow({ name: 'identity-wizard' });
         break;
+      case 'evaluator':
+        setFlow({ name: 'evaluator-wizard' });
+        break;
+      case 'online-eval':
+        setFlow({ name: 'online-eval-wizard' });
+        break;
     }
   }, []);
 
@@ -366,6 +376,32 @@ export function AddFlow(props: AddFlowProps) {
     );
   }
 
+  // Evaluator wizard
+  if (flow.name === 'evaluator-wizard') {
+    return (
+      <AddEvaluatorFlow
+        isInteractive={props.isInteractive}
+        onExit={props.onExit}
+        onBack={() => setFlow({ name: 'select' })}
+        onDev={props.onDev}
+        onDeploy={props.onDeploy}
+      />
+    );
+  }
+
+  // Online eval config wizard
+  if (flow.name === 'online-eval-wizard') {
+    return (
+      <AddOnlineEvalFlow
+        isInteractive={props.isInteractive}
+        onExit={props.onExit}
+        onBack={() => setFlow({ name: 'select' })}
+        onDev={props.onDev}
+        onDeploy={props.onDeploy}
+      />
+    );
+  }
+
   return (
     <ErrorPrompt
       message="Failed to add resource"
diff --git a/src/cli/tui/screens/add/AddScreen.tsx b/src/cli/tui/screens/add/AddScreen.tsx
index a96fbb53..eabc98af 100644
--- a/src/cli/tui/screens/add/AddScreen.tsx
+++ b/src/cli/tui/screens/add/AddScreen.tsx
@@ -5,6 +5,8 @@ const ADD_RESOURCES = [
   { id: 'agent', title: 'Agent', description: 'New or existing agent code' },
   { id: 'memory', title: 'Memory', description: 'Persistent context storage' },
   { id: 'identity', title: 'Identity', description: 'API key credential providers' },
+  { id: 'evaluator', title: 'Evaluator', description: 'Custom LLM-as-a-Judge evaluator' },
+  { id: 'online-eval', title: 'Online Eval Config', description: 'Continuous evaluation pipeline' },
   { id: 'gateway', title: 'Gateway', description: 'Route and manage gateway targets' },
   { id: 'gateway-target', title: 'Gateway Target', description: 'Extend agent capabilities' },
 ] as const;
diff --git a/src/cli/tui/screens/create/useCreateFlow.ts b/src/cli/tui/screens/create/useCreateFlow.ts
index 2a2bae57..157d3e07 100644
--- a/src/cli/tui/screens/create/useCreateFlow.ts
+++ b/src/cli/tui/screens/create/useCreateFlow.ts
@@ -74,6 +74,8 @@ function createDefaultProjectSpec(projectName: string): AgentCoreProjectSpec {
     agents: [],
     memories: [],
     credentials: [],
+    evaluators: [],
+    onlineEvalConfigs: [],
   };
 }
 
diff --git a/src/cli/tui/screens/evaluator/AddEvaluatorFlow.tsx b/src/cli/tui/screens/evaluator/AddEvaluatorFlow.tsx
new file mode 100644
index 00000000..a53aacb2
--- /dev/null
+++ b/src/cli/tui/screens/evaluator/AddEvaluatorFlow.tsx
@@ -0,0 +1,76 @@
+import { ErrorPrompt } from '../../components';
+import { useCreateEvaluator, useExistingEvaluatorNames } from '../../hooks/useCreateEvaluator';
+import { AddSuccessScreen } from '../add/AddSuccessScreen';
+import { AddEvaluatorScreen } from './AddEvaluatorScreen';
+import type { AddEvaluatorConfig } from './types';
+import React, { useCallback, useEffect, useState } from 'react';
+
+type FlowState =
+  | { name: 'create-wizard' } // initial state; renders AddEvaluatorScreen
+  | { name: 'create-success'; evaluatorName: string } // renders AddSuccessScreen
+  | { name: 'error'; message: string }; // renders ErrorPrompt with `message` as its detail
+
+interface AddEvaluatorFlowProps {
+  isInteractive?: boolean; // defaults to true; when false the flow calls onExit right after a successful create
+  onExit: () => void;
+  onBack: () => void; // used for leaving the wizard and for "add another" on the success screen
+  onDev?: () => void; // passed through to AddSuccessScreen
+  onDeploy?: () => void; // passed through to AddSuccessScreen
+}
+
+export function AddEvaluatorFlow({ isInteractive = true, onExit, onBack, onDev, onDeploy }: AddEvaluatorFlowProps) {
+  const { createEvaluator, reset: resetCreate } = useCreateEvaluator();
+  const { names: existingNames } = useExistingEvaluatorNames();
+  const [flow, setFlow] = useState<FlowState>({ name: 'create-wizard' });
+
+  // Non-interactive runs leave as soon as the evaluator has been created.
+  useEffect(() => {
+    const created = flow.name === 'create-success';
+    if (created && !isInteractive) onExit();
+  }, [isInteractive, flow.name, onExit]);
+
+  // Hand the wizard's result to the create hook, then show success or the error.
+  const handleCreateComplete = useCallback(
+    (config: AddEvaluatorConfig) => {
+      const run = async () => {
+        const result = await createEvaluator(config);
+        setFlow(
+          result.ok
+            ? { name: 'create-success', evaluatorName: result.evaluatorName }
+            : { name: 'error', message: result.error }
+        );
+      };
+      void run();
+    },
+    [createEvaluator]
+  );
+
+  switch (flow.name) {
+    case 'create-wizard':
+      return (
+        <AddEvaluatorScreen existingEvaluatorNames={existingNames} onComplete={handleCreateComplete} onExit={onBack} />
+      );
+
+    case 'create-success':
+      return (
+        <AddSuccessScreen
+          isInteractive={isInteractive}
+          message={`Added evaluator: ${flow.evaluatorName}`}
+          detail="Evaluator added to project in `agentcore/agentcore.json`. Deploy with `agentcore deploy`."
+          onAddAnother={onBack}
+          onDev={onDev}
+          onDeploy={onDeploy}
+          onExit={onExit}
+        />
+      );
+
+    default: {
+      // Only the error state remains; going back clears the hook status and reopens the wizard.
+      const retry = () => {
+        resetCreate();
+        setFlow({ name: 'create-wizard' });
+      };
+      return <ErrorPrompt message="Failed to add evaluator" detail={flow.message} onBack={retry} onExit={onExit} />;
+    }
+  }
+}
diff --git a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
new file mode 100644
index 00000000..8969c010
--- /dev/null
+++ b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
@@ -0,0 +1,164 @@
+import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
+import { EvaluatorNameSchema } from '../../../../schema';
+import type { SelectableItem } from '../../components';
+import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardSelect } from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation } from '../../hooks';
+import { generateUniqueName } from '../../utils';
+import type { AddEvaluatorConfig } from './types';
+import {
+  DEFAULT_INSTRUCTIONS,
+  DEFAULT_MODEL,
+  EVALUATION_LEVEL_OPTIONS,
+  EVALUATOR_STEP_LABELS,
+  LEVEL_PLACEHOLDERS,
+  RATING_SCALE_PRESETS,
+  validateInstructionPlaceholders,
+} from './types';
+import { useAddEvaluatorWizard } from './useAddEvaluatorWizard';
+import React, { useMemo } from 'react';
+
+interface AddEvaluatorScreenProps {
+  onComplete: (config: AddEvaluatorConfig) => void; // called with the wizard's config when the confirm step is accepted
+  onExit: () => void; // screen-level exit; also the cancel action of the name step
+  existingEvaluatorNames: string[]; // names the new evaluator must not collide with
+}
+
+function formatRatingScale(ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']): string {
+  let text: string | undefined; // stays undefined when neither scale kind is present
+  if ('numerical' in ratingScale && ratingScale.numerical) {
+    text = ratingScale.numerical.map(entry => `${entry.value}=${entry.label}`).join(', ');
+  } else if ('categorical' in ratingScale && ratingScale.categorical) {
+    text = ratingScale.categorical.map(entry => entry.label).join(', ');
+  }
+  return text ?? 'Unknown';
+}
+
+export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames }: AddEvaluatorScreenProps) {
+  const wizard = useAddEvaluatorWizard();
+  // Selectable items for the two list steps, built once from the static option tables.
+  const levelItems: SelectableItem[] = useMemo(
+    () => EVALUATION_LEVEL_OPTIONS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })),
+    []
+  );
+
+  const ratingScaleItems: SelectableItem[] = useMemo(
+    () => RATING_SCALE_PRESETS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })),
+    []
+  );
+  // One flag per wizard step; they decide which input is rendered and which list navigation is active.
+  const isNameStep = wizard.step === 'name';
+  const isLevelStep = wizard.step === 'level';
+  const isModelStep = wizard.step === 'model';
+  const isInstructionsStep = wizard.step === 'instructions';
+  const isRatingScaleStep = wizard.step === 'ratingScale';
+  const isConfirmStep = wizard.step === 'confirm';
+  // Each list navigation is only active on its own step; exiting a list steps the wizard back.
+  const levelNav = useListNavigation({
+    items: levelItems,
+    onSelect: item => wizard.setLevel(item.id as EvaluationLevel),
+    onExit: () => wizard.goBack(),
+    isActive: isLevelStep,
+  });
+
+  const ratingScaleNav = useListNavigation({
+    items: ratingScaleItems,
+    onSelect: item => {
+      const preset = RATING_SCALE_PRESETS.find(p => p.id === item.id);
+      if (preset) wizard.setRatingScale(preset.ratingScale);
+    },
+    onExit: () => wizard.goBack(),
+    isActive: isRatingScaleStep,
+  });
+  // Confirm step: a single-item list, so selecting it hands the collected config to onComplete.
+  useListNavigation({
+    items: [{ id: 'confirm', title: 'Confirm' }],
+    onSelect: () => onComplete(wizard.config),
+    onExit: () => wizard.goBack(),
+    isActive: isConfirmStep,
+  });
+  // Help text follows the kind of input shown on the current step (list, confirm, or text).
+  const helpText =
+    isLevelStep || isRatingScaleStep
+      ? HELP_TEXT.NAVIGATE_SELECT
+      : isConfirmStep
+        ? HELP_TEXT.CONFIRM_CANCEL
+        : HELP_TEXT.TEXT_INPUT;
+
+  const headerContent = <StepIndicator steps={wizard.steps} currentStep={wizard.step} labels={EVALUATOR_STEP_LABELS} />;
+
+  return (
+    <Screen title="Add Evaluator" onExit={onExit} helpText={helpText} headerContent={headerContent}>
+      <Panel>
+        {isNameStep && (
+          <TextInput
+            key="name"
+            prompt="Evaluator name"
+            initialValue={generateUniqueName('MyEvaluator', existingEvaluatorNames)}
+            onSubmit={wizard.setName}
+            onCancel={onExit}
+            schema={EvaluatorNameSchema}
+            customValidation={value => !existingEvaluatorNames.includes(value) || 'Evaluator name already exists'}
+          />
+        )}
+
+        {isLevelStep && (
+          <WizardSelect
+            title="Evaluation level"
+            description="Granularity of evaluation"
+            items={levelItems}
+            selectedIndex={levelNav.selectedIndex}
+          />
+        )}
+
+        {isModelStep && (
+          <TextInput
+            key="model"
+            prompt="Bedrock model ID"
+            initialValue={DEFAULT_MODEL}
+            onSubmit={wizard.setModel}
+            onCancel={() => wizard.goBack()}
+          />
+        )}
+
+        {isInstructionsStep && (
+          <TextInput
+            key="instructions"
+            prompt={`Evaluation instructions (must include at least one: ${LEVEL_PLACEHOLDERS[wizard.config.level].map(p => `{${p}}`).join(', ')})`}
+            initialValue={DEFAULT_INSTRUCTIONS[wizard.config.level]}
+            onSubmit={wizard.setInstructions}
+            onCancel={() => wizard.goBack()}
+            customValidation={value => validateInstructionPlaceholders(value, wizard.config.level)}
+          />
+        )}
+
+        {isRatingScaleStep && (
+          <WizardSelect
+            title="Rating scale"
+            description="Choose a rating scale preset"
+            items={ratingScaleItems}
+            selectedIndex={ratingScaleNav.selectedIndex}
+          />
+        )}
+
+        {isConfirmStep && (
+          <ConfirmReview
+            fields={[
+              { label: 'Name', value: wizard.config.name },
+              { label: 'Level', value: wizard.config.level },
+              { label: 'Model', value: wizard.config.config.llmAsAJudge.model },
+              {
+                label: 'Instructions',
+                value:
+                  wizard.config.config.llmAsAJudge.instructions.length > 60
+                    ? wizard.config.config.llmAsAJudge.instructions.slice(0, 60) + '...'
+                    : wizard.config.config.llmAsAJudge.instructions,
+              },
+              { label: 'Rating Scale', value: formatRatingScale(wizard.config.config.llmAsAJudge.ratingScale) },
+            ]}
+          />
+        )}
+      </Panel>
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/evaluator/index.ts b/src/cli/tui/screens/evaluator/index.ts
new file mode 100644
index 00000000..1e85211d
--- /dev/null
+++ b/src/cli/tui/screens/evaluator/index.ts
@@ -0,0 +1,2 @@
+export { AddEvaluatorFlow } from './AddEvaluatorFlow';
+export { AddEvaluatorScreen } from './AddEvaluatorScreen';
diff --git a/src/cli/tui/screens/evaluator/types.ts b/src/cli/tui/screens/evaluator/types.ts
new file mode 100644
index 00000000..f22a56c4
--- /dev/null
+++ b/src/cli/tui/screens/evaluator/types.ts
@@ -0,0 +1,131 @@
+import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Evaluator Flow Types
+// ─────────────────────────────────────────────────────────────────────────────
+
+export type AddEvaluatorStep = 'name' | 'level' | 'model' | 'instructions' | 'ratingScale' | 'confirm'; // wizard order
+/** Values accumulated by the add-evaluator wizard as the user completes each step. */
+export interface AddEvaluatorConfig {
+  name: string; // set by the 'name' step
+  level: EvaluationLevel; // set by the 'level' step; also selects the allowed instruction placeholders
+  config: EvaluatorConfig; // llmAsAJudge settings: model, instructions and ratingScale
+}
+
+export const EVALUATOR_STEP_LABELS: Record<AddEvaluatorStep, string> = {
+  name: 'Name',
+  level: 'Level',
+  model: 'Model',
+  instructions: 'Prompt', // shortened label for the instructions step
+  ratingScale: 'Scale', // shortened label for the rating-scale step
+  confirm: 'Confirm',
+}; // display label for every wizard step, keyed by step id
+
+// ─────────────────────────────────────────────────────────────────────────────
+// UI Option Constants
+// ─────────────────────────────────────────────────────────────────────────────
+
+export const EVALUATION_LEVEL_OPTIONS = [
+  { id: 'SESSION', title: 'Session', description: 'Evaluate entire conversation sessions' },
+  { id: 'TRACE', title: 'Trace', description: 'Evaluate individual agent traces' },
+  { id: 'TOOL_CALL', title: 'Tool Call', description: 'Evaluate individual tool calls' },
+] as const; // ids are the same level names used as keys of LEVEL_PLACEHOLDERS
+
+export const DEFAULT_MODEL = 'us.anthropic.claude-sonnet-4-5-20250929-v1:0'; // judge model pre-filled by the wizard's default config
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Placeholder Constants
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Placeholder names allowed in evaluator instructions, keyed by evaluation level. Names are stored
+ * without braces and matched as `{name}`; the API requires at least one from the evaluator's level.
+ */
+export const LEVEL_PLACEHOLDERS: Record<EvaluationLevel, string[]> = {
+  SESSION: ['available_tools', 'context', 'actual_trajectory', 'expected_trajectory', 'assertions'],
+  TRACE: ['available_tools', 'context', 'actual_trajectory', 'expected_trajectory', 'assertions'],
+  TOOL_CALL: ['tool_name', 'tool_input', 'tool_output', 'context'],
+};
+
+/**
+ * Default instruction template per level; each already contains placeholders valid for that level (see LEVEL_PLACEHOLDERS).
+ */
+export const DEFAULT_INSTRUCTIONS: Record<EvaluationLevel, string> = {
+  SESSION:
+    'Evaluate the agent session. Context: {context}. The agent trajectory was: {actual_trajectory}. Rate the overall quality of the response.',
+  TRACE:
+    'Evaluate the agent trace. Context: {context}. The agent trajectory was: {actual_trajectory}. Rate the quality of this trace.',
+  TOOL_CALL:
+    'Evaluate the tool call. Tool: {tool_name}. Input: {tool_input}. Output: {tool_output}. Rate the quality of this tool usage.',
+};
+
+/**
+ * Checks that `instructions` references at least one `{placeholder}` allowed for `level`.
+ * Returns `true` when valid, otherwise an error message listing the allowed placeholders.
+ */
+export function validateInstructionPlaceholders(instructions: string, level: EvaluationLevel): string | true {
+  const tokens = LEVEL_PLACEHOLDERS[level].map(name => `{${name}}`);
+  for (const token of tokens) {
+    if (instructions.includes(token)) return true;
+  }
+  return `Instructions must contain at least one placeholder: ${tokens.join(', ')}`;
+}
+
+export interface RatingScalePreset {
+  id: string; // identifier of the preset, e.g. '1-5-quality'
+  title: string; // human-readable name of the preset
+  description: string; // one-line explanation of the preset
+  ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']; // same type as the config's llmAsAJudge.ratingScale
+}
+
+export const RATING_SCALE_PRESETS: RatingScalePreset[] = [ // two numerical and two categorical presets
+  {
+    id: '1-5-quality', // numerical scale, values 1 through 5
+    title: '1–5 Quality (Numerical)',
+    description: 'Five-point quality scale from Poor to Excellent',
+    ratingScale: {
+      numerical: [
+        { value: 1, label: 'Poor', definition: 'Fails to meet expectations' },
+        { value: 2, label: 'Fair', definition: 'Partially meets expectations' },
+        { value: 3, label: 'Good', definition: 'Meets expectations' },
+        { value: 4, label: 'Very Good', definition: 'Exceeds expectations' },
+        { value: 5, label: 'Excellent', definition: 'Far exceeds expectations' },
+      ],
+    },
+  },
+  {
+    id: '1-3-simple', // numerical scale, values 1 through 3
+    title: '1–3 Simple (Numerical)',
+    description: 'Three-point scale: Low, Medium, High',
+    ratingScale: {
+      numerical: [
+        { value: 1, label: 'Low', definition: 'Below acceptable quality' },
+        { value: 2, label: 'Medium', definition: 'Acceptable quality' },
+        { value: 3, label: 'High', definition: 'Above acceptable quality' },
+      ],
+    },
+  },
+  {
+    id: 'pass-fail', // categorical scale with two labels
+    title: 'Pass / Fail (Categorical)',
+    description: 'Binary pass or fail assessment',
+    ratingScale: {
+      categorical: [
+        { label: 'Pass', definition: 'Meets the evaluation criteria' },
+        { label: 'Fail', definition: 'Does not meet the evaluation criteria' },
+      ],
+    },
+  },
+  {
+    id: 'good-neutral-bad', // categorical scale with three labels
+    title: 'Good / Neutral / Bad (Categorical)',
+    description: 'Three-tier categorical assessment',
+    ratingScale: {
+      categorical: [
+        { label: 'Good', definition: 'Positive outcome, meets or exceeds criteria' },
+        { label: 'Neutral', definition: 'Acceptable but unremarkable outcome' },
+        { label: 'Bad', definition: 'Negative outcome, fails to meet criteria' },
+      ],
+    },
+  },
+];
diff --git a/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts b/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts
new file mode 100644
index 00000000..6288eab9
--- /dev/null
+++ b/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts
@@ -0,0 +1,121 @@
+import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
+import type { AddEvaluatorConfig, AddEvaluatorStep } from './types';
+import { DEFAULT_MODEL } from './types';
+import { useCallback, useState } from 'react';
+
+const ALL_STEPS: AddEvaluatorStep[] = ['name', 'level', 'model', 'instructions', 'ratingScale', 'confirm'];
+
+/**
+ * Builds the wizard's starting state: empty name, SESSION level, and an LLM-as-a-judge config
+ * with the default model, empty instructions and a two-point numerical rating scale.
+ */
+function getDefaultConfig(): AddEvaluatorConfig {
+  const ratingScale = {
+    numerical: [
+      { value: 1, label: 'Poor', definition: 'Fails to meet expectations' },
+      { value: 5, label: 'Excellent', definition: 'Far exceeds expectations' },
+    ],
+  };
+  const llmAsAJudge = { model: DEFAULT_MODEL, instructions: '', ratingScale };
+  return {
+    name: '',
+    level: 'SESSION',
+    config: { llmAsAJudge },
+  };
+}
+
+/**
+ * Wizard state for adding an evaluator: the config being built plus the current step.
+ *
+ * Every setter stores its value and then advances to the step after the one it belongs to.
+ * `goBack` moves to the previous step, `reset` restores the defaults, and `steps` /
+ * `currentIndex` expose the full step list and the position of the current step.
+ */
+export function useAddEvaluatorWizard() {
+  const [config, setConfig] = useState<AddEvaluatorConfig>(getDefaultConfig);
+  const [step, setStep] = useState<AddEvaluatorStep>('name');
+
+  const currentIndex = ALL_STEPS.indexOf(step);
+
+  // Step back one position; on the first step there is no previous entry, so nothing happens.
+  const goBack = useCallback(() => {
+    const previous = ALL_STEPS[currentIndex - 1];
+    if (previous) setStep(previous);
+  }, [currentIndex]);
+
+  // Advance to the step after `from`; a no-op when `from` is the final step.
+  const advanceFrom = useCallback((from: AddEvaluatorStep) => {
+    const following = ALL_STEPS[ALL_STEPS.indexOf(from) + 1];
+    if (following) setStep(following);
+  }, []);
+
+  // Overwrite the given llmAsAJudge fields and keep the remaining ones.
+  const patchJudge = useCallback((patch: Partial<EvaluatorConfig['llmAsAJudge']>) => {
+    setConfig(prev => ({
+      ...prev,
+      config: { llmAsAJudge: { ...prev.config.llmAsAJudge, ...patch } },
+    }));
+  }, []);
+
+  // Top-level fields (name, level) are written directly on the config.
+  const setName = useCallback(
+    (name: string) => {
+      setConfig(prev => ({ ...prev, name }));
+      advanceFrom('name');
+    },
+    [advanceFrom]
+  );
+
+  const setLevel = useCallback(
+    (level: EvaluationLevel) => {
+      setConfig(prev => ({ ...prev, level }));
+      advanceFrom('level');
+    },
+    [advanceFrom]
+  );
+
+  // The remaining setters only touch fields inside config.llmAsAJudge.
+  const setModel = useCallback(
+    (model: string) => {
+      patchJudge({ model });
+      advanceFrom('model');
+    },
+    [advanceFrom, patchJudge]
+  );
+
+  const setInstructions = useCallback(
+    (instructions: string) => {
+      patchJudge({ instructions });
+      advanceFrom('instructions');
+    },
+    [advanceFrom, patchJudge]
+  );
+
+  const setRatingScale = useCallback(
+    (ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']) => {
+      patchJudge({ ratingScale });
+      advanceFrom('ratingScale');
+    },
+    [advanceFrom, patchJudge]
+  );
+
+  // Discard everything entered so far and return to the first step.
+  const reset = useCallback(() => {
+    setConfig(getDefaultConfig());
+    setStep('name');
+  }, []);
+
+  return {
+    config,
+    step,
+    steps: ALL_STEPS,
+    currentIndex,
+    goBack,
+    setName,
+    setLevel,
+    setModel,
+    setInstructions,
+    setRatingScale,
+    reset,
+  };
+}
diff --git a/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx b/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx
new file mode 100644
index 00000000..d838f429
--- /dev/null
+++ b/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx
@@ -0,0 +1,86 @@
+import { ErrorPrompt } from '../../components';
+import { useExistingEvaluatorNames } from '../../hooks/useCreateEvaluator';
+import { useAvailableAgents } from '../../hooks/useCreateMcp';
+import { useCreateOnlineEval, useExistingOnlineEvalNames } from '../../hooks/useCreateOnlineEval';
+import { AddSuccessScreen } from '../add/AddSuccessScreen';
+import { AddOnlineEvalScreen } from './AddOnlineEvalScreen';
+import type { AddOnlineEvalConfig } from './types';
+import React, { useCallback, useEffect, useState } from 'react';
+
+type FlowState =
+  | { name: 'create-wizard' } // the wizard screen is shown
+  | { name: 'create-success'; configName: string } // creation succeeded; configName appears in the success message
+  | { name: 'error'; message: string }; // creation failed; message is shown as the error detail
+
+interface AddOnlineEvalFlowProps {
+  isInteractive?: boolean; // defaults to true; when false the flow calls onExit as soon as creation succeeds
+  onExit: () => void;
+  onBack: () => void; // used to leave the wizard and as the success screen's onAddAnother action
+  onDev?: () => void; // forwarded to AddSuccessScreen
+  onDeploy?: () => void; // forwarded to AddSuccessScreen
+}
+
+/** Add-online-eval flow: the wizard, then a success screen or an error prompt that leads back to the wizard. */
+export function AddOnlineEvalFlow({ isInteractive = true, onExit, onBack, onDev, onDeploy }: AddOnlineEvalFlowProps) {
+  const { createOnlineEval, reset: resetCreate } = useCreateOnlineEval();
+  const { names: existingConfigNames } = useExistingOnlineEvalNames();
+  const { agents: availableAgents } = useAvailableAgents();
+  const { names: availableEvaluators } = useExistingEvaluatorNames();
+  const [flow, setFlow] = useState<FlowState>({ name: 'create-wizard' });
+
+  // In non-interactive mode, exit as soon as creation has succeeded.
+  useEffect(() => {
+    if (isInteractive || flow.name !== 'create-success') return;
+    onExit();
+  }, [isInteractive, flow.name, onExit]);
+
+  // Run the creation and move to the success or error state depending on the result.
+  const handleCreateComplete = useCallback(
+    (config: AddOnlineEvalConfig) => {
+      void createOnlineEval(config).then(result => {
+        const next: FlowState = result.ok
+          ? { name: 'create-success', configName: result.configName }
+          : { name: 'error', message: result.error };
+        setFlow(next);
+      });
+    },
+    [createOnlineEval]
+  );
+
+  switch (flow.name) {
+    case 'create-wizard':
+      return (
+        <AddOnlineEvalScreen
+          existingConfigNames={existingConfigNames}
+          availableAgents={availableAgents}
+          availableEvaluators={availableEvaluators}
+          onComplete={handleCreateComplete}
+          onExit={onBack}
+        />
+      );
+    case 'create-success':
+      return (
+        <AddSuccessScreen
+          isInteractive={isInteractive}
+          message={`Added online eval config: ${flow.configName}`}
+          detail="Online eval config added to project in `agentcore/agentcore.json`. Deploy with `agentcore deploy`."
+          onAddAnother={onBack}
+          onDev={onDev}
+          onDeploy={onDeploy}
+          onExit={onExit}
+        />
+      );
+    default: // only the 'error' state remains
+      return (
+        <ErrorPrompt
+          message="Failed to add online eval config"
+          detail={flow.message}
+          onBack={() => {
+            resetCreate();
+            setFlow({ name: 'create-wizard' });
+          }}
+          onExit={onExit}
+        />
+      );
+  }
+}
diff --git a/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx b/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx
new file mode 100644
index 00000000..7390a874
--- /dev/null
+++ b/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx
@@ -0,0 +1,151 @@
+import { OnlineEvalConfigNameSchema } from '../../../../schema';
+import type { SelectableItem } from '../../components';
+import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardMultiSelect } from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation, useMultiSelectNavigation } from '../../hooks';
+import { generateUniqueName } from '../../utils';
+import type { AddOnlineEvalConfig } from './types';
+import { BUILTIN_EVALUATORS, DEFAULT_SAMPLING_RATE, ONLINE_EVAL_STEP_LABELS } from './types';
+import { useAddOnlineEvalWizard } from './useAddOnlineEvalWizard';
+import React, { useMemo } from 'react';
+
+interface AddOnlineEvalScreenProps {
+  onComplete: (config: AddOnlineEvalConfig) => void; // called with the wizard config when the confirm entry is selected
+  onExit: () => void; // passed to Screen as onExit and used as the name step's onCancel
+  existingConfigNames: string[]; // names already taken; used for the default name and the duplicate check
+  availableAgents: string[]; // agent names offered in the agents step
+  availableEvaluators: string[]; // custom evaluator names, listed before the built-in evaluators
+}
+
+/** Wizard screen for an online eval config: name → agents → evaluators → sampling rate → confirm. */
+export function AddOnlineEvalScreen({
+  onComplete,
+  onExit,
+  existingConfigNames,
+  availableAgents,
+  availableEvaluators,
+}: AddOnlineEvalScreenProps) {
+  const wizard = useAddOnlineEvalWizard();
+
+  const agentItems: SelectableItem[] = useMemo(
+    () => availableAgents.map(name => ({ id: name, title: name, description: 'Agent' })),
+    [availableAgents]
+  );
+
+  const evaluatorItems: SelectableItem[] = useMemo(() => {
+    const custom = availableEvaluators.map(name => ({ id: name, title: name, description: 'Custom evaluator' }));
+    const builtin = BUILTIN_EVALUATORS.map(b => ({ id: b.id, title: b.title, description: b.description }));
+    return [...custom, ...builtin]; // custom evaluators are listed before the built-in ones
+  }, [availableEvaluators]);
+
+  const isNameStep = wizard.step === 'name';
+  const isAgentsStep = wizard.step === 'agents';
+  const isEvaluatorsStep = wizard.step === 'evaluators';
+  const isSamplingRateStep = wizard.step === 'samplingRate';
+  const isConfirmStep = wizard.step === 'confirm';
+
+  const agentsNav = useMultiSelectNavigation({
+    items: agentItems,
+    getId: item => item.id,
+    onConfirm: ids => wizard.setAgents(ids),
+    onExit: () => wizard.goBack(),
+    isActive: isAgentsStep,
+    requireSelection: true,
+  });
+
+  const evaluatorsNav = useMultiSelectNavigation({
+    items: evaluatorItems,
+    getId: item => item.id,
+    onConfirm: ids => wizard.setEvaluators(ids),
+    onExit: () => wizard.goBack(),
+    isActive: isEvaluatorsStep,
+    requireSelection: true,
+  });
+
+  useListNavigation({
+    items: [{ id: 'confirm', title: 'Confirm' }], // single entry; selecting it completes the wizard
+    onSelect: () => onComplete(wizard.config),
+    onExit: () => wizard.goBack(),
+    isActive: isConfirmStep,
+  });
+
+  const helpText =
+    isAgentsStep || isEvaluatorsStep
+      ? 'Space toggle · Enter confirm · Esc back'
+      : isConfirmStep
+        ? HELP_TEXT.CONFIRM_CANCEL
+        : HELP_TEXT.TEXT_INPUT;
+
+  const headerContent = (
+    <StepIndicator steps={wizard.steps} currentStep={wizard.step} labels={ONLINE_EVAL_STEP_LABELS} />
+  );
+
+  const validateSamplingRate = (value: string): string | true => {
+    const rate = value.trim() === '' ? NaN : Number(value); // Number() rejects "10abc"; parseFloat would read it as 10
+    if (isNaN(rate)) return 'Must be a number';
+    return rate < 0.01 || rate > 100 ? 'Must be between 0.01 and 100' : true;
+  };
+
+  return (
+    <Screen title="Add Online Eval Config" onExit={onExit} helpText={helpText} headerContent={headerContent}>
+      <Panel>
+        {isNameStep && (
+          <TextInput
+            key="name"
+            prompt="Config name"
+            initialValue={generateUniqueName('MyOnlineEval', existingConfigNames)}
+            onSubmit={wizard.setName}
+            onCancel={onExit}
+            schema={OnlineEvalConfigNameSchema}
+            customValidation={value => !existingConfigNames.includes(value) || 'Config name already exists'}
+          />
+        )}
+
+        {isAgentsStep && (
+          <WizardMultiSelect
+            title="Select agents to monitor"
+            description="Choose which agents this config evaluates"
+            items={agentItems}
+            cursorIndex={agentsNav.cursorIndex}
+            selectedIds={agentsNav.selectedIds}
+          />
+        )}
+
+        {isEvaluatorsStep && (
+          <WizardMultiSelect
+            title="Select evaluators"
+            description="Choose custom and/or built-in evaluators"
+            items={evaluatorItems}
+            cursorIndex={evaluatorsNav.cursorIndex}
+            selectedIds={evaluatorsNav.selectedIds}
+          />
+        )}
+
+        {isSamplingRateStep && (
+          <TextInput
+            key="samplingRate"
+            prompt="Sampling rate (0.01–100%)"
+            initialValue={String(DEFAULT_SAMPLING_RATE)}
+            onSubmit={value => {
+              if (validateSamplingRate(value) === true) wizard.setSamplingRate(Number(value));
+            }}
+            onCancel={() => wizard.goBack()}
+            customValidation={validateSamplingRate}
+          />
+        )}
+
+        {isConfirmStep && (
+          <ConfirmReview
+            fields={[
+              { label: 'Name', value: wizard.config.name },
+              { label: 'Agents', value: wizard.config.agents.join(', ') },
+              { label: 'Evaluators', value: wizard.config.evaluators.join(', ') },
+              { label: 'Sampling Rate', value: `${wizard.config.samplingRate}%` },
+              { label: 'Enable on Create', value: wizard.config.enableOnCreate ? 'Yes' : 'No' },
+            ]}
+          />
+        )}
+      </Panel>
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/online-eval/index.ts b/src/cli/tui/screens/online-eval/index.ts
new file mode 100644
index 00000000..fcd0d5f4
--- /dev/null
+++ b/src/cli/tui/screens/online-eval/index.ts
@@ -0,0 +1,2 @@
+export { AddOnlineEvalFlow } from './AddOnlineEvalFlow';
+export { AddOnlineEvalScreen } from './AddOnlineEvalScreen';
diff --git a/src/cli/tui/screens/online-eval/types.ts b/src/cli/tui/screens/online-eval/types.ts
new file mode 100644
index 00000000..0c2d70b7
--- /dev/null
+++ b/src/cli/tui/screens/online-eval/types.ts
@@ -0,0 +1,41 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// Online Eval Config Flow Types
+// ─────────────────────────────────────────────────────────────────────────────
+
+export type AddOnlineEvalStep = 'name' | 'agents' | 'evaluators' | 'samplingRate' | 'confirm'; // wizard order
+/** Values accumulated by the add-online-eval wizard as the user completes each step. */
+export interface AddOnlineEvalConfig {
+  name: string; // set by the 'name' step
+  agents: string[]; // names of the agents selected in the 'agents' step
+  evaluators: string[]; // ids of the selected custom and/or built-in evaluators
+  samplingRate: number; // percentage; the wizard accepts 0.01 to 100
+  enableOnCreate: boolean; // true in the wizard's default config; no wizard step changes it
+}
+
+export const ONLINE_EVAL_STEP_LABELS: Record<AddOnlineEvalStep, string> = {
+  name: 'Name',
+  agents: 'Agents',
+  evaluators: 'Evaluators',
+  samplingRate: 'Rate', // shortened label for the sampling-rate step
+  confirm: 'Confirm',
+}; // labels passed to StepIndicator by AddOnlineEvalScreen
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Built-in Evaluators
+// ─────────────────────────────────────────────────────────────────────────────
+
+export const BUILTIN_EVALUATORS = [
+  { id: 'Builtin.Helpfulness', title: 'Builtin.Helpfulness', description: 'Measures how helpful agent responses are' },
+  {
+    id: 'Builtin.GoalSuccessRate',
+    title: 'Builtin.GoalSuccessRate',
+    description: 'Measures whether the agent achieved the user goal',
+  },
+  {
+    id: 'Builtin.Faithfulness',
+    title: 'Builtin.Faithfulness',
+    description: 'Measures factual consistency with source material',
+  },
+] as const; // offered after the project's custom evaluators in the evaluators step; id and title are identical
+
+export const DEFAULT_SAMPLING_RATE = 10; // percent; initial value of the sampling-rate input and of the wizard config
diff --git a/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
new file mode 100644
index 00000000..a4743cb3
--- /dev/null
+++ b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
@@ -0,0 +1,86 @@
+import type { AddOnlineEvalConfig, AddOnlineEvalStep } from './types';
+import { DEFAULT_SAMPLING_RATE } from './types';
+import { useCallback, useState } from 'react';
+
+const ALL_STEPS: AddOnlineEvalStep[] = ['name', 'agents', 'evaluators', 'samplingRate', 'confirm'];
+
+function getDefaultConfig(): AddOnlineEvalConfig {
+  return {
+    name: '', // filled in by the 'name' step
+    agents: [], // nothing selected yet
+    evaluators: [], // nothing selected yet
+    samplingRate: DEFAULT_SAMPLING_RATE,
+    enableOnCreate: true, // no setter in the wizard hook changes this
+  };
+}
+
+/** Wizard state for adding an online eval config; each setter stores its value and moves to the next step. */
+export function useAddOnlineEvalWizard() {
+  const [config, setConfig] = useState<AddOnlineEvalConfig>(getDefaultConfig);
+  const [step, setStep] = useState<AddOnlineEvalStep>('name');
+
+  const currentIndex = ALL_STEPS.indexOf(step);
+
+  // Step back one position; on the first step there is no previous entry, so nothing happens.
+  const goBack = useCallback(() => {
+    const previous = ALL_STEPS[currentIndex - 1];
+    if (previous) setStep(previous);
+  }, [currentIndex]);
+
+  // Advance to the step after `from`; a no-op when `from` is the final step.
+  const advanceFrom = useCallback((from: AddOnlineEvalStep) => {
+    const following = ALL_STEPS[ALL_STEPS.indexOf(from) + 1];
+    if (following) setStep(following);
+  }, []);
+
+  const setName = useCallback(
+    (name: string) => {
+      setConfig(prev => ({ ...prev, name }));
+      advanceFrom('name');
+    },
+    [advanceFrom]
+  );
+
+  const setAgents = useCallback(
+    (agents: string[]) => {
+      setConfig(prev => ({ ...prev, agents }));
+      advanceFrom('agents');
+    },
+    [advanceFrom]
+  );
+
+  const setEvaluators = useCallback(
+    (evaluators: string[]) => {
+      setConfig(prev => ({ ...prev, evaluators }));
+      advanceFrom('evaluators');
+    },
+    [advanceFrom]
+  );
+
+  const setSamplingRate = useCallback(
+    (samplingRate: number) => {
+      setConfig(prev => ({ ...prev, samplingRate }));
+      advanceFrom('samplingRate');
+    },
+    [advanceFrom]
+  );
+
+  // Discard everything entered so far and return to the first step.
+  const reset = useCallback(() => {
+    setConfig(getDefaultConfig());
+    setStep('name');
+  }, []);
+
+  return {
+    config,
+    step,
+    steps: ALL_STEPS,
+    currentIndex,
+    goBack,
+    setName,
+    setAgents,
+    setEvaluators,
+    setSamplingRate,
+    reset,
+  };
+}
diff --git a/src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx b/src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx
new file mode 100644
index 00000000..a0d5f0fa
--- /dev/null
+++ b/src/cli/tui/screens/remove/RemoveEvaluatorScreen.tsx
@@ -0,0 +1,26 @@
+import type { RemovableEvaluator } from '../../../primitives/EvaluatorPrimitive';
+import { SelectScreen } from '../../components';
+import React from 'react';
+
+interface RemoveEvaluatorScreenProps {
+  evaluators: RemovableEvaluator[]; // evaluators offered for removal
+  onSelect: (evaluatorName: string) => void; // receives the selected item's id, which is the evaluator name
+  onExit: () => void; // forwarded to SelectScreen
+}
+
+/**
+ * Selection screen listing the removable evaluators; picking one calls `onSelect` with its name.
+ */
+export function RemoveEvaluatorScreen({ evaluators, onSelect, onExit }: RemoveEvaluatorScreenProps) {
+  // Each evaluator's name doubles as the list item's id and title.
+  const items = evaluators.map(({ name }) => ({ id: name, title: name, description: 'Custom Evaluator' }));
+
+  return (
+    <SelectScreen
+      title="Select Evaluator to Remove"
+      items={items}
+      onSelect={item => onSelect(item.id)}
+      onExit={onExit}
+    />
+  );
+}
diff --git a/src/cli/tui/screens/remove/RemoveFlow.tsx b/src/cli/tui/screens/remove/RemoveFlow.tsx
index 066874bb..635f18cb 100644
--- a/src/cli/tui/screens/remove/RemoveFlow.tsx
+++ b/src/cli/tui/screens/remove/RemoveFlow.tsx
@@ -2,24 +2,30 @@ import type { RemovableGatewayTarget, RemovalPreview } from '../../../operations
 import { ErrorPrompt, Panel, Screen } from '../../components';
 import {
   useRemovableAgents,
+  useRemovableEvaluators,
   useRemovableGatewayTargets,
   useRemovableGateways,
   useRemovableIdentities,
   useRemovableMemories,
+  useRemovableOnlineEvalConfigs,
   useRemovalPreview,
   useRemoveAgent,
+  useRemoveEvaluator,
   useRemoveGateway,
   useRemoveGatewayTarget,
   useRemoveIdentity,
   useRemoveMemory,
+  useRemoveOnlineEvalConfig,
 } from '../../hooks/useRemove';
 import { RemoveAgentScreen } from './RemoveAgentScreen';
 import { RemoveAllScreen } from './RemoveAllScreen';
 import { RemoveConfirmScreen } from './RemoveConfirmScreen';
+import { RemoveEvaluatorScreen } from './RemoveEvaluatorScreen';
 import { RemoveGatewayScreen } from './RemoveGatewayScreen';
 import { RemoveGatewayTargetScreen } from './RemoveGatewayTargetScreen';
 import { RemoveIdentityScreen } from './RemoveIdentityScreen';
 import { RemoveMemoryScreen } from './RemoveMemoryScreen';
+import { RemoveOnlineEvalScreen } from './RemoveOnlineEvalScreen';
 import type { RemoveResourceType } from './RemoveScreen';
 import { RemoveScreen } from './RemoveScreen';
 import { RemoveSuccessScreen } from './RemoveSuccessScreen';
@@ -34,17 +40,23 @@ type FlowState =
   | { name: 'select-gateway-target' }
   | { name: 'select-memory' }
   | { name: 'select-identity' }
+  | { name: 'select-evaluator' }
+  | { name: 'select-online-eval' }
   | { name: 'confirm-agent'; agentName: string; preview: RemovalPreview }
   | { name: 'confirm-gateway'; gatewayName: string; preview: RemovalPreview }
   | { name: 'confirm-gateway-target'; tool: RemovableGatewayTarget; preview: RemovalPreview }
   | { name: 'confirm-memory'; memoryName: string; preview: RemovalPreview }
   | { name: 'confirm-identity'; identityName: string; preview: RemovalPreview }
+  | { name: 'confirm-evaluator'; evaluatorName: string; preview: RemovalPreview }
+  | { name: 'confirm-online-eval'; configName: string; preview: RemovalPreview }
   | { name: 'loading'; message: string }
   | { name: 'agent-success'; agentName: string; logFilePath?: string }
   | { name: 'gateway-success'; gatewayName: string; logFilePath?: string }
   | { name: 'tool-success'; toolName: string; logFilePath?: string }
   | { name: 'memory-success'; memoryName: string; logFilePath?: string }
   | { name: 'identity-success'; identityName: string; logFilePath?: string }
+  | { name: 'evaluator-success'; evaluatorName: string; logFilePath?: string }
+  | { name: 'online-eval-success'; configName: string; logFilePath?: string }
   | { name: 'remove-all' }
   | { name: 'error'; message: string };
 
@@ -57,7 +69,7 @@ interface RemoveFlowProps {
   /** Force mode - skip confirmation */
   force?: boolean;
   /** Initial resource type to start at (for CLI subcommands) */
-  initialResourceType?: 'agent' | 'gateway' | 'gateway-target' | 'memory' | 'identity';
+  initialResourceType?: 'agent' | 'gateway' | 'gateway-target' | 'memory' | 'identity' | 'evaluator' | 'online-eval';
   /** Initial resource name to auto-select (for CLI --name flag) */
   initialResourceName?: string;
 }
@@ -83,6 +95,10 @@ export function RemoveFlow({
         return { name: 'select-memory' };
       case 'identity':
         return { name: 'select-identity' };
+      case 'evaluator':
+        return { name: 'select-evaluator' };
+      case 'online-eval':
+        return { name: 'select-online-eval' };
       default:
         return { name: 'select' };
     }
@@ -95,9 +111,22 @@ export function RemoveFlow({
   const { tools: mcpTools, isLoading: isLoadingTools, refresh: refreshTools } = useRemovableGatewayTargets();
   const { memories, isLoading: isLoadingMemories, refresh: refreshMemories } = useRemovableMemories();
   const { identities, isLoading: isLoadingIdentities, refresh: refreshIdentities } = useRemovableIdentities();
+  const { evaluators, isLoading: isLoadingEvaluators, refresh: refreshEvaluators } = useRemovableEvaluators();
+  const {
+    onlineEvalConfigs,
+    isLoading: isLoadingOnlineEvals,
+    refresh: refreshOnlineEvals,
+  } = useRemovableOnlineEvalConfigs();
 
   // Check if any data is still loading
-  const isLoading = isLoadingAgents || isLoadingGateways || isLoadingTools || isLoadingMemories || isLoadingIdentities;
+  const isLoading =
+    isLoadingAgents ||
+    isLoadingGateways ||
+    isLoadingTools ||
+    isLoadingMemories ||
+    isLoadingIdentities ||
+    isLoadingEvaluators ||
+    isLoadingOnlineEvals;
 
   // Preview hook
   const {
@@ -106,6 +135,8 @@ export function RemoveFlow({
     loadGatewayTargetPreview,
     loadMemoryPreview,
     loadIdentityPreview,
+    loadEvaluatorPreview,
+    loadOnlineEvalPreview,
     reset: resetPreview,
   } = useRemovalPreview();
 
@@ -115,6 +146,8 @@ export function RemoveFlow({
   const { remove: removeGatewayTargetOp, reset: resetRemoveGatewayTarget } = useRemoveGatewayTarget();
   const { remove: removeMemoryOp, reset: resetRemoveMemory } = useRemoveMemory();
   const { remove: removeIdentityOp, reset: resetRemoveIdentity } = useRemoveIdentity();
+  const { remove: removeEvaluatorOp, reset: resetRemoveEvaluator } = useRemoveEvaluator();
+  const { remove: removeOnlineEvalOp, reset: resetRemoveOnlineEval } = useRemoveOnlineEvalConfig();
 
   // Track pending result state
   const pendingResultRef = useRef<FlowState | null>(null);
@@ -135,7 +168,15 @@ export function RemoveFlow({
   // In non-interactive mode, exit after success
   useEffect(() => {
     if (!isInteractive) {
-      const successStates = ['agent-success', 'gateway-success', 'tool-success', 'memory-success', 'identity-success'];
+      const successStates = [
+        'agent-success',
+        'gateway-success',
+        'tool-success',
+        'memory-success',
+        'identity-success',
+        'evaluator-success',
+        'online-eval-success',
+      ];
       if (successStates.includes(flow.name)) {
         onExit();
       }
@@ -162,6 +203,12 @@ export function RemoveFlow({
       case 'identity':
         setFlow({ name: 'select-identity' });
         break;
+      case 'evaluator':
+        setFlow({ name: 'select-evaluator' });
+        break;
+      case 'online-eval':
+        setFlow({ name: 'select-online-eval' });
+        break;
       case 'all':
         setFlow({ name: 'remove-all' });
         break;
@@ -281,6 +328,50 @@ export function RemoveFlow({
     [loadIdentityPreview, force, removeIdentityOp]
   );
 
+  const handleSelectEvaluator = useCallback(
+    async (evaluatorName: string) => {
+      const result = await loadEvaluatorPreview(evaluatorName);
+      if (result.ok) {
+        if (force) {
+          setFlow({ name: 'loading', message: `Removing evaluator ${evaluatorName}...` });
+          const removeResult = await removeEvaluatorOp(evaluatorName, result.preview);
+          if (removeResult.success) {
+            setFlow({ name: 'evaluator-success', evaluatorName });
+          } else {
+            setFlow({ name: 'error', message: removeResult.error });
+          }
+        } else {
+          setFlow({ name: 'confirm-evaluator', evaluatorName, preview: result.preview });
+        }
+      } else {
+        setFlow({ name: 'error', message: result.error });
+      }
+    },
+    [loadEvaluatorPreview, force, removeEvaluatorOp]
+  );
+
+  const handleSelectOnlineEval = useCallback(
+    async (configName: string) => {
+      const result = await loadOnlineEvalPreview(configName);
+      if (result.ok) {
+        if (force) {
+          setFlow({ name: 'loading', message: `Removing online eval config ${configName}...` });
+          const removeResult = await removeOnlineEvalOp(configName, result.preview);
+          if (removeResult.success) {
+            setFlow({ name: 'online-eval-success', configName });
+          } else {
+            setFlow({ name: 'error', message: removeResult.error });
+          }
+        } else {
+          setFlow({ name: 'confirm-online-eval', configName, preview: result.preview });
+        }
+      } else {
+        setFlow({ name: 'error', message: result.error });
+      }
+    },
+    [loadOnlineEvalPreview, force, removeOnlineEvalOp]
+  );
+
   // Auto-select resource when initialResourceName is provided and data is loaded
   useEffect(() => {
     if (!initialResourceName || isLoading || hasTriggeredInitialSelection.current) {
@@ -305,6 +396,12 @@ export function RemoveFlow({
         case 'identity':
           void handleSelectIdentity(initialResourceName);
           break;
+        case 'evaluator':
+          void handleSelectEvaluator(initialResourceName);
+          break;
+        case 'online-eval':
+          void handleSelectOnlineEval(initialResourceName);
+          break;
       }
     }, 0);
   }, [
@@ -315,6 +412,8 @@ export function RemoveFlow({
     handleSelectGateway,
     handleSelectMemory,
     handleSelectIdentity,
+    handleSelectEvaluator,
+    handleSelectOnlineEval,
   ]);
 
   // Confirm handlers - pass preview for logging
@@ -398,6 +497,38 @@ export function RemoveFlow({
     [removeIdentityOp]
   );
 
+  // Confirmed evaluator removal: switch the flow to 'loading', run the removal, then stage the
+  // outcome in pendingResultRef and mark it ready via setResultReady.
+  const handleConfirmEvaluator = useCallback(
+    async (evaluatorName: string, preview: RemovalPreview) => {
+      pendingResultRef.current = null;
+      setResultReady(false);
+      setFlow({ name: 'loading', message: `Removing evaluator ${evaluatorName}...` });
+      const outcome = await removeEvaluatorOp(evaluatorName, preview);
+      pendingResultRef.current = outcome.success
+        ? { name: 'evaluator-success', evaluatorName, logFilePath: outcome.logFilePath }
+        : { name: 'error', message: outcome.error };
+      setResultReady(true);
+    },
+    [removeEvaluatorOp]
+  );
+
+  // Confirmed online eval config removal: switch the flow to 'loading', run the removal, then
+  // stage the outcome in pendingResultRef and mark it ready via setResultReady.
+  const handleConfirmOnlineEval = useCallback(
+    async (configName: string, preview: RemovalPreview) => {
+      pendingResultRef.current = null;
+      setResultReady(false);
+      setFlow({ name: 'loading', message: `Removing online eval config ${configName}...` });
+      const outcome = await removeOnlineEvalOp(configName, preview);
+      pendingResultRef.current = outcome.success
+        ? { name: 'online-eval-success', configName, logFilePath: outcome.logFilePath }
+        : { name: 'error', message: outcome.error };
+      setResultReady(true);
+    },
+    [removeOnlineEvalOp]
+  );
+
   const resetAll = useCallback(() => {
     resetPreview();
     resetRemoveAgent();
@@ -405,6 +536,8 @@ export function RemoveFlow({
     resetRemoveGatewayTarget();
     resetRemoveMemory();
     resetRemoveIdentity();
+    resetRemoveEvaluator();
+    resetRemoveOnlineEval();
   }, [
     resetPreview,
     resetRemoveAgent,
@@ -412,11 +545,29 @@ export function RemoveFlow({
     resetRemoveGatewayTarget,
     resetRemoveMemory,
     resetRemoveIdentity,
+    resetRemoveEvaluator,
+    resetRemoveOnlineEval,
   ]);
 
   const refreshAll = useCallback(async () => {
-    await Promise.all([refreshAgents(), refreshGateways(), refreshTools(), refreshMemories(), refreshIdentities()]);
-  }, [refreshAgents, refreshGateways, refreshTools, refreshMemories, refreshIdentities]);
+    await Promise.all([
+      refreshAgents(),
+      refreshGateways(),
+      refreshTools(),
+      refreshMemories(),
+      refreshIdentities(),
+      refreshEvaluators(),
+      refreshOnlineEvals(),
+    ]);
+  }, [
+    refreshAgents,
+    refreshGateways,
+    refreshTools,
+    refreshMemories,
+    refreshIdentities,
+    refreshEvaluators,
+    refreshOnlineEvals,
+  ]);
 
   // Select screen - wait for data to load to avoid arrow position issues
   if (flow.name === 'select') {
@@ -432,6 +583,8 @@ export function RemoveFlow({
         mcpToolCount={mcpTools.length}
         memoryCount={memories.length}
         identityCount={identities.length}
+        evaluatorCount={evaluators.length}
+        onlineEvalCount={onlineEvalConfigs.length}
       />
     );
   }
@@ -514,6 +667,32 @@ export function RemoveFlow({
     );
   }
 
+  if (flow.name === 'select-evaluator') {
+    if (initialResourceName && isLoading) {
+      return null;
+    }
+    return (
+      <RemoveEvaluatorScreen
+        evaluators={evaluators}
+        onSelect={(name: string) => void handleSelectEvaluator(name)}
+        onExit={() => setFlow({ name: 'select' })}
+      />
+    );
+  }
+
+  if (flow.name === 'select-online-eval') {
+    if (initialResourceName && isLoading) {
+      return null;
+    }
+    return (
+      <RemoveOnlineEvalScreen
+        configs={onlineEvalConfigs}
+        onSelect={(name: string) => void handleSelectOnlineEval(name)}
+        onExit={() => setFlow({ name: 'select' })}
+      />
+    );
+  }
+
   // Confirmation screens
   if (flow.name === 'confirm-agent') {
     return (
@@ -570,6 +749,28 @@ export function RemoveFlow({
     );
   }
 
+  if (flow.name === 'confirm-evaluator') {
+    return (
+      <RemoveConfirmScreen
+        title={`Remove Evaluator: ${flow.evaluatorName}`}
+        preview={flow.preview}
+        onConfirm={() => void handleConfirmEvaluator(flow.evaluatorName, flow.preview)}
+        onCancel={() => setFlow({ name: 'select-evaluator' })}
+      />
+    );
+  }
+
+  if (flow.name === 'confirm-online-eval') {
+    return (
+      <RemoveConfirmScreen
+        title={`Remove Online Eval Config: ${flow.configName}`}
+        preview={flow.preview}
+        onConfirm={() => void handleConfirmOnlineEval(flow.configName, flow.preview)}
+        onCancel={() => setFlow({ name: 'select-online-eval' })}
+      />
+    );
+  }
+
   // Success screens
   if (flow.name === 'agent-success') {
     return (
@@ -651,6 +852,38 @@ export function RemoveFlow({
     );
   }
 
+  if (flow.name === 'evaluator-success') {
+    return (
+      <RemoveSuccessScreen
+        isInteractive={isInteractive}
+        message={`Removed evaluator: ${flow.evaluatorName}`}
+        detail="Evaluator removed from agentcore.json. Deploy with `agentcore deploy` to apply changes."
+        logFilePath={flow.logFilePath}
+        onRemoveAnother={() => {
+          resetAll();
+          void refreshAll().then(() => setFlow({ name: 'select' }));
+        }}
+        onExit={onExit}
+      />
+    );
+  }
+
+  if (flow.name === 'online-eval-success') {
+    return (
+      <RemoveSuccessScreen
+        isInteractive={isInteractive}
+        message={`Removed online eval config: ${flow.configName}`}
+        detail="Online eval config removed from agentcore.json. Deploy with `agentcore deploy` to apply changes."
+        logFilePath={flow.logFilePath}
+        onRemoveAnother={() => {
+          resetAll();
+          void refreshAll().then(() => setFlow({ name: 'select' }));
+        }}
+        onExit={onExit}
+      />
+    );
+  }
+
   // Remove all screen
   if (flow.name === 'remove-all') {
     return <RemoveAllScreen isInteractive={isInteractive} onExit={onExit} onNavigate={onNavigate} />;
diff --git a/src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx b/src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx
new file mode 100644
index 00000000..faab02f4
--- /dev/null
+++ b/src/cli/tui/screens/remove/RemoveOnlineEvalScreen.tsx
@@ -0,0 +1,26 @@
+import type { RemovableOnlineEvalConfig } from '../../../primitives/OnlineEvalConfigPrimitive';
+import { SelectScreen } from '../../components';
+import React from 'react';
+
+interface RemoveOnlineEvalScreenProps {
+  /** Online eval configs offered for removal. */
+  configs: RemovableOnlineEvalConfig[];
+  /** Called with the selected config's name. */
+  onSelect: (configName: string) => void;
+  /** Passed straight through to SelectScreen's onExit. */
+  onExit: () => void;
+}
+
+/** Renders a SelectScreen with one entry per online eval config; a selection reports that config's name. */
+export function RemoveOnlineEvalScreen({ configs, onSelect, onExit }: RemoveOnlineEvalScreenProps) {
+  const choices = configs.map(({ name }) => ({ id: name, title: name, description: 'Online Eval Config' }));
+
+  return (
+    <SelectScreen
+      title="Select Online Eval Config to Remove"
+      items={choices}
+      onSelect={choice => onSelect(choice.id)}
+      onExit={onExit}
+    />
+  );
+}
diff --git a/src/cli/tui/screens/remove/RemoveScreen.tsx b/src/cli/tui/screens/remove/RemoveScreen.tsx
index bcb7307c..59441d0e 100644
--- a/src/cli/tui/screens/remove/RemoveScreen.tsx
+++ b/src/cli/tui/screens/remove/RemoveScreen.tsx
@@ -6,6 +6,8 @@ const REMOVE_RESOURCES = [
   { id: 'agent', title: 'Agent', description: 'Remove an agent from the project' },
   { id: 'memory', title: 'Memory', description: 'Remove a memory provider' },
   { id: 'identity', title: 'Identity', description: 'Remove an identity provider' },
+  { id: 'evaluator', title: 'Evaluator', description: 'Remove a custom evaluator' },
+  { id: 'online-eval', title: 'Online Eval Config', description: 'Remove an online eval config' },
   { id: 'gateway', title: 'Gateway', description: 'Remove a gateway' },
   { id: 'gateway-target', title: 'Gateway Target', description: 'Remove a gateway target' },
   { id: 'all', title: 'All', description: 'Reset entire agentcore project' },
@@ -26,6 +28,10 @@ interface RemoveScreenProps {
   memoryCount: number;
   /** Number of identities available for removal */
   identityCount: number;
+  /** Number of evaluators available for removal */
+  evaluatorCount: number;
+  /** Number of online eval configs available for removal */
+  onlineEvalCount: number;
 }
 
 export function RemoveScreen({
@@ -36,6 +42,8 @@ export function RemoveScreen({
   mcpToolCount,
   memoryCount,
   identityCount,
+  evaluatorCount,
+  onlineEvalCount,
 }: RemoveScreenProps) {
   const items: SelectableItem[] = useMemo(() => {
     return REMOVE_RESOURCES.map(r => {
@@ -73,6 +81,18 @@ export function RemoveScreen({
             description = 'No identities to remove';
           }
           break;
+        case 'evaluator':
+          if (evaluatorCount === 0) {
+            disabled = true;
+            description = 'No evaluators to remove';
+          }
+          break;
+        case 'online-eval':
+          if (onlineEvalCount === 0) {
+            disabled = true;
+            description = 'No online eval configs to remove';
+          }
+          break;
         case 'all':
           // 'all' is always available
           break;
@@ -80,7 +100,7 @@ export function RemoveScreen({
 
       return { ...r, disabled, description };
     });
-  }, [agentCount, gatewayCount, mcpToolCount, memoryCount, identityCount]);
+  }, [agentCount, gatewayCount, mcpToolCount, memoryCount, identityCount, evaluatorCount, onlineEvalCount]);
 
   const isDisabled = (item: SelectableItem) => item.disabled ?? false;
 
diff --git a/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx b/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx
index e1e32e05..4d52e68c 100644
--- a/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx
+++ b/src/cli/tui/screens/remove/__tests__/RemoveScreen.test.tsx
@@ -17,6 +17,8 @@ describe('RemoveScreen', () => {
         mcpToolCount={1}
         memoryCount={1}
         identityCount={1}
+        evaluatorCount={1}
+        onlineEvalCount={1}
       />
     );
 
@@ -39,6 +41,8 @@ describe('RemoveScreen', () => {
         mcpToolCount={0}
         memoryCount={0}
         identityCount={0}
+        evaluatorCount={0}
+        onlineEvalCount={0}
       />
     );
 
diff --git a/src/cli/tui/screens/remove/index.ts b/src/cli/tui/screens/remove/index.ts
index 71d78c30..4a470fff 100644
--- a/src/cli/tui/screens/remove/index.ts
+++ b/src/cli/tui/screens/remove/index.ts
@@ -1,10 +1,12 @@
 export { RemoveAgentScreen } from './RemoveAgentScreen';
 export { RemoveAllScreen } from './RemoveAllScreen';
 export { RemoveConfirmScreen } from './RemoveConfirmScreen';
+export { RemoveEvaluatorScreen } from './RemoveEvaluatorScreen';
 export { RemoveFlow } from './RemoveFlow';
 export { RemoveGatewayScreen } from './RemoveGatewayScreen';
 export { RemoveIdentityScreen } from './RemoveIdentityScreen';
 export { RemoveGatewayTargetScreen } from './RemoveGatewayTargetScreen';
 export { RemoveMemoryScreen } from './RemoveMemoryScreen';
+export { RemoveOnlineEvalScreen } from './RemoveOnlineEvalScreen';
 export { RemoveScreen, type RemoveResourceType } from './RemoveScreen';
 export { RemoveSuccessScreen } from './RemoveSuccessScreen';
diff --git a/src/cli/tui/screens/remove/useRemoveFlow.ts b/src/cli/tui/screens/remove/useRemoveFlow.ts
index 2c8fea13..114fab96 100644
--- a/src/cli/tui/screens/remove/useRemoveFlow.ts
+++ b/src/cli/tui/screens/remove/useRemoveFlow.ts
@@ -34,6 +34,8 @@ function createDefaultProjectSpec(projectName: string): AgentCoreProjectSpec {
     agents: [],
     memories: [],
     credentials: [],
+    evaluators: [],
+    onlineEvalConfigs: [],
   };
 }
 
diff --git a/src/schema/schemas/agentcore-project.ts b/src/schema/schemas/agentcore-project.ts
index fda34160..d367ffb8 100644
--- a/src/schema/schemas/agentcore-project.ts
+++ b/src/schema/schemas/agentcore-project.ts
@@ -8,13 +8,20 @@
  */
 import { isReservedProjectName } from '../constants';
 import { AgentEnvSpecSchema } from './agent-env';
+import { EvaluationLevelSchema, EvaluatorConfigSchema, EvaluatorNameSchema } from './primitives/evaluator';
 import { DEFAULT_STRATEGY_NAMESPACES, MemoryStrategySchema, MemoryStrategyTypeSchema } from './primitives/memory';
+import { OnlineEvalConfigSchema } from './primitives/online-eval-config';
 import { uniqueBy } from './zod-util';
 import { z } from 'zod';
 
 // Re-export for convenience
 export { DEFAULT_STRATEGY_NAMESPACES, MemoryStrategySchema, MemoryStrategyTypeSchema };
+export { EvaluationLevelSchema };
 export type { MemoryStrategy, MemoryStrategyType } from './primitives/memory';
+export type { OnlineEvalConfig } from './primitives/online-eval-config';
+export { OnlineEvalConfigSchema, OnlineEvalConfigNameSchema } from './primitives/online-eval-config';
+export type { EvaluationLevel, EvaluatorConfig, LlmAsAJudgeConfig, RatingScale } from './primitives/evaluator';
+export { EvaluatorNameSchema } from './primitives/evaluator';
 
 // ============================================================================
 // Project Name Schema
@@ -112,42 +119,107 @@ export const CredentialSchema = z.discriminatedUnion('type', [ApiKeyCredentialSc
 export type Credential = z.infer<typeof CredentialSchema>;
 
 // ============================================================================
-// Project Schema (Top Level)
+// Evaluator Schema
 // ============================================================================
 
-export const AgentCoreProjectSpecSchema = z.object({
-  name: ProjectNameSchema,
-  version: z.number().int(),
+export const EvaluatorTypeSchema = z.literal('CustomEvaluator');
+export type EvaluatorType = z.infer<typeof EvaluatorTypeSchema>;
 
-  agents: z
-    .array(AgentEnvSpecSchema)
-    .default([])
-    .superRefine(
-      uniqueBy(
-        agent => agent.name,
-        name => `Duplicate agent name: ${name}`
-      )
-    ),
+export const EvaluatorSchema = z.object({
+  type: EvaluatorTypeSchema,
+  name: EvaluatorNameSchema,
+  level: EvaluationLevelSchema,
+  description: z.string().optional(),
+  config: EvaluatorConfigSchema,
+});
 
-  memories: z
-    .array(MemorySchema)
-    .default([])
-    .superRefine(
-      uniqueBy(
-        memory => memory.name,
-        name => `Duplicate memory name: ${name}`
-      )
-    ),
+export type Evaluator = z.infer<typeof EvaluatorSchema>;
 
-  credentials: z
-    .array(CredentialSchema)
-    .default([])
-    .superRefine(
-      uniqueBy(
-        credential => credential.name,
-        name => `Duplicate credential name: ${name}`
-      )
-    ),
-});
+// ============================================================================
+// Project Schema (Top Level)
+// ============================================================================
+
+const BUILTIN_EVALUATOR_PREFIX = 'Builtin.';
+
+export const AgentCoreProjectSpecSchema = z
+  .object({
+    name: ProjectNameSchema,
+    version: z.number().int(),
+
+    agents: z
+      .array(AgentEnvSpecSchema)
+      .default([])
+      .superRefine(
+        uniqueBy(
+          agent => agent.name,
+          name => `Duplicate agent name: ${name}`
+        )
+      ),
+
+    memories: z
+      .array(MemorySchema)
+      .default([])
+      .superRefine(
+        uniqueBy(
+          memory => memory.name,
+          name => `Duplicate memory name: ${name}`
+        )
+      ),
+
+    credentials: z
+      .array(CredentialSchema)
+      .default([])
+      .superRefine(
+        uniqueBy(
+          credential => credential.name,
+          name => `Duplicate credential name: ${name}`
+        )
+      ),
+
+    evaluators: z
+      .array(EvaluatorSchema)
+      .default([])
+      .superRefine(
+        uniqueBy(
+          evaluator => evaluator.name,
+          name => `Duplicate evaluator name: ${name}`
+        )
+      ),
+
+    onlineEvalConfigs: z
+      .array(OnlineEvalConfigSchema)
+      .default([])
+      .superRefine(
+        uniqueBy(
+          config => config.name,
+          name => `Duplicate online eval config name: ${name}`
+        )
+      ),
+  })
+  .superRefine((spec, ctx) => {
+    // Cross-field validation: onlineEvalConfigs reference valid agents and evaluators
+    const agentNames = new Set(spec.agents.map(a => a.name));
+    const evaluatorNames = new Set(spec.evaluators.map(e => e.name));
+
+    for (const config of spec.onlineEvalConfigs) {
+      for (const agentName of config.agents) {
+        if (!agentNames.has(agentName)) {
+          ctx.addIssue({
+            code: z.ZodIssueCode.custom,
+            message: `Online eval config "${config.name}" references unknown agent "${agentName}"`,
+          });
+        }
+      }
+
+      for (const evalName of config.evaluators) {
+        if (!evalName.startsWith(BUILTIN_EVALUATOR_PREFIX) && !evaluatorNames.has(evalName)) {
+          ctx.addIssue({
+            code: z.ZodIssueCode.custom,
+            message: `Online eval config "${config.name}" references unknown evaluator "${evalName}"`,
+          });
+        }
+      }
+    }
+  });
 
 export type AgentCoreProjectSpec = z.infer<typeof AgentCoreProjectSpecSchema>;
diff --git a/src/schema/schemas/primitives/evaluator.ts b/src/schema/schemas/primitives/evaluator.ts
new file mode 100644
index 00000000..632cfd85
--- /dev/null
+++ b/src/schema/schemas/primitives/evaluator.ts
@@ -0,0 +1,74 @@
+import { z } from 'zod';
+
+// ============================================================================
+// Evaluator Types
+// ============================================================================
+
+export const EvaluationLevelSchema = z.enum(['SESSION', 'TRACE', 'TOOL_CALL']);
+export type EvaluationLevel = z.infer<typeof EvaluationLevelSchema>;
+
+export const EvaluatorNameSchema = z
+  .string()
+  .min(1, 'Name is required')
+  .max(48)
+  .regex(
+    /^[a-zA-Z][a-zA-Z0-9_]{0,47}$/,
+    'Must begin with a letter and contain only alphanumeric characters and underscores (max 48 chars)'
+  );
+
+// ============================================================================
+// Rating Scale
+// ============================================================================
+
+export const NumericalRatingSchema = z.object({
+  value: z.number().int(),
+  label: z.string().min(1),
+  definition: z.string().min(1),
+});
+
+export type NumericalRating = z.infer<typeof NumericalRatingSchema>;
+
+export const CategoricalRatingSchema = z.object({
+  label: z.string().min(1),
+  definition: z.string().min(1),
+});
+
+export type CategoricalRating = z.infer<typeof CategoricalRatingSchema>;
+
+/**
+ * Rating scale with exactly one of `numerical` or `categorical`; both or neither fails validation.
+ */
+export const RatingScaleSchema = z
+  .object({
+    numerical: z.array(NumericalRatingSchema).optional(),
+    categorical: z.array(CategoricalRatingSchema).optional(),
+  })
+  .refine(
+    // XOR on presence; an empty array still counts as present because arrays are truthy.
+    ({ numerical, categorical }) => Boolean(numerical) !== Boolean(categorical),
+    { message: 'Rating scale must have either numerical or categorical, not both' }
+  );
+
+export type RatingScale = z.infer<typeof RatingScaleSchema>;
+
+// ============================================================================
+// LLM-as-a-Judge Config
+// ============================================================================
+
+export const LlmAsAJudgeConfigSchema = z.object({
+  model: z.string().min(1, 'Model ID is required'),
+  instructions: z.string().min(1, 'Evaluation instructions are required'),
+  ratingScale: RatingScaleSchema,
+});
+
+export type LlmAsAJudgeConfig = z.infer<typeof LlmAsAJudgeConfigSchema>;
+
+// ============================================================================
+// Evaluator Config
+// ============================================================================
+
+export const EvaluatorConfigSchema = z.object({
+  llmAsAJudge: LlmAsAJudgeConfigSchema,
+});
+
+export type EvaluatorConfig = z.infer<typeof EvaluatorConfigSchema>;
diff --git a/src/schema/schemas/primitives/index.ts b/src/schema/schemas/primitives/index.ts
index e7f572e8..7b29a435 100644
--- a/src/schema/schemas/primitives/index.ts
+++ b/src/schema/schemas/primitives/index.ts
@@ -5,3 +5,24 @@ export {
   MemoryStrategySchema,
   MemoryStrategyTypeSchema,
 } from './memory';
+
+export type {
+  EvaluationLevel,
+  EvaluatorConfig,
+  LlmAsAJudgeConfig,
+  RatingScale,
+  NumericalRating,
+  CategoricalRating,
+} from './evaluator';
+export {
+  EvaluationLevelSchema,
+  EvaluatorConfigSchema,
+  EvaluatorNameSchema,
+  LlmAsAJudgeConfigSchema,
+  RatingScaleSchema,
+  NumericalRatingSchema,
+  CategoricalRatingSchema,
+} from './evaluator';
+
+export type { OnlineEvalConfig } from './online-eval-config';
+export { OnlineEvalConfigSchema, OnlineEvalConfigNameSchema } from './online-eval-config';
diff --git a/src/schema/schemas/primitives/online-eval-config.ts b/src/schema/schemas/primitives/online-eval-config.ts
new file mode 100644
index 00000000..ea6ef95c
--- /dev/null
+++ b/src/schema/schemas/primitives/online-eval-config.ts
@@ -0,0 +1,29 @@
+import { z } from 'zod';
+
+// ============================================================================
+// Online Eval Config Types
+// ============================================================================
+
+export const OnlineEvalConfigNameSchema = z
+  .string()
+  .min(1, 'Name is required')
+  .max(48)
+  .regex(
+    /^[a-zA-Z][a-zA-Z0-9_]{0,47}$/,
+    'Must begin with a letter and contain only alphanumeric characters and underscores (max 48 chars)'
+  );
+
+export const OnlineEvalConfigSchema = z.object({
+  type: z.literal('OnlineEvalConfig'),
+  name: OnlineEvalConfigNameSchema,
+  /** Agent names this online eval config monitors */
+  agents: z.array(z.string().min(1)).min(1, 'At least one agent is required'),
+  /** Evaluator names (custom) or Builtin.* IDs */
+  evaluators: z.array(z.string().min(1)).min(1, 'At least one evaluator is required'),
+  /** Sampling rate as a percentage (0.01 to 100) */
+  samplingRate: z.number().min(0.01).max(100),
+  /** Whether to start the pipeline immediately on deploy */
+  enableOnCreate: z.boolean().default(true),
+});
+
+export type OnlineEvalConfig = z.infer<typeof OnlineEvalConfigSchema>;

From 674c6b1088b88b18a97e3a00e71342c22ae0fb32 Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Wed, 11 Mar 2026 20:25:40 -0400
Subject: [PATCH 2/9] feat: add functionality to run evaluation and online
 evals

---
 src/cli/aws/agentcore-control.ts              |  49 ++-
 src/cli/aws/agentcore.ts                      | 104 +++++
 src/cli/cli.ts                                |   8 +
 .../__tests__/outputs-extended.test.ts        | 168 +++++++-
 src/cli/cloudformation/outputs.ts             |  96 ++++-
 src/cli/commands/deploy/actions.ts            |  12 +
 src/cli/commands/eval/command.tsx             | 138 +++++++
 src/cli/commands/eval/index.ts                |   1 +
 src/cli/commands/index.ts                     |   4 +
 src/cli/commands/logs/command.tsx             |  36 +-
 src/cli/commands/pause/command.tsx            |  41 ++
 src/cli/commands/pause/index.ts               |   1 +
 src/cli/commands/resume/command.tsx           |  41 ++
 src/cli/commands/resume/index.ts              |   1 +
 src/cli/commands/run/command.tsx              |  94 +++++
 src/cli/commands/run/index.ts                 |   1 +
 .../commands/status/__tests__/action.test.ts  | 110 +++++
 src/cli/commands/status/action.ts             |  23 +-
 src/cli/operations/eval/get-eval-run.ts       |  17 +
 src/cli/operations/eval/index.ts              |  18 +
 src/cli/operations/eval/list-eval-runs.ts     |  26 ++
 src/cli/operations/eval/logs-eval.ts          | 140 +++++++
 src/cli/operations/eval/pause-resume.ts       |  72 ++++
 src/cli/operations/eval/run-eval.ts           | 387 ++++++++++++++++++
 src/cli/operations/eval/storage.ts            |  54 +++
 src/cli/operations/eval/types.ts              |  63 +++
 .../primitives/OnlineEvalConfigPrimitive.ts   |   4 +-
 src/cli/tui/App.tsx                           |   8 +
 src/cli/tui/copy.ts                           |   4 +
 src/cli/tui/hooks/useCreateOnlineEval.ts      |   2 -
 .../tui/screens/cli-only/CliOnlyScreen.tsx    |  28 ++
 src/cli/tui/screens/cli-only/index.ts         |   1 +
 src/cli/tui/screens/deploy/useDeployFlow.ts   |  12 +
 src/cli/tui/screens/eval/EvalScreen.tsx       |  88 ++++
 src/cli/tui/screens/eval/index.ts             |   1 +
 .../screens/evaluator/AddEvaluatorScreen.tsx  |  24 +-
 src/cli/tui/screens/evaluator/types.ts        |   4 +-
 src/cli/tui/screens/online-eval/types.ts      |   1 -
 .../online-eval/useAddOnlineEvalWizard.ts     |   1 -
 src/cli/tui/utils/commands.ts                 |   2 +-
 src/schema/schemas/deployed-state.ts          |  25 ++
 .../schemas/primitives/online-eval-config.ts  |   4 +-
 42 files changed, 1878 insertions(+), 36 deletions(-)
 create mode 100644 src/cli/commands/eval/command.tsx
 create mode 100644 src/cli/commands/eval/index.ts
 create mode 100644 src/cli/commands/pause/command.tsx
 create mode 100644 src/cli/commands/pause/index.ts
 create mode 100644 src/cli/commands/resume/command.tsx
 create mode 100644 src/cli/commands/resume/index.ts
 create mode 100644 src/cli/commands/run/command.tsx
 create mode 100644 src/cli/commands/run/index.ts
 create mode 100644 src/cli/operations/eval/get-eval-run.ts
 create mode 100644 src/cli/operations/eval/index.ts
 create mode 100644 src/cli/operations/eval/list-eval-runs.ts
 create mode 100644 src/cli/operations/eval/logs-eval.ts
 create mode 100644 src/cli/operations/eval/pause-resume.ts
 create mode 100644 src/cli/operations/eval/run-eval.ts
 create mode 100644 src/cli/operations/eval/storage.ts
 create mode 100644 src/cli/operations/eval/types.ts
 create mode 100644 src/cli/tui/screens/cli-only/CliOnlyScreen.tsx
 create mode 100644 src/cli/tui/screens/cli-only/index.ts
 create mode 100644 src/cli/tui/screens/eval/EvalScreen.tsx
 create mode 100644 src/cli/tui/screens/eval/index.ts

diff --git a/src/cli/aws/agentcore-control.ts b/src/cli/aws/agentcore-control.ts
index 84ba4766..4ef9ff64 100644
--- a/src/cli/aws/agentcore-control.ts
+++ b/src/cli/aws/agentcore-control.ts
@@ -1,5 +1,9 @@
 import { getCredentialProvider } from './account';
-import { BedrockAgentCoreControlClient, GetAgentRuntimeCommand } from '@aws-sdk/client-bedrock-agentcore-control';
+import {
+  BedrockAgentCoreControlClient,
+  GetAgentRuntimeCommand,
+  UpdateOnlineEvaluationConfigCommand,
+} from '@aws-sdk/client-bedrock-agentcore-control';
 
 export interface GetAgentRuntimeStatusOptions {
   region: string;
@@ -35,3 +39,46 @@ export async function getAgentRuntimeStatus(options: GetAgentRuntimeStatusOption
     status: response.status,
   };
 }
+
+// ============================================================================
+// Online Eval Config
+// ============================================================================
+
+export type OnlineEvalExecutionStatus = 'ENABLED' | 'DISABLED';
+
+export interface UpdateOnlineEvalStatusOptions {
+  region: string;
+  onlineEvaluationConfigId: string;
+  executionStatus: OnlineEvalExecutionStatus;
+}
+
+export interface UpdateOnlineEvalStatusResult {
+  configId: string;
+  executionStatus: string;
+  status: string;
+}
+
+/**
+ * Update the execution status of an online evaluation config (pause/resume).
+ *
+ * Fields missing from the service response fall back to the requested values
+ * (`configId`, `executionStatus`) or to 'UNKNOWN' (`status`).
+ */
+export async function updateOnlineEvalExecutionStatus(
+  options: UpdateOnlineEvalStatusOptions
+): Promise<UpdateOnlineEvalStatusResult> {
+  const { region, onlineEvaluationConfigId, executionStatus } = options;
+  const client = new BedrockAgentCoreControlClient({
+    region,
+    credentials: getCredentialProvider(),
+  });
+
+  const command = new UpdateOnlineEvaluationConfigCommand({ onlineEvaluationConfigId, executionStatus });
+  const response = await client.send(command);
+
+  return {
+    configId: response.onlineEvaluationConfigId ?? onlineEvaluationConfigId,
+    executionStatus: response.executionStatus ?? executionStatus,
+    status: response.status ?? 'UNKNOWN',
+  };
+}
diff --git a/src/cli/aws/agentcore.ts b/src/cli/aws/agentcore.ts
index 8baf9f72..e84bd6a1 100644
--- a/src/cli/aws/agentcore.ts
+++ b/src/cli/aws/agentcore.ts
@@ -1,9 +1,11 @@
 import { getCredentialProvider } from './account';
 import {
   BedrockAgentCoreClient,
+  EvaluateCommand,
   InvokeAgentRuntimeCommand,
   StopRuntimeSessionCommand,
 } from '@aws-sdk/client-bedrock-agentcore';
+import type { DocumentType } from '@smithy/types';
 
 /** Logger interface for SSE events */
 export interface SSELogger {
@@ -234,6 +236,108 @@ export async function invokeAgentRuntime(options: InvokeAgentRuntimeOptions): Pr
   };
 }
 
+// ============================================================================
+// Evaluate
+// ============================================================================
+
+export interface EvaluateOptions {
+  region: string;
+  evaluatorId: string;
+  sessionSpans: DocumentType[];
+  targetSpanIds?: string[];
+  targetTraceIds?: string[];
+}
+
+export interface EvaluationResultContext {
+  sessionId: string | undefined;
+  traceId: string | undefined;
+  spanId: string | undefined;
+}
+
+export interface EvaluationResultTokenUsage {
+  inputTokens: number;
+  outputTokens: number;
+  totalTokens: number;
+}
+
+export interface EvaluationResult {
+  evaluatorArn: string | undefined;
+  evaluatorId: string | undefined;
+  evaluatorName: string | undefined;
+  explanation: string | undefined;
+  value: number | undefined;
+  label: string | undefined;
+  errorMessage: string | undefined;
+  errorCode: string | undefined;
+  context: EvaluationResultContext | undefined;
+  tokenUsage: EvaluationResultTokenUsage | undefined;
+}
+
+export interface EvaluateResult {
+  evaluationResults: EvaluationResult[];
+}
+
+/**
+ * Run on-demand evaluation of agent traces using a specified evaluator.
+ *
+ * When both targets are supplied, `targetSpanIds` wins over `targetTraceIds`; with neither,
+ * the request is sent without an `evaluationTarget`.
+ *
+ * @throws Error when the response carries no `evaluationResults`.
+ */
+export async function evaluate(options: EvaluateOptions): Promise<EvaluateResult> {
+  const { region, evaluatorId, sessionSpans, targetSpanIds, targetTraceIds } = options;
+  const client = new BedrockAgentCoreClient({ region, credentials: getCredentialProvider() });
+
+  const evaluationTarget = targetSpanIds
+    ? { spanIds: targetSpanIds }
+    : targetTraceIds
+      ? { traceIds: targetTraceIds }
+      : undefined;
+
+  const response = await client.send(
+    new EvaluateCommand({
+      evaluatorId,
+      evaluationInput: { sessionSpans },
+      ...(evaluationTarget ? { evaluationTarget } : {}),
+    })
+  );
+
+  const rawResults = response.evaluationResults;
+  if (!rawResults) {
+    throw new Error('No evaluation results returned');
+  }
+
+  const evaluationResults = rawResults.map(raw => {
+    // The context may lack a spanContext member; the mapped context is undefined in that case.
+    const spanContext = raw.context && 'spanContext' in raw.context ? raw.context.spanContext : undefined;
+    const { tokenUsage } = raw;
+
+    return {
+      evaluatorArn: raw.evaluatorArn,
+      evaluatorId: raw.evaluatorId,
+      evaluatorName: raw.evaluatorName,
+      explanation: raw.explanation,
+      value: raw.value,
+      label: raw.label,
+      errorMessage: raw.errorMessage,
+      errorCode: raw.errorCode,
+      context: spanContext
+        ? { sessionId: spanContext.sessionId, traceId: spanContext.traceId, spanId: spanContext.spanId }
+        : undefined,
+      tokenUsage: tokenUsage
+        ? {
+            inputTokens: tokenUsage.inputTokens ?? 0,
+            outputTokens: tokenUsage.outputTokens ?? 0,
+            totalTokens: tokenUsage.totalTokens ?? 0,
+          }
+        : undefined,
+    };
+  });
+
+  return { evaluationResults };
+}
+
 /**
  * Stop a runtime session.
  */
diff --git a/src/cli/cli.ts b/src/cli/cli.ts
index 4d992ad7..621e0ef4 100644
--- a/src/cli/cli.ts
+++ b/src/cli/cli.ts
@@ -2,11 +2,15 @@ import { registerAdd } from './commands/add';
 import { registerCreate } from './commands/create';
 import { registerDeploy } from './commands/deploy';
 import { registerDev } from './commands/dev';
+import { registerEval } from './commands/eval';
 import { registerHelp } from './commands/help';
 import { registerInvoke } from './commands/invoke';
 import { registerLogs } from './commands/logs';
 import { registerPackage } from './commands/package';
+import { registerPause } from './commands/pause';
 import { registerRemove } from './commands/remove';
+import { registerResume } from './commands/resume';
+import { registerRun } from './commands/run';
 import { registerStatus } from './commands/status';
 import { registerTraces } from './commands/traces';
 import { registerUpdate } from './commands/update';
@@ -130,11 +134,15 @@ export function registerCommands(program: Command) {
   registerDev(program);
   registerDeploy(program);
   registerCreate(program);
+  registerEval(program);
   registerHelp(program);
   registerInvoke(program);
   registerLogs(program);
   registerPackage(program);
+  registerPause(program);
   const removeCmd = registerRemove(program);
+  registerResume(program);
+  registerRun(program);
   registerStatus(program);
   registerTraces(program);
   registerUpdate(program);
diff --git a/src/cli/cloudformation/__tests__/outputs-extended.test.ts b/src/cli/cloudformation/__tests__/outputs-extended.test.ts
index 85aab1c8..16112c58 100644
--- a/src/cli/cloudformation/__tests__/outputs-extended.test.ts
+++ b/src/cli/cloudformation/__tests__/outputs-extended.test.ts
@@ -1,4 +1,4 @@
-import { buildDeployedState, parseAgentOutputs } from '../outputs.js';
+import { buildDeployedState, parseAgentOutputs, parseEvaluatorOutputs, parseOnlineEvalOutputs } from '../outputs.js';
 import type { StackOutputs } from '../outputs.js';
 import { describe, expect, it } from 'vitest';
 
@@ -233,4 +233,170 @@ describe('buildDeployedState', () => {
     const state = buildDeployedState({ targetName: 'default', stackName: 'Stack', agents: {}, gateways: {} });
     expect(state.targets.default!.resources?.agents).toBeUndefined();
   });
+
+  it('includes evaluators in deployed state when provided', () => {
+    const evaluators = {
+      MyEval: {
+        evaluatorId: 'proj_MyEval-abc',
+        evaluatorArn: 'arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc',
+      },
+    };
+
+    const state = buildDeployedState({
+      targetName: 'default',
+      stackName: 'Stack',
+      agents: {},
+      gateways: {},
+      evaluators,
+    });
+    expect(state.targets.default!.resources?.evaluators).toEqual(evaluators);
+  });
+
+  it('omits evaluators from deployed state when empty', () => {
+    const state = buildDeployedState({
+      targetName: 'default',
+      stackName: 'Stack',
+      agents: {},
+      gateways: {},
+      evaluators: {},
+    });
+    expect(state.targets.default!.resources?.evaluators).toBeUndefined();
+  });
+
+  it('includes onlineEvalConfigs in deployed state when provided', () => {
+    const onlineEvalConfigs = {
+      TestConfig: {
+        onlineEvaluationConfigId: 'proj_TestConfig-xyz',
+        onlineEvaluationConfigArn: 'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz',
+      },
+    };
+
+    const state = buildDeployedState({
+      targetName: 'default',
+      stackName: 'Stack',
+      agents: {},
+      gateways: {},
+      onlineEvalConfigs,
+    });
+    expect(state.targets.default!.resources?.onlineEvalConfigs).toEqual(onlineEvalConfigs);
+  });
+
+  it('omits onlineEvalConfigs from deployed state when empty', () => {
+    const state = buildDeployedState({
+      targetName: 'default',
+      stackName: 'Stack',
+      agents: {},
+      gateways: {},
+      onlineEvalConfigs: {},
+    });
+    expect(state.targets.default!.resources?.onlineEvalConfigs).toBeUndefined();
+  });
+});
+
+describe('parseEvaluatorOutputs', () => {
+  it('parses evaluator Id and Arn from stack outputs', () => {
+    const outputs: StackOutputs = {
+      ApplicationEvaluatorMyEvalIdOutputABC123: 'proj_MyEval-abc',
+      ApplicationEvaluatorMyEvalArnOutputDEF456: 'arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc',
+    };
+
+    const result = parseEvaluatorOutputs(outputs, ['MyEval']);
+    expect(result.MyEval).toBeDefined();
+    expect(result.MyEval!.evaluatorId).toBe('proj_MyEval-abc');
+    expect(result.MyEval!.evaluatorArn).toBe('arn:aws:bedrock:us-east-1:123:evaluator/proj_MyEval-abc');
+  });
+
+  it('parses multiple evaluators', () => {
+    const outputs: StackOutputs = {
+      ApplicationEvaluatorEvalAIdOutputA: 'id-a',
+      ApplicationEvaluatorEvalAArnOutputB: 'arn:a',
+      ApplicationEvaluatorEvalBIdOutputC: 'id-b',
+      ApplicationEvaluatorEvalBArnOutputD: 'arn:b',
+    };
+
+    const result = parseEvaluatorOutputs(outputs, ['EvalA', 'EvalB']);
+    expect(Object.keys(result)).toHaveLength(2);
+    expect(result.EvalA!.evaluatorId).toBe('id-a');
+    expect(result.EvalB!.evaluatorId).toBe('id-b');
+  });
+
+  it('skips evaluator when Id output is missing', () => {
+    const outputs: StackOutputs = {
+      ApplicationEvaluatorMyEvalArnOutputDEF456: 'arn:eval',
+    };
+
+    const result = parseEvaluatorOutputs(outputs, ['MyEval']);
+    expect(result.MyEval).toBeUndefined();
+  });
+
+  it('skips evaluator when Arn output is missing', () => {
+    const outputs: StackOutputs = {
+      ApplicationEvaluatorMyEvalIdOutputABC123: 'eval-id',
+    };
+
+    const result = parseEvaluatorOutputs(outputs, ['MyEval']);
+    expect(result.MyEval).toBeUndefined();
+  });
+
+  it('returns empty record for no matching outputs', () => {
+    const result = parseEvaluatorOutputs({ UnrelatedOutput: 'value' }, ['MyEval']);
+    expect(result).toEqual({});
+  });
+
+  it('maps PascalCase output keys back to original underscore names', () => {
+    // Evaluator name "my_eval" becomes "MyEval" in PascalCase
+    const outputs: StackOutputs = {
+      ApplicationEvaluatorMyEvalIdOutputA: 'id-1',
+      ApplicationEvaluatorMyEvalArnOutputB: 'arn:1',
+    };
+
+    const result = parseEvaluatorOutputs(outputs, ['my_eval']);
+    expect(result.my_eval).toBeDefined();
+    expect(result.my_eval!.evaluatorId).toBe('id-1');
+  });
+});
+
+describe('parseOnlineEvalOutputs', () => {
+  it('parses online eval config Id and Arn from stack outputs', () => {
+    const outputs: StackOutputs = {
+      ApplicationOnlineEvalTestConfigIdOutputABC: 'proj_TestConfig-xyz',
+      ApplicationOnlineEvalTestConfigArnOutputDEF:
+        'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz',
+    };
+
+    const result = parseOnlineEvalOutputs(outputs, ['TestConfig']);
+    expect(result.TestConfig).toBeDefined();
+    expect(result.TestConfig!.onlineEvaluationConfigId).toBe('proj_TestConfig-xyz');
+    expect(result.TestConfig!.onlineEvaluationConfigArn).toBe(
+      'arn:aws:bedrock:us-east-1:123:online-evaluation-config/proj_TestConfig-xyz'
+    );
+  });
+
+  it('parses multiple online eval configs', () => {
+    const outputs: StackOutputs = {
+      ApplicationOnlineEvalConfigAIdOutputA: 'id-a',
+      ApplicationOnlineEvalConfigAArnOutputB: 'arn:a',
+      ApplicationOnlineEvalConfigBIdOutputC: 'id-b',
+      ApplicationOnlineEvalConfigBArnOutputD: 'arn:b',
+    };
+
+    const result = parseOnlineEvalOutputs(outputs, ['ConfigA', 'ConfigB']);
+    expect(Object.keys(result)).toHaveLength(2);
+    expect(result.ConfigA!.onlineEvaluationConfigId).toBe('id-a');
+    expect(result.ConfigB!.onlineEvaluationConfigId).toBe('id-b');
+  });
+
+  it('skips config when Id output is missing', () => {
+    const outputs: StackOutputs = {
+      ApplicationOnlineEvalTestConfigArnOutputDEF: 'arn:config',
+    };
+
+    const result = parseOnlineEvalOutputs(outputs, ['TestConfig']);
+    expect(result.TestConfig).toBeUndefined();
+  });
+
+  it('returns empty record for empty outputs', () => {
+    const result = parseOnlineEvalOutputs({}, ['TestConfig']);
+    expect(result).toEqual({});
+  });
 });
diff --git a/src/cli/cloudformation/outputs.ts b/src/cli/cloudformation/outputs.ts
index 86ec368f..073fc05a 100644
--- a/src/cli/cloudformation/outputs.ts
+++ b/src/cli/cloudformation/outputs.ts
@@ -1,4 +1,11 @@
-import type { AgentCoreDeployedState, DeployedState, MemoryDeployedState, TargetDeployedState } from '../../schema';
+import type {
+  AgentCoreDeployedState,
+  DeployedState,
+  EvaluatorDeployedState,
+  MemoryDeployedState,
+  OnlineEvalDeployedState,
+  TargetDeployedState,
+} from '../../schema';
 import { getCredentialProvider } from '../aws';
 import { toPascalId } from './logical-ids';
 import { getStackName } from './stack-discovery';
@@ -202,6 +209,68 @@ export function parseMemoryOutputs(outputs: StackOutputs, memoryNames: string[])
   return memories;
 }
 
+/**
+ * Extract the deployed ID and ARN of each named evaluator from the stack outputs.
+ *
+ * Output keys are matched by prefix, following the pattern
+ * ApplicationEvaluator{PascalName}(Id|Arn)Output{Hash}. An evaluator appears in the
+ * result only when both its Id and Arn outputs are present.
+ */
+export function parseEvaluatorOutputs(
+  outputs: StackOutputs,
+  evaluatorNames: string[]
+): Record<string, EvaluatorDeployedState> {
+  const keys = Object.keys(outputs);
+  // First output key starting with the given prefix, if any.
+  const keyWithPrefix = (prefix: string) => keys.find(key => key.startsWith(prefix));
+
+  return evaluatorNames.reduce<Record<string, EvaluatorDeployedState>>((deployed, name) => {
+    const stem = `Application${toPascalId('Evaluator', name)}`;
+    const idKey = keyWithPrefix(`${stem}IdOutput`);
+    const arnKey = keyWithPrefix(`${stem}ArnOutput`);
+
+    // Leave out evaluators whose outputs are incomplete.
+    if (!idKey || !arnKey) return deployed;
+
+    deployed[name] = {
+      evaluatorId: outputs[idKey]!,
+      evaluatorArn: outputs[arnKey]!,
+    };
+    return deployed;
+  }, {});
+}
+
+/**
+ * Extract the deployed ID and ARN of each named online evaluation config from the stack outputs.
+ *
+ * Output keys are matched by prefix, following the pattern
+ * ApplicationOnlineEval{PascalName}(Id|Arn)Output{Hash}. A config appears in the
+ * result only when both its Id and Arn outputs are present.
+ */
+export function parseOnlineEvalOutputs(
+  outputs: StackOutputs,
+  onlineEvalNames: string[]
+): Record<string, OnlineEvalDeployedState> {
+  const keys = Object.keys(outputs);
+  // First output key starting with the given prefix, if any.
+  const keyWithPrefix = (prefix: string) => keys.find(key => key.startsWith(prefix));
+
+  return onlineEvalNames.reduce<Record<string, OnlineEvalDeployedState>>((deployed, name) => {
+    const stem = `Application${toPascalId('OnlineEval', name)}`;
+    const idKey = keyWithPrefix(`${stem}IdOutput`);
+    const arnKey = keyWithPrefix(`${stem}ArnOutput`);
+
+    // Leave out configs whose outputs are incomplete.
+    if (!idKey || !arnKey) return deployed;
+
+    deployed[name] = {
+      onlineEvaluationConfigId: outputs[idKey]!,
+      onlineEvaluationConfigArn: outputs[arnKey]!,
+    };
+    return deployed;
+  }, {});
+}
+
 export interface BuildDeployedStateOptions {
   targetName: string;
   stackName: string;
@@ -211,13 +280,26 @@ export interface BuildDeployedStateOptions {
   identityKmsKeyArn?: string;
   credentials?: Record<string, { credentialProviderArn: string; clientSecretArn?: string; callbackUrl?: string }>;
   memories?: Record<string, MemoryDeployedState>;
+  evaluators?: Record<string, EvaluatorDeployedState>;
+  onlineEvalConfigs?: Record<string, OnlineEvalDeployedState>;
 }
 
 /**
  * Build deployed state from stack outputs.
  */
 export function buildDeployedState(opts: BuildDeployedStateOptions): DeployedState {
-  const { targetName, stackName, agents, gateways, existingState, identityKmsKeyArn, credentials, memories } = opts;
+  const {
+    targetName,
+    stackName,
+    agents,
+    gateways,
+    existingState,
+    identityKmsKeyArn,
+    credentials,
+    memories,
+    evaluators,
+    onlineEvalConfigs,
+  } = opts;
   const targetState: TargetDeployedState = {
     resources: {
       agents: Object.keys(agents).length > 0 ? agents : undefined,
@@ -239,6 +321,16 @@ export function buildDeployedState(opts: BuildDeployedStateOptions): DeployedSta
     targetState.resources!.credentials = credentials;
   }
 
+  // Add evaluator state if evaluators exist
+  if (evaluators && Object.keys(evaluators).length > 0) {
+    targetState.resources!.evaluators = evaluators;
+  }
+
+  // Add online eval config state if configs exist
+  if (onlineEvalConfigs && Object.keys(onlineEvalConfigs).length > 0) {
+    targetState.resources!.onlineEvalConfigs = onlineEvalConfigs;
+  }
+
   return {
     targets: {
       ...existingState?.targets,
diff --git a/src/cli/commands/deploy/actions.ts b/src/cli/commands/deploy/actions.ts
index 721a050a..6289d2d2 100644
--- a/src/cli/commands/deploy/actions.ts
+++ b/src/cli/commands/deploy/actions.ts
@@ -6,8 +6,10 @@ import {
   buildDeployedState,
   getStackOutputs,
   parseAgentOutputs,
+  parseEvaluatorOutputs,
   parseGatewayOutputs,
   parseMemoryOutputs,
+  parseOnlineEvalOutputs,
 } from '../../cloudformation';
 import { getErrorMessage } from '../../errors';
 import { ExecLogger } from '../../logging';
@@ -374,6 +376,14 @@ export async function handleDeploy(options: ValidatedDeployOptions): Promise<Dep
       );
     }
 
+    // Parse evaluator outputs
+    const evaluatorNames = (context.projectSpec.evaluators ?? []).map(e => e.name);
+    const evaluators = parseEvaluatorOutputs(outputs, evaluatorNames);
+
+    // Parse online eval config outputs
+    const onlineEvalNames = (context.projectSpec.onlineEvalConfigs ?? []).map(c => c.name);
+    const onlineEvalConfigs = parseOnlineEvalOutputs(outputs, onlineEvalNames);
+
     // Parse gateway outputs
     const gatewaySpecs =
       mcpSpec?.agentCoreGateways?.reduce(
@@ -395,6 +405,8 @@ export async function handleDeploy(options: ValidatedDeployOptions): Promise<Dep
       identityKmsKeyArn,
       credentials: deployedCredentials,
       memories,
+      evaluators,
+      onlineEvalConfigs,
     });
     await configIO.writeDeployedState(deployedState);
 
diff --git a/src/cli/commands/eval/command.tsx b/src/cli/commands/eval/command.tsx
new file mode 100644
index 00000000..4184b1cb
--- /dev/null
+++ b/src/cli/commands/eval/command.tsx
@@ -0,0 +1,138 @@
+import { getErrorMessage } from '../../errors';
+import { handleGetEvalRun, handleListEvalRuns } from '../../operations/eval';
+import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
+import { requireProject } from '../../tui/guards';
+import type { Command } from '@commander-js/extra-typings';
+import { Text, render } from 'ink';
+import React from 'react';
+
+/** Register the `eval` command group: `eval list` and `eval get <runId>` for inspecting past eval runs. */
+export const registerEval = (program: Command) => {
+  const evalCmd = program.command('eval').description(COMMAND_DESCRIPTIONS.eval);
+
+  evalCmd
+    .command('list')
+    .description('List past eval runs')
+    .option('-a, --agent <name>', 'Filter by agent name')
+    .option('-n, --limit <count>', 'Maximum number of runs to show')
+    .option('--json', 'Output as JSON')
+    .action((cliOptions: { agent?: string; limit?: string; json?: boolean }) => {
+      requireProject();
+
+      try {
+        // parseInt yields NaN for non-numeric input; reject it here instead of passing it on.
+        // Throwing inside the try lets the catch below report it in the requested output format.
+        const limit = cliOptions.limit ? parseInt(cliOptions.limit, 10) : undefined;
+        if (limit !== undefined && (Number.isNaN(limit) || limit < 0)) {
+          throw new Error(`Invalid --limit value "${cliOptions.limit}": expected a non-negative integer`);
+        }
+
+        const result = handleListEvalRuns({
+          agent: cliOptions.agent,
+          limit,
+          json: cliOptions.json,
+        });
+
+        if (cliOptions.json) {
+          console.log(JSON.stringify(result));
+          process.exit(result.success ? 0 : 1);
+          return;
+        }
+
+        if (!result.success) {
+          render(<Text color="red">{result.error}</Text>);
+          process.exit(1);
+        }
+
+        const runs = result.runs ?? [];
+        if (runs.length === 0) {
+          console.log('No eval runs found. Run `agentcore run eval` to create one.');
+          return;
+        }
+
+        console.log(
+          `\n${'Run ID'.padEnd(42)} ${'Agent'.padEnd(20)} ${'Evaluators'.padEnd(30)} ${'Sessions'.padEnd(10)} Date`
+        );
+        console.log('─'.repeat(120));
+
+        for (const run of runs) {
+          const scores = run.results.map(r => `${r.evaluator}=${r.aggregateScore.toFixed(2)}`).join(', ');
+          const date = new Date(run.timestamp).toLocaleDateString();
+          console.log(
+            `${run.runId.padEnd(42)} ${run.agent.padEnd(20)} ${scores.padEnd(30)} ${String(run.sessionCount).padEnd(10)} ${date}`
+          );
+        }
+        console.log('');
+      } catch (error) {
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+        } else {
+          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+
+  evalCmd
+    .command('get')
+    .description('Get details of a specific eval run')
+    .argument('<runId>', 'Eval run ID')
+    .option('--sessions', 'Show per-session score breakdown')
+    .option('--json', 'Output as JSON')
+    .action((runId: string, cliOptions: { sessions?: boolean; json?: boolean }) => {
+      requireProject();
+
+      try {
+        const result = handleGetEvalRun({ runId, sessions: cliOptions.sessions, json: cliOptions.json });
+
+        if (cliOptions.json) {
+          console.log(JSON.stringify(result));
+          process.exit(result.success ? 0 : 1);
+          return;
+        }
+
+        if (!result.success) {
+          render(<Text color="red">{result.error}</Text>);
+          process.exit(1);
+        }
+
+        const run = result.run!;
+        console.log(`\nEval Run: ${run.runId}`);
+        console.log(`Agent: ${run.agent}`);
+        console.log(`Date: ${new Date(run.timestamp).toISOString()}`);
+        console.log(`Sessions: ${run.sessionCount} | Lookback: ${run.lookbackDays}d\n`);
+
+        for (const r of run.results) {
+          const errors = r.sessionScores.filter(s => s.errorMessage).length;
+          console.log(`  ${r.evaluator}: ${r.aggregateScore.toFixed(2)}${errors > 0 ? ` (${errors} errors)` : ''}`);
+
+          if (r.tokenUsage) {
+            console.log(
+              `    Tokens: ${r.tokenUsage.totalTokens} (in: ${r.tokenUsage.inputTokens}, out: ${r.tokenUsage.outputTokens})`
+            );
+          }
+
+          if (cliOptions.sessions) {
+            console.log('');
+            for (const s of r.sessionScores) {
+              const status = s.errorMessage
+                ? `ERROR: ${s.errorMessage}`
+                : `${s.value.toFixed(2)}${s.label ? ` (${s.label})` : ''}`;
+              console.log(`    session=${s.sessionId}  ${status}`);
+              if (s.explanation) {
+                console.log(`      ${s.explanation}`);
+              }
+            }
+          }
+          console.log('');
+        }
+      } catch (error) {
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+        } else {
+          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+};
diff --git a/src/cli/commands/eval/index.ts b/src/cli/commands/eval/index.ts
new file mode 100644
index 00000000..5a761e17
--- /dev/null
+++ b/src/cli/commands/eval/index.ts
@@ -0,0 +1 @@
+export { registerEval } from './command';
diff --git a/src/cli/commands/index.ts b/src/cli/commands/index.ts
index 3e1fd854..3dac1c82 100644
--- a/src/cli/commands/index.ts
+++ b/src/cli/commands/index.ts
@@ -3,9 +3,13 @@ export { registerAdd } from './add';
 export { registerDeploy } from './deploy';
 export { registerDev } from './dev';
 export { registerCreate } from './create';
+export { registerEval } from './eval';
 export { registerInvoke } from './invoke';
 export { registerPackage } from './package';
+export { registerPause } from './pause';
 export { registerRemove } from './remove';
+export { registerResume } from './resume';
+export { registerRun } from './run';
 export { registerStatus } from './status';
 export { registerTraces } from './traces';
 export { registerUpdate } from './update';
diff --git a/src/cli/commands/logs/command.tsx b/src/cli/commands/logs/command.tsx
index 977042cd..282aed81 100644
--- a/src/cli/commands/logs/command.tsx
+++ b/src/cli/commands/logs/command.tsx
@@ -1,15 +1,24 @@
 import { getErrorMessage } from '../../errors';
+import { handleLogsEval } from '../../operations/eval';
+import type { LogsEvalOptions } from '../../operations/eval';
 import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
 import { requireProject } from '../../tui/guards';
 import { handleLogs } from './action';
 import type { LogsOptions } from './types';
 import type { Command } from '@commander-js/extra-typings';
 import { Text, render } from 'ink';
+import React from 'react';
 
 export const registerLogs = (program: Command) => {
-  program
+  // enablePositionalOptions + passThroughOptions ensure options like --since and --agent
+  // are passed to the 'eval' subcommand rather than being consumed by the parent 'logs' command.
+  program.enablePositionalOptions();
+
+  const logsCmd = program
     .command('logs')
     .alias('l')
+    .enablePositionalOptions()
+    .passThroughOptions()
     .description(COMMAND_DESCRIPTIONS.logs)
     .option('--agent <name>', 'Select specific agent')
     .option('--since <time>', 'Start time — defaults to 1h ago in search mode (e.g. "1h", "30m", "2d", ISO 8601)')
@@ -24,6 +33,31 @@ export const registerLogs = (program: Command) => {
       try {
         const result = await handleLogs(cliOptions);
 
+        if (!result.success) {
+          render(<Text color="red">{result.error}</Text>);
+          process.exit(1);
+        }
+      } catch (error) {
+        render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        process.exit(1);
+      }
+    });
+
+  logsCmd
+    .command('eval')
+    .description('Stream or search online eval logs')
+    .option('-a, --agent <name>', 'Select specific agent')
+    .option('--since <time>', 'Start time (e.g. "1h", "30m", "2d", ISO 8601)')
+    .option('--until <time>', 'End time (e.g. "now", ISO 8601)')
+    .option('-n, --lines <count>', 'Maximum number of log lines')
+    .option('-f, --follow', 'Stream logs in real-time (default when no --since/--until)')
+    .option('--json', 'Output as JSON Lines')
+    .action(async (cliOptions: LogsEvalOptions) => {
+      requireProject();
+
+      try {
+        const result = await handleLogsEval(cliOptions);
+
         if (!result.success) {
           render(<Text color="red">{result.error}</Text>);
           process.exit(1);
diff --git a/src/cli/commands/pause/command.tsx b/src/cli/commands/pause/command.tsx
new file mode 100644
index 00000000..aaaaf76a
--- /dev/null
+++ b/src/cli/commands/pause/command.tsx
@@ -0,0 +1,41 @@
+import { getErrorMessage } from '../../errors';
+import { handlePauseResume } from '../../operations/eval';
+import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
+import { requireProject } from '../../tui/guards';
+import type { Command } from '@commander-js/extra-typings';
+import { Text, render } from 'ink';
+import React from 'react';
+
+/** Register `pause online-eval <name>`, which pauses a deployed online eval config. */
+export const registerPause = (program: Command) => {
+  program
+    .command('pause')
+    .description(COMMAND_DESCRIPTIONS.pause)
+    .command('online-eval')
+    .description('Pause a deployed online eval config')
+    .argument('<name>', 'Online eval config name')
+    .option('--json', 'Output as JSON')
+    .action(async (name: string, { json }: { json?: boolean }) => {
+      requireProject();
+
+      try {
+        const outcome = await handlePauseResume({ name, json }, 'pause');
+        if (json) {
+          console.log(JSON.stringify(outcome));
+        } else if (outcome.success) {
+          console.log(`Paused online eval config "${name}" (status: ${outcome.executionStatus})`);
+        } else {
+          render(<Text color="red">{outcome.error}</Text>);
+        }
+        process.exit(outcome.success ? 0 : 1);
+      } catch (error) {
+        const message = getErrorMessage(error);
+        if (json) {
+          console.log(JSON.stringify({ success: false, error: message }));
+        } else {
+          render(<Text color="red">Error: {message}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+};
diff --git a/src/cli/commands/pause/index.ts b/src/cli/commands/pause/index.ts
new file mode 100644
index 00000000..858054fd
--- /dev/null
+++ b/src/cli/commands/pause/index.ts
@@ -0,0 +1 @@
+export { registerPause } from './command';
diff --git a/src/cli/commands/resume/command.tsx b/src/cli/commands/resume/command.tsx
new file mode 100644
index 00000000..49abcca0
--- /dev/null
+++ b/src/cli/commands/resume/command.tsx
@@ -0,0 +1,41 @@
+import { getErrorMessage } from '../../errors';
+import { handlePauseResume } from '../../operations/eval';
+import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
+import { requireProject } from '../../tui/guards';
+import type { Command } from '@commander-js/extra-typings';
+import { Text, render } from 'ink';
+import React from 'react';
+
+/** Register `resume online-eval <name>`, which resumes a paused online eval config. */
+export const registerResume = (program: Command) => {
+  program
+    .command('resume')
+    .description(COMMAND_DESCRIPTIONS.resume)
+    .command('online-eval')
+    .description('Resume a paused online eval config')
+    .argument('<name>', 'Online eval config name')
+    .option('--json', 'Output as JSON')
+    .action(async (name: string, { json }: { json?: boolean }) => {
+      requireProject();
+
+      try {
+        const outcome = await handlePauseResume({ name, json }, 'resume');
+        if (json) {
+          console.log(JSON.stringify(outcome));
+        } else if (outcome.success) {
+          console.log(`Resumed online eval config "${name}" (status: ${outcome.executionStatus})`);
+        } else {
+          render(<Text color="red">{outcome.error}</Text>);
+        }
+        process.exit(outcome.success ? 0 : 1);
+      } catch (error) {
+        const message = getErrorMessage(error);
+        if (json) {
+          console.log(JSON.stringify({ success: false, error: message }));
+        } else {
+          render(<Text color="red">Error: {message}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+};
diff --git a/src/cli/commands/resume/index.ts b/src/cli/commands/resume/index.ts
new file mode 100644
index 00000000..5303df0f
--- /dev/null
+++ b/src/cli/commands/resume/index.ts
@@ -0,0 +1 @@
+export { registerResume } from './command';
diff --git a/src/cli/commands/run/command.tsx b/src/cli/commands/run/command.tsx
new file mode 100644
index 00000000..b63cfbab
--- /dev/null
+++ b/src/cli/commands/run/command.tsx
@@ -0,0 +1,94 @@
+import { getErrorMessage } from '../../errors';
+import { handleRunEval } from '../../operations/eval';
+import type { RunEvalOptions } from '../../operations/eval';
+import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
+import { requireProject } from '../../tui/guards';
+import type { Command } from '@commander-js/extra-typings';
+import { Text, render } from 'ink';
+import React from 'react';
+
+/** Print a human-readable summary of an eval run; prints nothing when the result has no run. */
+function formatRunOutput(result: Awaited<ReturnType<typeof handleRunEval>>): void {
+  const { run, filePath } = result;
+  if (!run) return;
+
+  console.log(`\nEval Run: ${run.runId}`);
+  console.log(`Agent: ${run.agent} | Sessions: ${run.sessionCount} | Lookback: ${run.lookbackDays}d\n`);
+
+  run.results.forEach(({ evaluator, aggregateScore, sessionScores }) => {
+    const formattedScore = aggregateScore.toFixed(2);
+    const failed = sessionScores.filter(s => s.errorMessage).length;
+    console.log(`  ${evaluator}: ${formattedScore}${failed > 0 ? ` (${failed} errors)` : ''}`);
+  });
+
+  if (filePath) {
+    console.log(`\nResults saved to: ${filePath}`);
+  }
+}
+
+/** Register `run eval`; a missing evaluator or a --days value that is not a positive integer exits 1. */
+export const registerRun = (program: Command) => {
+  const runCmd = program.command('run').description(COMMAND_DESCRIPTIONS.run);
+
+  runCmd
+    .command('eval')
+    .description('Run on-demand evaluation of agent traces')
+    .option('-a, --agent <name>', 'Agent to evaluate')
+    .option('-e, --evaluator <names...>', 'Evaluator name(s) or Builtin.* IDs')
+    .option('--evaluator-arn <arns...>', 'Evaluator ARN(s) to use directly')
+    .option('--days <days>', 'Lookback window in days', '7')
+    .option('--output <path>', 'Custom output file path for results')
+    .option('--json', 'Output as JSON')
+    .action(
+      async (cliOptions: {
+        agent?: string;
+        evaluator?: string[];
+        evaluatorArn?: string[];
+        days: string;
+        output?: string;
+        json?: boolean;
+      }) => {
+        requireProject();
+
+        // Report a failure as JSON, or as red text (optionally worded differently), then exit 1.
+        const fail = (error: string, text = error): never => {
+          if (cliOptions.json) console.log(JSON.stringify({ success: false, error }));
+          else render(<Text color="red">{text}</Text>);
+          return process.exit(1);
+        };
+
+        if (!cliOptions.evaluator && !cliOptions.evaluatorArn) {
+          fail('At least one --evaluator or --evaluator-arn is required');
+        }
+
+        // parseInt yields NaN for non-numeric input; reject that and non-positive windows up front.
+        const days = parseInt(cliOptions.days, 10);
+        if (Number.isNaN(days) || days < 1) {
+          fail(`Invalid --days value "${cliOptions.days}": expected a positive integer`);
+        }
+
+        const options: RunEvalOptions = {
+          agent: cliOptions.agent,
+          evaluator: cliOptions.evaluator ?? [],
+          evaluatorArn: cliOptions.evaluatorArn,
+          days,
+          output: cliOptions.output,
+          json: cliOptions.json,
+        };
+
+        try {
+          const result = await handleRunEval(options);
+          if (cliOptions.json) {
+            console.log(JSON.stringify(result));
+          } else {
+            formatRunOutput(result);
+            if (!result.success) render(<Text color="red">{result.error}</Text>);
+          }
+          process.exit(result.success ? 0 : 1);
+        } catch (error) {
+          const message = getErrorMessage(error);
+          fail(message, `Error: ${message}`);
+        }
+      }
+    );
+};
diff --git a/src/cli/commands/run/index.ts b/src/cli/commands/run/index.ts
new file mode 100644
index 00000000..a9200f87
--- /dev/null
+++ b/src/cli/commands/run/index.ts
@@ -0,0 +1 @@
+export { registerRun } from './command';
diff --git a/src/cli/commands/status/__tests__/action.test.ts b/src/cli/commands/status/__tests__/action.test.ts
index a9ec8ef4..603f9574 100644
--- a/src/cli/commands/status/__tests__/action.test.ts
+++ b/src/cli/commands/status/__tests__/action.test.ts
@@ -259,6 +259,116 @@ describe('computeResourceStatuses', () => {
     expect(gwEntry!.identifier).toBe('gw-456');
   });
 
+  it('marks evaluator as deployed when in both local and deployed state', () => {
+    const project = {
+      ...baseProject,
+      evaluators: [{ name: 'MyEval', level: 'SESSION', config: {} }],
+    } as unknown as AgentCoreProjectSpec;
+
+    const resources: DeployedResourceState = {
+      evaluators: {
+        MyEval: {
+          evaluatorId: 'proj_MyEval-abc123',
+          evaluatorArn: 'arn:aws:bedrock:us-east-1:123456789:evaluator/proj_MyEval-abc123',
+        },
+      },
+    };
+
+    const result = computeResourceStatuses(project, resources);
+    const evalEntry = result.find(r => r.resourceType === 'evaluator' && r.name === 'MyEval');
+
+    expect(evalEntry).toBeDefined();
+    expect(evalEntry!.deploymentState).toBe('deployed');
+    expect(evalEntry!.identifier).toBe('arn:aws:bedrock:us-east-1:123456789:evaluator/proj_MyEval-abc123');
+    expect(evalEntry!.detail).toBe('SESSION — LLM-as-a-Judge');
+  });
+
+  it('marks evaluator as local-only when not deployed', () => {
+    const project = {
+      ...baseProject,
+      evaluators: [{ name: 'MyEval', level: 'TRACE', config: {} }],
+    } as unknown as AgentCoreProjectSpec;
+
+    const result = computeResourceStatuses(project, undefined);
+    const evalEntry = result.find(r => r.resourceType === 'evaluator' && r.name === 'MyEval');
+
+    expect(evalEntry).toBeDefined();
+    expect(evalEntry!.deploymentState).toBe('local-only');
+    expect(evalEntry!.detail).toBe('TRACE — LLM-as-a-Judge');
+  });
+
+  it('marks evaluator as pending-removal when deployed but removed from schema', () => {
+    const resources: DeployedResourceState = {
+      evaluators: {
+        RemovedEval: {
+          evaluatorId: 'proj_RemovedEval-xyz',
+          evaluatorArn: 'arn:aws:bedrock:us-east-1:123456789:evaluator/proj_RemovedEval-xyz',
+        },
+      },
+    };
+
+    const result = computeResourceStatuses(baseProject, resources);
+    const evalEntry = result.find(r => r.resourceType === 'evaluator' && r.name === 'RemovedEval');
+
+    expect(evalEntry).toBeDefined();
+    expect(evalEntry!.deploymentState).toBe('pending-removal');
+  });
+
+  it('marks online-eval config as deployed when in both local and deployed state', () => {
+    const project = {
+      ...baseProject,
+      onlineEvalConfigs: [{ name: 'TestConfig', agents: ['Agent1'], evaluators: ['Builtin.Helpfulness'] }],
+    } as unknown as AgentCoreProjectSpec;
+
+    const resources: DeployedResourceState = {
+      onlineEvalConfigs: {
+        TestConfig: {
+          onlineEvaluationConfigId: 'proj_TestConfig-abc',
+          onlineEvaluationConfigArn: 'arn:aws:bedrock:us-east-1:123456789:online-evaluation-config/proj_TestConfig-abc',
+        },
+      },
+    };
+
+    const result = computeResourceStatuses(project, resources);
+    const configEntry = result.find(r => r.resourceType === 'online-eval' && r.name === 'TestConfig');
+
+    expect(configEntry).toBeDefined();
+    expect(configEntry!.deploymentState).toBe('deployed');
+    expect(configEntry!.detail).toBe('1 agent, 1 evaluator');
+  });
+
+  it('marks online-eval config as local-only when not deployed', () => {
+    const project = {
+      ...baseProject,
+      onlineEvalConfigs: [{ name: 'TestConfig', agents: ['A', 'B'], evaluators: ['Builtin.X', 'Builtin.Y', 'Custom'] }],
+    } as unknown as AgentCoreProjectSpec;
+
+    const result = computeResourceStatuses(project, undefined);
+    const configEntry = result.find(r => r.resourceType === 'online-eval' && r.name === 'TestConfig');
+
+    expect(configEntry).toBeDefined();
+    expect(configEntry!.deploymentState).toBe('local-only');
+    expect(configEntry!.detail).toBe('2 agents, 3 evaluators');
+  });
+
+  it('marks online-eval config as pending-removal when deployed but removed from schema', () => {
+    const resources: DeployedResourceState = {
+      onlineEvalConfigs: {
+        RemovedConfig: {
+          onlineEvaluationConfigId: 'proj_RemovedConfig-xyz',
+          onlineEvaluationConfigArn:
+            'arn:aws:bedrock:us-east-1:123456789:online-evaluation-config/proj_RemovedConfig-xyz',
+        },
+      },
+    };
+
+    const result = computeResourceStatuses(baseProject, resources);
+    const configEntry = result.find(r => r.resourceType === 'online-eval' && r.name === 'RemovedConfig');
+
+    expect(configEntry).toBeDefined();
+    expect(configEntry!.deploymentState).toBe('pending-removal');
+  });
+
   it('handles mixed deployed and local-only resources', () => {
     const project = {
       ...baseProject,
diff --git a/src/cli/commands/status/action.ts b/src/cli/commands/status/action.ts
index f14f6bbf..0b00b144 100644
--- a/src/cli/commands/status/action.ts
+++ b/src/cli/commands/status/action.ts
@@ -152,19 +152,22 @@ export function computeResourceStatuses(
     },
   });
 
-  const evaluators: ResourceStatusEntry[] = (project.evaluators ?? []).map(e => ({
+  const evaluators = diffResourceSet({
     resourceType: 'evaluator',
-    name: e.name,
-    deploymentState: 'local-only' as ResourceDeploymentState,
-    detail: `${e.level} — LLM-as-a-Judge`,
-  }));
+    localItems: project.evaluators ?? [],
+    deployedRecord: resources?.evaluators ?? {},
+    getIdentifier: deployed => deployed.evaluatorArn,
+    getLocalDetail: item => `${item.level} — LLM-as-a-Judge`,
+  });
 
-  const onlineEvalConfigs: ResourceStatusEntry[] = (project.onlineEvalConfigs ?? []).map(c => ({
+  const onlineEvalConfigs = diffResourceSet({
     resourceType: 'online-eval',
-    name: c.name,
-    deploymentState: 'local-only' as ResourceDeploymentState,
-    detail: `${c.agents.length} agent${c.agents.length !== 1 ? 's' : ''}, ${c.evaluators.length} evaluator${c.evaluators.length !== 1 ? 's' : ''}`,
-  }));
+    localItems: project.onlineEvalConfigs ?? [],
+    deployedRecord: resources?.onlineEvalConfigs ?? {},
+    getIdentifier: deployed => deployed.onlineEvaluationConfigArn,
+    getLocalDetail: item =>
+      `${item.agents.length} agent${item.agents.length !== 1 ? 's' : ''}, ${item.evaluators.length} evaluator${item.evaluators.length !== 1 ? 's' : ''}`,
+  });
 
   return [...agents, ...credentials, ...memories, ...gateways, ...evaluators, ...onlineEvalConfigs];
 }
diff --git a/src/cli/operations/eval/get-eval-run.ts b/src/cli/operations/eval/get-eval-run.ts
new file mode 100644
index 00000000..724e0ede
--- /dev/null
+++ b/src/cli/operations/eval/get-eval-run.ts
@@ -0,0 +1,17 @@
+import { loadEvalRun } from './storage';
+import type { EvalRunResult, GetEvalRunOptions } from './types';
+
+export interface GetEvalRunResult {
+  success: boolean;
+  error?: string; // message of the error thrown by loadEvalRun; set when success is false
+  run?: EvalRunResult; // the loaded run; set when success is true
+}
+
+/** Loads a stored eval run by ID; failures are reported in the result instead of being thrown. */
+export function handleGetEvalRun(options: GetEvalRunOptions): GetEvalRunResult {
+  try {
+    return { success: true, run: loadEvalRun(options.runId) };
+  } catch (err: unknown) {
+    return { success: false, error: err instanceof Error ? err.message : String(err) };
+  }
+}
diff --git a/src/cli/operations/eval/index.ts b/src/cli/operations/eval/index.ts
new file mode 100644
index 00000000..f991a4d4
--- /dev/null
+++ b/src/cli/operations/eval/index.ts
@@ -0,0 +1,18 @@
+export { handleRunEval } from './run-eval';
+export type { RunEvalResult } from './run-eval';
+export { handleListEvalRuns } from './list-eval-runs';
+export type { ListEvalRunsResult } from './list-eval-runs';
+export { handleGetEvalRun } from './get-eval-run';
+export type { GetEvalRunResult } from './get-eval-run';
+export { handlePauseResume } from './pause-resume';
+export type { PauseResumeResult } from './pause-resume';
+export { handleLogsEval } from './logs-eval';
+export type { LogsEvalResult } from './logs-eval';
+export type {
+  EvalRunResult,
+  RunEvalOptions,
+  ListEvalRunsOptions,
+  GetEvalRunOptions,
+  OnlineEvalActionOptions,
+} from './types';
+export type { LogsEvalOptions } from './logs-eval';
diff --git a/src/cli/operations/eval/list-eval-runs.ts b/src/cli/operations/eval/list-eval-runs.ts
new file mode 100644
index 00000000..53676a9e
--- /dev/null
+++ b/src/cli/operations/eval/list-eval-runs.ts
@@ -0,0 +1,26 @@
+import { listEvalRuns } from './storage';
+import type { EvalRunResult, ListEvalRunsOptions } from './types';
+
+export interface ListEvalRunsResult {
+  success: boolean;
+  error?: string; // message of the caught error; set when success is false
+  runs?: EvalRunResult[]; // runs left after the agent filter and limit; set when success is true
+}
+
+/**
+ * Lists stored eval runs, optionally narrowed to one agent and capped at `options.limit`.
+ * A missing, zero, negative, or NaN limit means "no cap". Failures are returned, not thrown.
+ */
+export function handleListEvalRuns(options: ListEvalRunsOptions): ListEvalRunsResult {
+  try {
+    const { agent, limit } = options;
+    const all = listEvalRuns();
+    const matching = agent ? all.filter(r => r.agent === agent) : all;
+    // `limit > 0` is false for NaN and negatives, so slice() never receives a negative end
+    // index (which would drop runs from the tail instead of capping the list).
+    const runs = limit !== undefined && limit > 0 ? matching.slice(0, limit) : matching;
+    return { success: true, runs };
+  } catch (err: unknown) {
+    return { success: false, error: err instanceof Error ? err.message : String(err) };
+  }
+}
diff --git a/src/cli/operations/eval/logs-eval.ts b/src/cli/operations/eval/logs-eval.ts
new file mode 100644
index 00000000..3446d5f9
--- /dev/null
+++ b/src/cli/operations/eval/logs-eval.ts
@@ -0,0 +1,140 @@
+import { parseTimeString } from '../../../lib/utils';
+import { searchLogs, streamLogs } from '../../aws/cloudwatch';
+import { loadDeployedProjectConfig, resolveAgent } from '../resolve-agent';
+
+export interface LogsEvalOptions {
+  agent?: string; // agent to resolve; passed straight to resolveAgent
+  since?: string; // search window start, parsed with parseTimeString (default: one hour ago)
+  until?: string; // search window end, parsed with parseTimeString (default: now)
+  lines?: string; // event limit handed to searchLogs for each log group; parsed with parseInt
+  json?: boolean; // print each event as a JSON object instead of a plain text line
+  follow?: boolean; // stream instead of search; defaults to true when since and until are both unset
+}
+
+export interface LogsEvalResult {
+  success: boolean;
+  error?: string;
+}
+
+/** Renders one log event as an ISO-timestamped line: a JSON object when `json` is set, plain text otherwise. */
+function formatLogLine(event: { timestamp: number; message: string }, json: boolean): string {
+  const { message } = event;
+  const timestamp = new Date(event.timestamp).toISOString();
+
+  return json ? JSON.stringify({ timestamp, message }) : `${timestamp}  ${message}`;
+}
+
+/**
+ * Collects the CloudWatch log group names of every deployed online eval config that
+ * monitors `agentName` in the given target. Online eval results are written to:
+ * /aws/bedrock-agentcore/evaluations/results/{onlineEvalConfigId}
+ *
+ * Configs that are in the project but have no deployed ID for the target are skipped,
+ * so the returned list may be empty.
+ */
+function resolveEvalLogGroups(
+  context: Awaited<ReturnType<typeof loadDeployedProjectConfig>>,
+  agentName: string,
+  targetName: string
+): string[] {
+  const { project, deployedState } = context;
+  const targetResources = deployedState.targets[targetName]?.resources;
+
+  // flatMap acts as a combined filter + map: a config without a deployed ID contributes
+  // an empty array and therefore drops out of the result.
+  return (project.onlineEvalConfigs ?? [])
+    .filter(config => config.agents.includes(agentName))
+    .flatMap(config => {
+      const configId = targetResources?.onlineEvalConfigs?.[config.name]?.onlineEvaluationConfigId;
+      return configId ? [`/aws/bedrock-agentcore/evaluations/results/${configId}`] : [];
+    });
+}
+
+/**
+ * Prints online-eval result logs for the resolved agent to stdout.
+ *
+ * Follow mode (the default when neither `since` nor `until` is set) streams each matching
+ * log group; otherwise a one-shot search covers `since`..`until` (default: the last hour),
+ * with `lines` capping the events requested per log group. Log groups that raise
+ * ResourceNotFoundException are skipped. A SIGINT abort still reports success.
+ *
+ * @returns `{ success: false, error }` if the agent or its deployed online eval configs
+ *   cannot be resolved, or `lines` is not numeric; `{ success: true }` otherwise.
+ * @throws Any other error raised while loading config, parsing times, or reading logs.
+ */
+export async function handleLogsEval(options: LogsEvalOptions): Promise<LogsEvalResult> {
+  const context = await loadDeployedProjectConfig();
+  const agentResult = resolveAgent(context, { agent: options.agent });
+  if (!agentResult.success) {
+    return { success: false, error: agentResult.error };
+  }
+
+  const { agent } = agentResult;
+  const logGroups = resolveEvalLogGroups(context, agent.agentName, agent.targetName);
+  if (logGroups.length === 0) {
+    return {
+      success: false,
+      error: `No deployed online eval configs found for agent '${agent.agentName}'. Add one with 'agentcore add online-eval' and deploy.`,
+    };
+  }
+
+  const isJson = options.json ?? false;
+  const isFollow = options.follow ?? (!options.since && !options.until);
+
+  // Validate `lines` up front so a typo fails fast instead of reaching CloudWatch as NaN.
+  const limit = !isFollow && options.lines ? parseInt(options.lines, 10) : undefined;
+  if (limit !== undefined && Number.isNaN(limit)) {
+    return { success: false, error: `Invalid lines value '${options.lines}': expected an integer.` };
+  }
+
+  const ac = new AbortController();
+  const onSignal = () => ac.abort();
+  process.on('SIGINT', onSignal);
+
+  try {
+    for (const logGroupName of logGroups) {
+      // Once the user has pressed Ctrl+C, do not start on the remaining log groups.
+      if (ac.signal.aborted) break;
+
+      try {
+        if (isFollow) {
+          console.error(`Streaming eval logs for ${agent.agentName} from ${logGroupName}... (Ctrl+C to stop)`);
+          for await (const event of streamLogs({
+            logGroupName,
+            region: agent.region,
+            accountId: agent.accountId,
+            abortSignal: ac.signal,
+          })) {
+            console.log(formatLogLine(event, isJson));
+          }
+        } else {
+          const startTimeMs = options.since ? parseTimeString(options.since) : Date.now() - 3_600_000;
+          const endTimeMs = options.until ? parseTimeString(options.until) : Date.now();
+          const query = { logGroupName, region: agent.region, startTimeMs, endTimeMs, limit };
+          for await (const event of searchLogs(query)) {
+            // searchLogs is not given the abort signal, so honour Ctrl+C between events.
+            if (ac.signal.aborted) break;
+            console.log(formatLogLine(event, isJson));
+          }
+        }
+      } catch (err: unknown) {
+        if ((err as { name?: string })?.name !== 'ResourceNotFoundException') throw err;
+        // Log group exists in config but not yet in CloudWatch — skip it.
+        if (isFollow) {
+          console.error(`Log group ${logGroupName} not found yet — waiting for online eval results...`);
+        }
+      }
+    }
+
+    return { success: true };
+  } catch (err: unknown) {
+    // A SIGINT-triggered abort is the normal way to leave follow mode, not a failure.
+    if ((err as { name?: string })?.name === 'AbortError' || ac.signal.aborted) {
+      return { success: true };
+    }
+
+    throw err;
+  } finally {
+    process.removeListener('SIGINT', onSignal);
+  }
+}
diff --git a/src/cli/operations/eval/pause-resume.ts b/src/cli/operations/eval/pause-resume.ts
new file mode 100644
index 00000000..7e3b280f
--- /dev/null
+++ b/src/cli/operations/eval/pause-resume.ts
@@ -0,0 +1,72 @@
+import type { OnlineEvalExecutionStatus } from '../../aws/agentcore-control';
+import { updateOnlineEvalExecutionStatus } from '../../aws/agentcore-control';
+import { loadDeployedProjectConfig } from '../resolve-agent';
+import type { OnlineEvalActionOptions } from './types';
+
+export interface PauseResumeResult {
+  success: boolean;
+  error?: string; // why the config could not be resolved or updated; set when success is false
+  configId?: string; // config ID returned by updateOnlineEvalExecutionStatus
+  executionStatus?: string; // execution status returned by updateOnlineEvalExecutionStatus
+}
+
+/**
+ * Resolves a deployed online eval config by name to its config ID and AWS region.
+ * All deployed targets are searched (in deployed-state key order) and the first target
+ * containing the config is used, so configs outside the first target are still found.
+ *
+ * @returns The config ID and region on success; otherwise `{ success: false, error }` when
+ *   nothing is deployed, the config is not in any target, or its target has no AWS entry.
+ */
+async function resolveOnlineEvalConfig(
+  configName: string
+): Promise<{ success: true; configId: string; region: string } | { success: false; error: string }> {
+  const { deployedState, awsTargets } = await loadDeployedProjectConfig();
+  const targetNames = Object.keys(deployedState.targets);
+  if (targetNames.length === 0) {
+    return { success: false, error: 'No deployed targets found. Run `agentcore deploy` first.' };
+  }
+
+  for (const targetName of targetNames) {
+    const targetResources = deployedState.targets[targetName]?.resources;
+    const deployedConfig = targetResources?.onlineEvalConfigs?.[configName];
+    if (!deployedConfig) continue;
+
+    const targetConfig = awsTargets.find(t => t.name === targetName);
+    if (!targetConfig) {
+      return { success: false, error: `Target config "${targetName}" not found in aws-targets.` };
+    }
+    return { success: true, configId: deployedConfig.onlineEvaluationConfigId, region: targetConfig.region };
+  }
+
+  const error = `Online eval config "${configName}" not found in deployed state. Has it been deployed?`;
+  return { success: false, error };
+}
+
+/**
+ * Pauses or resumes a deployed online eval config by updating its execution status
+ * (`pause` requests DISABLED, `resume` requests ENABLED).
+ *
+ * @returns The config ID and status reported by the update call on success;
+ *   `{ success: false, error }` when the config cannot be resolved or the update throws.
+ */
+export async function handlePauseResume(
+  options: OnlineEvalActionOptions,
+  action: 'pause' | 'resume'
+): Promise<PauseResumeResult> {
+  const resolution = await resolveOnlineEvalConfig(options.name);
+  if (!resolution.success) {
+    return resolution;
+  }
+
+  const executionStatus: OnlineEvalExecutionStatus = action === 'pause' ? 'DISABLED' : 'ENABLED';
+  const { region, configId: onlineEvaluationConfigId } = resolution;
+
+  try {
+    const updated = await updateOnlineEvalExecutionStatus({ region, onlineEvaluationConfigId, executionStatus });
+    return { success: true, configId: updated.configId, executionStatus: updated.executionStatus };
+  } catch (err: unknown) {
+    // Thrown values are not guaranteed to be Error instances; stringify anything else.
+    return { success: false, error: err instanceof Error ? err.message : String(err) };
+  }
+}
diff --git a/src/cli/operations/eval/run-eval.ts b/src/cli/operations/eval/run-eval.ts
new file mode 100644
index 00000000..c440b7aa
--- /dev/null
+++ b/src/cli/operations/eval/run-eval.ts
@@ -0,0 +1,387 @@
+import { getCredentialProvider } from '../../aws';
+import { evaluate } from '../../aws/agentcore';
+import { DEFAULT_ENDPOINT_NAME } from '../../constants';
+import type { DeployedProjectConfig } from '../resolve-agent';
+import { loadDeployedProjectConfig, resolveAgent } from '../resolve-agent';
+import { generateRunId, saveEvalRun } from './storage';
+import type { EvalEvaluatorResult, EvalRunResult, EvalSessionScore, RunEvalOptions } from './types';
+import { CloudWatchLogsClient, GetQueryResultsCommand, StartQueryCommand } from '@aws-sdk/client-cloudwatch-logs';
+import type { ResultField } from '@aws-sdk/client-cloudwatch-logs';
+import type { DocumentType } from '@smithy/types';
+
+const SPANS_LOG_GROUP = 'aws/spans';
+
+const SUPPORTED_SCOPES = new Set([ // `scope.name` values that isRelevantForEval accepts outright
+  'strands.telemetry.tracer',
+  'opentelemetry.instrumentation.langchain',
+  'openinference.instrumentation.langchain',
+]);
+
+interface ResolvedEvalContext {
+  agentName: string;
+  region: string;
+  accountId: string;
+  runtimeId: string;
+  runtimeLogGroup: string; // `/aws/bedrock-agentcore/runtimes/<runtimeId>-<DEFAULT_ENDPOINT_NAME>`
+  evaluatorIds: string[]; // Builtin.* names and deployed evaluator IDs first, then IDs from evaluatorArn
+}
+
+/**
+ * Resolves the agent and every requested evaluator into the identifiers an eval run needs.
+ *
+ * `Builtin.*` evaluator names pass through unchanged, other names are looked up in the
+ * agent target's deployed state, and `evaluatorArn` entries are reduced to the ID after
+ * `evaluator/` (or used verbatim when they contain no such segment). Named evaluators
+ * come first in `evaluatorIds`, followed by the ARN-derived ones.
+ *
+ * @returns The resolved context, or `{ success: false, error }` when the agent cannot be
+ *   resolved, a named evaluator is not deployed, or no evaluator was requested at all.
+ */
+function resolveEvalContext(
+  context: DeployedProjectConfig,
+  options: RunEvalOptions
+): { success: true; ctx: ResolvedEvalContext } | { success: false; error: string } {
+  const agentResult = resolveAgent(context, { agent: options.agent });
+  if (!agentResult.success) {
+    return agentResult;
+  }
+
+  const { agentName, region, accountId, runtimeId, targetName } = agentResult.agent;
+  const targetResources = context.deployedState.targets[targetName]?.resources;
+  const evaluatorIds: string[] = [];
+
+  // Resolve evaluator names to IDs
+  for (const evalName of options.evaluator) {
+    if (evalName.startsWith('Builtin.')) {
+      evaluatorIds.push(evalName);
+    } else {
+      const deployedEval = targetResources?.evaluators?.[evalName];
+      if (!deployedEval) {
+        return {
+          success: false,
+          error: `Evaluator "${evalName}" not found in deployed state. Has it been deployed?`,
+        };
+      }
+      evaluatorIds.push(deployedEval.evaluatorId);
+    }
+  }
+
+  // Direct ARNs/IDs: keep only the ID portion when a full ARN is supplied
+  for (const arnOrId of options.evaluatorArn ?? []) {
+    evaluatorIds.push(/evaluator\/(.+)$/.exec(arnOrId)?.[1] ?? arnOrId);
+  }
+
+  if (evaluatorIds.length === 0) {
+    return { success: false, error: 'No evaluators specified. Use -e/--evaluator or --evaluator-arn.' };
+  }
+
+  const runtimeLogGroup = `/aws/bedrock-agentcore/runtimes/${runtimeId}-${DEFAULT_ENDPOINT_NAME}`;
+
+  return {
+    success: true,
+    ctx: { agentName, region, accountId, runtimeId, runtimeLogGroup, evaluatorIds },
+  };
+}
+
+/**
+ * Runs a CloudWatch Logs Insights query against one log group and waits for its rows.
+ *
+ * The query is polled once per second, at most 60 times.
+ *
+ * @returns The result rows (an empty array when the completed query reports none).
+ * @throws If the query cannot be started, ends as Failed/Cancelled, or is still not
+ *   Complete after the final poll.
+ */
+async function executeQuery(
+  client: CloudWatchLogsClient,
+  logGroupName: string,
+  queryString: string,
+  startTimeSec: number,
+  endTimeSec: number
+): Promise<ResultField[][]> {
+  const command = new StartQueryCommand({ logGroupName, startTime: startTimeSec, endTime: endTimeSec, queryString });
+  const { queryId } = await client.send(command);
+  if (!queryId) {
+    throw new Error('Failed to start CloudWatch Logs Insights query');
+  }
+
+  let pollsLeft = 60;
+  while (pollsLeft-- > 0) {
+    await new Promise(resolve => setTimeout(resolve, 1000));
+
+    const { status = 'Unknown', results } = await client.send(new GetQueryResultsCommand({ queryId }));
+    switch (status) {
+      case 'Failed':
+      case 'Cancelled':
+        throw new Error(`CloudWatch query ${status.toLowerCase()}`);
+      case 'Complete':
+        return results ?? [];
+      default:
+        break; // any other status: wait and poll again
+    }
+  }
+
+  throw new Error('CloudWatch query timed out after 60 seconds');
+}
+
+/**
+ * Parses the `@message` field of each CloudWatch Insights row as JSON.
+ * Rows without a non-empty `@message`, or whose message is not valid JSON, are dropped.
+ */
+function extractMessages(rows: ResultField[][]): Record<string, unknown>[] {
+  return rows.flatMap(row => {
+    const raw = row.find(f => f.field === '@message')?.value;
+    if (!raw) {
+      return [];
+    }
+    try {
+      return [JSON.parse(raw) as Record<string, unknown>];
+    } catch {
+      return []; // Skip non-JSON log lines
+    }
+  });
+}
+
+/**
+ * Decides whether a parsed document should be included in the evaluation payload.
+ * It qualifies when its `scope.name` is one of SUPPORTED_SCOPES, or when its `body`
+ * is an object with an `input` or `output` key (conversation data).
+ */
+function isRelevantForEval(doc: Record<string, unknown>): boolean {
+  const { scope, body } = doc as { scope?: { name?: string } | null; body?: unknown };
+
+  // Documents emitted by a supported instrumentation scope are always relevant.
+  if (scope?.name && SUPPORTED_SCOPES.has(scope.name)) {
+    return true;
+  }
+
+  // Otherwise only log records that carry conversation data qualify.
+  if (typeof body !== 'object' || body === null) {
+    return false;
+  }
+  return ['input', 'output'].some(key => key in body);
+}
+
+interface SessionSpans {
+  sessionId: string; // session ID from the span query, or 'unknown' when a row had none
+  spans: DocumentType[]; // parsed span documents plus any matched runtime-log documents
+}
+
+/**
+ * Queries the `aws/spans` log group for this runtime's OTel spans over the last
+ * `lookbackDays` days, merges in relevant runtime-log records that share a trace ID with
+ * those spans, and groups all documents by session ID. Returns [] when no span parses.
+ * The Evaluate API requires spans from a single session per call.
+ */
+async function fetchSessionSpans(
+  runtimeId: string,
+  runtimeLogGroup: string,
+  region: string,
+  lookbackDays: number
+): Promise<SessionSpans[]> {
+  const endTimeMs = Date.now();
+  const startTimeMs = endTimeMs - lookbackDays * 24 * 60 * 60 * 1000;
+  const startTimeSec = Math.floor(startTimeMs / 1000);
+  const endTimeSec = Math.floor(endTimeMs / 1000);
+
+  const client = new CloudWatchLogsClient({
+    credentials: getCredentialProvider(),
+    region,
+  });
+
+  // 1. Query OTel spans for this runtime from the aws/spans log group (oldest first, max 10000 rows)
+  const spanRows = await executeQuery(
+    client,
+    SPANS_LOG_GROUP,
+    `fields @message, attributes.session.id as sessionId, traceId
+     | parse resource.attributes.cloud.resource_id "runtime/*/" as parsedAgentId
+     | filter parsedAgentId = '${runtimeId}'
+     | sort startTimeUnixNano asc
+     | limit 10000`,
+    startTimeSec,
+    endTimeSec
+  );
+
+  // Group parsed span documents by session and collect the trace IDs they belong to
+  const sessionMap = new Map<string, DocumentType[]>();
+  const traceIds = new Set<string>();
+
+  for (const row of spanRows) {
+    const messageField = row.find(f => f.field === '@message');
+    const sessionField = row.find(f => f.field === 'sessionId');
+    const traceField = row.find(f => f.field === 'traceId');
+
+    if (!messageField?.value) continue;
+
+    let doc: Record<string, unknown>;
+    try {
+      doc = JSON.parse(messageField.value) as Record<string, unknown>;
+    } catch {
+      continue; // rows whose @message is not JSON are ignored
+    }
+
+    const sessionId = sessionField?.value ?? 'unknown'; // rows without a session ID share one bucket
+    if (!sessionMap.has(sessionId)) {
+      sessionMap.set(sessionId, []);
+    }
+    sessionMap.get(sessionId)!.push(doc as DocumentType);
+
+    if (traceField?.value) {
+      traceIds.add(traceField.value);
+    }
+  }
+
+  if (sessionMap.size === 0) {
+    return [];
+  }
+
+  // 2. Query runtime logs from the agent's log group for the trace IDs found
+  if (traceIds.size > 0) {
+    const traceFilter = [...traceIds].map(t => `'${t}'`).join(', ');
+    let logRows: ResultField[][] = [];
+    try {
+      logRows = await executeQuery(
+        client,
+        runtimeLogGroup,
+        `fields @message, traceId
+         | filter traceId in [${traceFilter}]
+         | sort @timestamp asc
+         | limit 10000`,
+        startTimeSec,
+        endTimeSec
+      );
+    } catch {
+      // Best effort: the log group may not exist yet. NOTE: any query error is swallowed here.
+    }
+
+    const logDocs = extractMessages(logRows);
+
+    // Match runtime logs to sessions via traceId.
+    // Build the traceId → sessionId mapping from all span rows that carry both fields.
+    const traceToSession = new Map<string, string>();
+    for (const row of spanRows) {
+      const traceField = row.find(f => f.field === 'traceId');
+      const sessionField = row.find(f => f.field === 'sessionId');
+      if (traceField?.value && sessionField?.value) {
+        traceToSession.set(traceField.value, sessionField.value);
+      }
+    }
+
+    for (const logDoc of logDocs) {
+      if (!isRelevantForEval(logDoc)) continue;
+
+      const logTraceId = logDoc.traceId as string | undefined;
+      const sessionId = logTraceId ? (traceToSession.get(logTraceId) ?? 'unknown') : 'unknown';
+      if (!sessionMap.has(sessionId)) {
+        sessionMap.set(sessionId, []);
+      }
+      sessionMap.get(sessionId)!.push(logDoc as DocumentType);
+    }
+  }
+
+  // 3. Build session list — aws/spans docs are already scoped by runtimeId (step 1),
+  //    and runtime log docs were filtered through isRelevantForEval (step 2).
+  //    We keep all docs so the Evaluate API has full trace context for resolving
+  //    template variables like {actual_trajectory}.
+  const sessions: SessionSpans[] = [];
+  for (const [sessionId, docs] of sessionMap) {
+    if (docs.length > 0) {
+      sessions.push({ sessionId, spans: docs });
+    }
+  }
+
+  return sessions;
+}
+
+export interface RunEvalResult {
+  success: boolean;
+  error?: string; // set when success is false
+  run?: EvalRunResult; // the completed run; set when success is true
+  filePath?: string; // options.output when given, otherwise the path returned by saveEvalRun
+}
+
+/**
+ * Runs every requested evaluator against each recent session of the resolved agent and
+ * persists the combined result.
+ * The result file goes to `options.output` when given (parent directories are created),
+ * otherwise to the default location chosen by `saveEvalRun`.
+ *
+ * @returns `{ success: false, error }` when the agent/evaluators cannot be resolved or no
+ *   session spans are found in the lookback window; otherwise the run and its file path.
+ * @throws Errors from CloudWatch, the evaluate call, or the file write are not caught here.
+ */
+export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalResult> {
+  const context = await loadDeployedProjectConfig();
+  const resolution = resolveEvalContext(context, options);
+  if (!resolution.success) {
+    return { success: false, error: resolution.error };
+  }
+  const { ctx } = resolution;
+
+  // Fetch spans grouped by session
+  const sessions = await fetchSessionSpans(ctx.runtimeId, ctx.runtimeLogGroup, ctx.region, options.days);
+  if (sessions.length === 0) {
+    return {
+      success: false,
+      error: `No session spans found for agent "${ctx.agentName}" in the last ${options.days} day(s). Has the agent been invoked?`,
+    };
+  }
+
+  // Display names line up index-for-index with ctx.evaluatorIds (names first, then ARNs).
+  const allEvaluatorNames = [...options.evaluator, ...(options.evaluatorArn ?? [])];
+  const results: EvalEvaluatorResult[] = [];
+
+  // Run each evaluator against each session
+  for (const [i, evaluatorId] of ctx.evaluatorIds.entries()) {
+    const sessionScores: EvalSessionScore[] = [];
+    const tokenUsage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+
+    for (const session of sessions) {
+      const response = await evaluate({ region: ctx.region, evaluatorId, sessionSpans: session.spans });
+      for (const r of response.evaluationResults) {
+        sessionScores.push({
+          sessionId: r.context?.sessionId ?? session.sessionId,
+          traceId: r.context?.traceId,
+          spanId: r.context?.spanId,
+          value: r.value ?? 0,
+          label: r.label,
+          explanation: r.explanation,
+          errorMessage: r.errorMessage,
+        });
+        tokenUsage.inputTokens += r.tokenUsage?.inputTokens ?? 0;
+        tokenUsage.outputTokens += r.tokenUsage?.outputTokens ?? 0;
+        tokenUsage.totalTokens += r.tokenUsage?.totalTokens ?? 0;
+      }
+    }
+
+    // Scores that carry an errorMessage are left out of the average.
+    const validScores = sessionScores.filter(s => !s.errorMessage);
+    const aggregateScore =
+      validScores.length > 0 ? validScores.reduce((sum, s) => sum + s.value, 0) / validScores.length : 0;
+
+    results.push({ evaluator: allEvaluatorNames[i] ?? evaluatorId, aggregateScore, sessionScores, tokenUsage });
+  }
+
+  const run: EvalRunResult = {
+    runId: generateRunId(),
+    timestamp: new Date().toISOString(),
+    agent: ctx.agentName,
+    evaluators: allEvaluatorNames,
+    lookbackDays: options.days,
+    sessionCount: sessions.length,
+    results,
+  };
+
+  // Save to disk. An explicit output path has to be written here: saveEvalRun only knows
+  // the default location, and returning the path without a file would lose the run.
+  let filePath: string;
+  if (options.output) {
+    const [{ mkdirSync, writeFileSync }, { dirname }] = await Promise.all([import('fs'), import('path')]);
+    filePath = options.output;
+    mkdirSync(dirname(filePath), { recursive: true });
+    writeFileSync(filePath, JSON.stringify(run, null, 2));
+  } else {
+    filePath = saveEvalRun(run);
+  }
+  return { success: true, run, filePath };
+}
diff --git a/src/cli/operations/eval/storage.ts b/src/cli/operations/eval/storage.ts
new file mode 100644
index 00000000..5329e654
--- /dev/null
+++ b/src/cli/operations/eval/storage.ts
@@ -0,0 +1,54 @@
+import { findConfigRoot } from '../../../lib';
+import type { EvalRunResult } from './types';
+import { randomUUID } from 'crypto';
+import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from 'fs';
+import { join } from 'path';
+
+const EVAL_RESULTS_DIR = 'eval-results';
+
+/** Path of the eval-results directory under the project's config root; throws outside a project. */
+function getResultsDir(): string {
+  const configRoot = findConfigRoot();
+  if (configRoot) return join(configRoot, EVAL_RESULTS_DIR);
+
+  throw new Error('No agentcore project found. Run `agentcore create` first.');
+}
+
+export function generateRunId(): string {
+  return `run_${randomUUID()}`; // random, so IDs (and the file names built from them) are not time-ordered
+}
+
+/** Writes the run as pretty-printed JSON to `<results dir>/<runId>.json` and returns that path. */
+export function saveEvalRun(result: EvalRunResult): string {
+  const resultsDir = getResultsDir();
+  const filePath = join(resultsDir, `${result.runId}.json`);
+  mkdirSync(resultsDir, { recursive: true });
+  writeFileSync(filePath, JSON.stringify(result, null, 2));
+  return filePath;
+}
+
+/** Reads and parses `<results dir>/<runId>.json`; throws if the file does not exist. */
+export function loadEvalRun(runId: string): EvalRunResult {
+  const filePath = join(getResultsDir(), `${runId}.json`);
+  if (!existsSync(filePath)) {
+    throw new Error(`Eval run "${runId}" not found at ${filePath}`);
+  }
+
+  const contents = readFileSync(filePath, 'utf-8');
+  return JSON.parse(contents) as EvalRunResult;
+}
+
+/**
+ * Loads every stored eval run, newest first (by the run's ISO `timestamp`).
+ * Ordering cannot rely on file names: run IDs are random UUIDs, not time-ordered.
+ */
+export function listEvalRuns(): EvalRunResult[] {
+  const dir = getResultsDir();
+  if (!existsSync(dir)) return [];
+
+  return readdirSync(dir)
+    .filter(f => f.startsWith('run_') && f.endsWith('.json'))
+    .map(f => JSON.parse(readFileSync(join(dir, f), 'utf-8')) as EvalRunResult)
+    // ISO-8601 UTC strings order chronologically when compared as plain strings.
+    .sort((a, b) => (a.timestamp < b.timestamp ? 1 : a.timestamp > b.timestamp ? -1 : 0));
+}
diff --git a/src/cli/operations/eval/types.ts b/src/cli/operations/eval/types.ts
new file mode 100644
index 00000000..73447aba
--- /dev/null
+++ b/src/cli/operations/eval/types.ts
@@ -0,0 +1,63 @@
+/** Result of a single evaluator within an eval run */
+export interface EvalEvaluatorResult {
+  evaluator: string; // evaluator name or ARN/ID as passed in by the caller
+  aggregateScore: number; // mean `value` of the scores without an errorMessage; 0 when there are none
+  sessionScores: EvalSessionScore[]; // one entry per evaluation result, across all sessions
+  tokenUsage?: {
+    inputTokens: number; // each counter is summed over every evaluation result of this evaluator
+    outputTokens: number;
+    totalTokens: number;
+  };
+}
+
+/** Per-session score from an evaluator */
+export interface EvalSessionScore {
+  sessionId: string; // from the evaluation result's context, else the session the spans were grouped under
+  traceId?: string;
+  spanId?: string;
+  value: number; // 0 when the evaluation result carried no value
+  label?: string;
+  explanation?: string;
+  errorMessage?: string; // when set, this score is excluded from aggregateScore
+}
+
+/** Full eval run result stored to disk */
+export interface EvalRunResult {
+  runId: string; // `run_<uuid>` from generateRunId; saveEvalRun uses it as the file's base name
+  timestamp: string; // ISO-8601 string (Date#toISOString) taken when the run result is built
+  agent: string; // resolved agent name
+  evaluators: string[]; // evaluator names followed by any evaluator ARNs/IDs, as passed in
+  lookbackDays: number;
+  sessionCount: number; // number of sessions that were evaluated
+  results: EvalEvaluatorResult[]; // one entry per resolved evaluator
+}
+
+/** Options for running an eval */
+export interface RunEvalOptions {
+  agent?: string; // agent to resolve; passed to resolveAgent
+  evaluator: string[]; // `Builtin.*` names, or names of evaluators present in deployed state
+  evaluatorArn?: string[]; // evaluator ARNs or bare IDs; for ARNs only the part after `evaluator/` is used
+  days: number; // lookback window, in days, for fetching session spans
+  output?: string; // path to use for the result file instead of the default eval-results location
+  json?: boolean; // NOTE(review): not read by handleRunEval — presumably only affects CLI output; confirm
+}
+
+/** Options for listing eval runs */
+export interface ListEvalRunsOptions {
+  agent?: string; // keep only runs whose `agent` equals this name
+  limit?: number; // keep at most this many runs from the front of the list
+  json?: boolean; // NOTE(review): not read by handleListEvalRuns — presumably a CLI output flag; confirm
+}
+
+/** Options for getting a single eval run */
+export interface GetEvalRunOptions {
+  runId: string; // ID of the run to load, i.e. the result file's name without `.json`
+  sessions?: boolean; // NOTE(review): not read by handleGetEvalRun — presumably a display flag; confirm
+  json?: boolean; // NOTE(review): not read by handleGetEvalRun — presumably a CLI output flag; confirm
+}
+
+/** Options for pause/resume online eval */
+export interface OnlineEvalActionOptions {
+  name: string; // name of the online eval config, as keyed in deployed state
+  json?: boolean; // NOTE(review): not read by handlePauseResume — presumably a CLI output flag; confirm
+}
diff --git a/src/cli/primitives/OnlineEvalConfigPrimitive.ts b/src/cli/primitives/OnlineEvalConfigPrimitive.ts
index 36e66069..c4378d03 100644
--- a/src/cli/primitives/OnlineEvalConfigPrimitive.ts
+++ b/src/cli/primitives/OnlineEvalConfigPrimitive.ts
@@ -12,7 +12,6 @@ export interface AddOnlineEvalConfigOptions {
   agents: string[];
   evaluators: string[];
   samplingRate: number;
-  enableOnCreate?: boolean;
 }
 
 export type RemovableOnlineEvalConfig = RemovableResource;
@@ -203,12 +202,11 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
     this.checkDuplicate(project.onlineEvalConfigs, options.name, 'Online eval config');
 
     const config: OnlineEvalConfig = {
-      type: 'OnlineEvalConfig',
+      type: 'OnlineEvaluationConfig',
       name: options.name,
       agents: options.agents,
       evaluators: options.evaluators,
       samplingRate: options.samplingRate,
-      enableOnCreate: options.enableOnCreate ?? true,
     };
 
     project.onlineEvalConfigs.push(config);
diff --git a/src/cli/tui/App.tsx b/src/cli/tui/App.tsx
index 23511d9d..5d2873c5 100644
--- a/src/cli/tui/App.tsx
+++ b/src/cli/tui/App.tsx
@@ -7,6 +7,7 @@ import { AddFlow } from './screens/add/AddFlow';
 import { CreateScreen } from './screens/create';
 import { DeployScreen } from './screens/deploy/DeployScreen';
 import { DevScreen } from './screens/dev/DevScreen';
+import { EvalScreen } from './screens/eval';
 import { HelpScreen, HomeScreen } from './screens/home';
 import { InvokeScreen } from './screens/invoke';
 import { PackageScreen } from './screens/package';
@@ -32,6 +33,7 @@ type Route =
   | { name: 'add' }
   | { name: 'status' }
   | { name: 'remove' }
+  | { name: 'eval' }
   | { name: 'validate' }
   | { name: 'package' }
   | { name: 'update' };
@@ -84,6 +86,8 @@ function AppContent() {
       setRoute({ name: 'add' });
     } else if (id === 'remove') {
       setRoute({ name: 'remove' });
+    } else if (id === 'eval') {
+      setRoute({ name: 'eval' });
     } else if (id === 'validate') {
       setRoute({ name: 'validate' });
     } else if (id === 'package') {
@@ -179,6 +183,10 @@ function AppContent() {
     );
   }
 
+  if (route.name === 'eval') {
+    return <EvalScreen isInteractive={true} onExit={() => setRoute({ name: 'help' })} />;
+  }
+
   if (route.name === 'validate') {
     return <ValidateScreen isInteractive={true} onExit={() => setRoute({ name: 'help' })} />;
   }
diff --git a/src/cli/tui/copy.ts b/src/cli/tui/copy.ts
index 507365da..8bfd6c1a 100644
--- a/src/cli/tui/copy.ts
+++ b/src/cli/tui/copy.ts
@@ -40,6 +40,10 @@ export const COMMAND_DESCRIPTIONS = {
   remove: 'Remove AgentCore resources and project',
   status: 'Retrieve details of deployed AgentCore resources.',
   traces: 'View and download agent traces.',
+  eval: 'View eval run results.',
+  pause: 'Pause a running resource.',
+  resume: 'Resume a paused resource.',
+  run: 'Run operations (eval, etc.).',
   update: 'Check for and install CLI updates',
   validate: 'Validate agentcore/ config files.',
 } as const;
diff --git a/src/cli/tui/hooks/useCreateOnlineEval.ts b/src/cli/tui/hooks/useCreateOnlineEval.ts
index ab4ed1c4..e8ee3e9a 100644
--- a/src/cli/tui/hooks/useCreateOnlineEval.ts
+++ b/src/cli/tui/hooks/useCreateOnlineEval.ts
@@ -6,7 +6,6 @@ interface CreateOnlineEvalConfig {
   agents: string[];
   evaluators: string[];
   samplingRate: number;
-  enableOnCreate?: boolean;
 }
 
 export function useCreateOnlineEval() {
@@ -22,7 +21,6 @@ export function useCreateOnlineEval() {
         agents: config.agents,
         evaluators: config.evaluators,
         samplingRate: config.samplingRate,
-        enableOnCreate: config.enableOnCreate,
       });
       if (!addResult.success) {
         throw new Error(addResult.error ?? 'Failed to create online eval config');
diff --git a/src/cli/tui/screens/cli-only/CliOnlyScreen.tsx b/src/cli/tui/screens/cli-only/CliOnlyScreen.tsx
new file mode 100644
index 00000000..bd50d553
--- /dev/null
+++ b/src/cli/tui/screens/cli-only/CliOnlyScreen.tsx
@@ -0,0 +1,28 @@
+import { Screen } from '../../components';
+import { Box, Text } from 'ink';
+import React from 'react';
+
+interface CliOnlyScreenProps {
+  title: string; // passed to <Screen> as its title
+  description: string; // rendered as the first line of the body
+  examples: string[]; // each entry becomes a dimmed `$ <example>` line under "Usage:"
+  onExit: () => void; // forwarded to <Screen>
+}
+
+/** Static screen that shows a description followed by a "Usage:" list of example commands. */
+export function CliOnlyScreen({ title, description, examples, onExit }: CliOnlyScreenProps) {
+  const usageRows = examples.map((cmd, idx) => (
+    <Text key={idx} dimColor>{'  '}$ {cmd}</Text>
+  ));
+  return (
+    <Screen title={title} onExit={onExit}>
+      <Box flexDirection="column" marginTop={1}>
+        <Text>{description}</Text>
+        <Box marginTop={1} flexDirection="column">
+          <Text bold>Usage:</Text>
+          {usageRows}
+        </Box>
+      </Box>
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/cli-only/index.ts b/src/cli/tui/screens/cli-only/index.ts
new file mode 100644
index 00000000..79a69101
--- /dev/null
+++ b/src/cli/tui/screens/cli-only/index.ts
@@ -0,0 +1 @@
+export { CliOnlyScreen } from './CliOnlyScreen';
diff --git a/src/cli/tui/screens/deploy/useDeployFlow.ts b/src/cli/tui/screens/deploy/useDeployFlow.ts
index bb461ff8..2ceea8c1 100644
--- a/src/cli/tui/screens/deploy/useDeployFlow.ts
+++ b/src/cli/tui/screens/deploy/useDeployFlow.ts
@@ -4,8 +4,10 @@ import {
   buildDeployedState,
   getStackOutputs,
   parseAgentOutputs,
+  parseEvaluatorOutputs,
   parseGatewayOutputs,
   parseMemoryOutputs,
+  parseOnlineEvalOutputs,
 } from '../../../cloudformation';
 import { getErrorMessage, isChangesetInProgressError, isExpiredTokenError } from '../../../errors';
 import { ExecLogger } from '../../../logging';
@@ -257,6 +259,14 @@ export function useDeployFlow(options: DeployFlowOptions = {}): DeployFlowState
       );
     }
 
+    // Parse evaluator outputs
+    const evaluatorNames = (ctx.projectSpec.evaluators ?? []).map((e: { name: string }) => e.name);
+    const evaluators = parseEvaluatorOutputs(outputs, evaluatorNames);
+
+    // Parse online eval config outputs
+    const onlineEvalNames = (ctx.projectSpec.onlineEvalConfigs ?? []).map((c: { name: string }) => c.name);
+    const onlineEvalConfigs = parseOnlineEvalOutputs(outputs, onlineEvalNames);
+
     // Expose outputs to UI
     setStackOutputs(outputs);
 
@@ -269,6 +279,8 @@ export function useDeployFlow(options: DeployFlowOptions = {}): DeployFlowState
       existingState,
       identityKmsKeyArn,
       memories,
+      evaluators,
+      onlineEvalConfigs,
       credentials: Object.keys(oauthCredentials).length > 0 ? oauthCredentials : undefined,
     });
     await configIO.writeDeployedState(deployedState);
diff --git a/src/cli/tui/screens/eval/EvalScreen.tsx b/src/cli/tui/screens/eval/EvalScreen.tsx
new file mode 100644
index 00000000..f178d537
--- /dev/null
+++ b/src/cli/tui/screens/eval/EvalScreen.tsx
@@ -0,0 +1,88 @@
+import { handleListEvalRuns } from '../../../operations/eval';
+import type { EvalRunResult } from '../../../operations/eval/types';
+import { Screen } from '../../components';
+import { STATUS_COLORS } from '../../theme';
+import { Box, Text } from 'ink';
+import React, { useEffect, useState } from 'react';
+
+interface EvalScreenProps {
+  isInteractive: boolean; // when true the footer shows the "Esc/B back" hint
+  onExit: () => void; // forwarded to <Screen>
+}
+
+type Phase = 'loading' | 'loaded' | 'error'; // progress of the eval-run listing
+
+interface EvalState {
+  phase: Phase;
+  runs: EvalRunResult[]; // filled when phase is 'loaded', otherwise empty
+  error: string | null; // message shown when phase is 'error', otherwise null
+}
+
+/**
+ * TUI screen listing eval runs: run id, agent, per-evaluator scores, session count and date.
+ * The list is fetched once on mount through `handleListEvalRuns({})`.
+ */
+export function EvalScreen({ isInteractive, onExit }: EvalScreenProps) {
+  const [state, setState] = useState<EvalState>({ phase: 'loading', runs: [], error: null });
+
+  useEffect(() => {
+    async function loadRuns(): Promise<void> {
+      // Yield to allow React to paint the loading state
+      await new Promise(resolve => setTimeout(resolve, 0));
+      const listing = handleListEvalRuns({});
+      setState(
+        listing.success
+          ? { phase: 'loaded', runs: listing.runs ?? [], error: null }
+          : { phase: 'error', runs: [], error: listing.error ?? 'Unknown error' }
+      );
+    }
+
+    void loadRuns();
+  }, []);
+
+  const { phase, runs, error } = state;
+  const isLoaded = phase === 'loaded';
+
+  // Fixed-width cells joined by a single space, so data rows line up under the header row.
+  const headerCells = ['Run ID'.padEnd(42), 'Agent'.padEnd(20), 'Score'.padEnd(30), 'Sessions'.padEnd(10), 'Date'];
+  const toRowText = (run: EvalRunResult): string => {
+    const scores = run.results.map(r => `${r.evaluator}=${r.aggregateScore.toFixed(2)}`).join(', ');
+    const date = new Date(run.timestamp).toLocaleDateString();
+    const cells = [run.runId.padEnd(42), run.agent.padEnd(20), scores.padEnd(30), String(run.sessionCount).padEnd(10)];
+    return [...cells, date].join(' ');
+  };
+
+  return (
+    <Screen title="Eval Runs" onExit={onExit}>
+      <Box flexDirection="column" marginTop={1}>
+        {phase === 'loading' && <Text dimColor>Loading eval runs...</Text>}
+
+        {phase === 'error' && <Text color={STATUS_COLORS.error}>{error}</Text>}
+
+        {isLoaded && runs.length === 0 && (
+          <Text dimColor>No eval runs found. Run `agentcore run eval` to create one.</Text>
+        )}
+
+        {isLoaded && runs.length > 0 && (
+          <Box flexDirection="column">
+            <Box>
+              <Text bold>{headerCells.join(' ')}</Text>
+            </Box>
+            <Text dimColor>{'─'.repeat(110)}</Text>
+            {runs.map(run => (
+              <Box key={run.runId}>
+                <Text>{toRowText(run)}</Text>
+              </Box>
+            ))}
+          </Box>
+        )}
+
+        {phase !== 'loading' && (
+          <Box marginTop={1}>
+            <Text dimColor>{isInteractive ? 'Esc/B back' : ''}</Text>
+          </Box>
+        )}
+      </Box>
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/eval/index.ts b/src/cli/tui/screens/eval/index.ts
new file mode 100644
index 00000000..861edb00
--- /dev/null
+++ b/src/cli/tui/screens/eval/index.ts
@@ -0,0 +1 @@
+export { EvalScreen } from './EvalScreen';
diff --git a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
index 8969c010..66731159 100644
--- a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
+++ b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
@@ -16,6 +16,7 @@ import {
   validateInstructionPlaceholders,
 } from './types';
 import { useAddEvaluatorWizard } from './useAddEvaluatorWizard';
+import { Box, Text } from 'ink';
 import React, { useMemo } from 'react';
 
 interface AddEvaluatorScreenProps {
@@ -122,14 +123,21 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
         )}
 
         {isInstructionsStep && (
-          <TextInput
-            key="instructions"
-            prompt={`Evaluation instructions (must include at least one: ${LEVEL_PLACEHOLDERS[wizard.config.level].map(p => `{${p}}`).join(', ')})`}
-            initialValue={DEFAULT_INSTRUCTIONS[wizard.config.level]}
-            onSubmit={wizard.setInstructions}
-            onCancel={() => wizard.goBack()}
-            customValidation={value => validateInstructionPlaceholders(value, wizard.config.level)}
-          />
+          <Box flexDirection="column">
+            <Text>Evaluation instructions</Text>
+            <Text dimColor>
+              Must include at least one: {LEVEL_PLACEHOLDERS[wizard.config.level].map(p => `{${p}}`).join(', ')}
+            </Text>
+            <TextInput
+              key="instructions"
+              prompt=""
+              hideArrow={false}
+              initialValue={DEFAULT_INSTRUCTIONS[wizard.config.level]}
+              onSubmit={wizard.setInstructions}
+              onCancel={() => wizard.goBack()}
+              customValidation={value => validateInstructionPlaceholders(value, wizard.config.level)}
+            />
+          </Box>
         )}
 
         {isRatingScaleStep && (
diff --git a/src/cli/tui/screens/evaluator/types.ts b/src/cli/tui/screens/evaluator/types.ts
index f22a56c4..af2d2ce5 100644
--- a/src/cli/tui/screens/evaluator/types.ts
+++ b/src/cli/tui/screens/evaluator/types.ts
@@ -52,9 +52,9 @@ export const LEVEL_PLACEHOLDERS: Record<EvaluationLevel, string[]> = {
  */
 export const DEFAULT_INSTRUCTIONS: Record<EvaluationLevel, string> = {
   SESSION:
-    'Evaluate the agent session. Context: {context}. The agent trajectory was: {actual_trajectory}. Rate the overall quality of the response.',
+    'Evaluate the agent session based on the following conversation. Context: {context}. Rate the overall quality of the response.',
   TRACE:
-    'Evaluate the agent trace. Context: {context}. The agent trajectory was: {actual_trajectory}. Rate the quality of this trace.',
+    'Evaluate the agent trace based on the following conversation. Context: {context}. Rate the quality of this trace.',
   TOOL_CALL:
     'Evaluate the tool call. Tool: {tool_name}. Input: {tool_input}. Output: {tool_output}. Rate the quality of this tool usage.',
 };
diff --git a/src/cli/tui/screens/online-eval/types.ts b/src/cli/tui/screens/online-eval/types.ts
index 0c2d70b7..12f4c151 100644
--- a/src/cli/tui/screens/online-eval/types.ts
+++ b/src/cli/tui/screens/online-eval/types.ts
@@ -9,7 +9,6 @@ export interface AddOnlineEvalConfig {
   agents: string[];
   evaluators: string[];
   samplingRate: number;
-  enableOnCreate: boolean;
 }
 
 export const ONLINE_EVAL_STEP_LABELS: Record<AddOnlineEvalStep, string> = {
diff --git a/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
index a4743cb3..64141ef3 100644
--- a/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
+++ b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
@@ -10,7 +10,6 @@ function getDefaultConfig(): AddOnlineEvalConfig {
     agents: [],
     evaluators: [],
     samplingRate: DEFAULT_SAMPLING_RATE,
-    enableOnCreate: true,
   };
 }
 
diff --git a/src/cli/tui/utils/commands.ts b/src/cli/tui/utils/commands.ts
index 918d5afb..e1bd4980 100644
--- a/src/cli/tui/utils/commands.ts
+++ b/src/cli/tui/utils/commands.ts
@@ -11,7 +11,7 @@ export interface CommandMeta {
 /**
  * Commands hidden from TUI help but still available via CLI.
  */
-const HIDDEN_FROM_TUI = ['help', 'update', 'package', 'logs', 'traces'] as const;
+const HIDDEN_FROM_TUI = ['help', 'update', 'package', 'logs', 'traces', 'run', 'pause', 'resume'] as const;
 
 /**
  * Commands hidden from TUI when inside an existing project.
diff --git a/src/schema/schemas/deployed-state.ts b/src/schema/schemas/deployed-state.ts
index 9741e69d..d8a57185 100644
--- a/src/schema/schemas/deployed-state.ts
+++ b/src/schema/schemas/deployed-state.ts
@@ -119,6 +119,29 @@ export const CredentialDeployedStateSchema = z.object({
 
 export type CredentialDeployedState = z.infer<typeof CredentialDeployedStateSchema>;
 
+// ============================================================================
+// Evaluator Deployed State
+// ============================================================================
+
+export const EvaluatorDeployedStateSchema = z.object({
+  evaluatorId: z.string().min(1), // required, non-empty
+  evaluatorArn: z.string().min(1), // required, non-empty
+});
+
+export type EvaluatorDeployedState = z.infer<typeof EvaluatorDeployedStateSchema>;
+
+// ============================================================================
+// Online Eval Config Deployed State
+// ============================================================================
+
+export const OnlineEvalDeployedStateSchema = z.object({
+  onlineEvaluationConfigId: z.string().min(1), // required, non-empty
+  onlineEvaluationConfigArn: z.string().min(1), // required, non-empty
+  executionStatus: z.enum(['ENABLED', 'DISABLED']).optional(), // may be absent; only these two values validate
+});
+
+export type OnlineEvalDeployedState = z.infer<typeof OnlineEvalDeployedStateSchema>;
+
 // ============================================================================
 // Deployed Resource State
 // ============================================================================
@@ -129,6 +152,8 @@ export const DeployedResourceStateSchema = z.object({
   mcp: McpDeployedStateSchema.optional(),
   externallyManaged: ExternallyManagedStateSchema.optional(),
   credentials: z.record(z.string(), CredentialDeployedStateSchema).optional(),
+  evaluators: z.record(z.string(), EvaluatorDeployedStateSchema).optional(),
+  onlineEvalConfigs: z.record(z.string(), OnlineEvalDeployedStateSchema).optional(),
   stackName: z.string().optional(),
   identityKmsKeyArn: z.string().optional(),
 });
diff --git a/src/schema/schemas/primitives/online-eval-config.ts b/src/schema/schemas/primitives/online-eval-config.ts
index ea6ef95c..b90f6295 100644
--- a/src/schema/schemas/primitives/online-eval-config.ts
+++ b/src/schema/schemas/primitives/online-eval-config.ts
@@ -14,7 +14,7 @@ export const OnlineEvalConfigNameSchema = z
   );
 
 export const OnlineEvalConfigSchema = z.object({
-  type: z.literal('OnlineEvalConfig'),
+  type: z.literal('OnlineEvaluationConfig'),
   name: OnlineEvalConfigNameSchema,
   /** Agent names this online eval config monitors */
   agents: z.array(z.string().min(1)).min(1, 'At least one agent is required'),
@@ -22,8 +22,6 @@ export const OnlineEvalConfigSchema = z.object({
   evaluators: z.array(z.string().min(1)).min(1, 'At least one evaluator is required'),
   /** Sampling rate as a percentage (0.01 to 100) */
   samplingRate: z.number().min(0.01).max(100),
-  /** Whether to start the pipeline immediately on deploy */
-  enableOnCreate: z.boolean().default(true),
 });
 
 export type OnlineEvalConfig = z.infer<typeof OnlineEvalConfigSchema>;

From fe33a20bb421ec4bf0df230187dea561e37317db Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Wed, 11 Mar 2026 20:33:57 -0400
Subject: [PATCH 3/9] fix: eval data plane code clean up

---
 src/cli/commands/pause/command.tsx        | 23 +++++++++----
 src/cli/commands/resume/command.tsx       | 42 +----------------------
 src/cli/operations/eval/get-eval-run.ts   |  3 +-
 src/cli/operations/eval/list-eval-runs.ts |  3 +-
 src/cli/operations/eval/logs-eval.ts      |  7 ++--
 src/cli/operations/eval/run-eval.ts       | 18 ++++++++--
 6 files changed, 39 insertions(+), 57 deletions(-)

diff --git a/src/cli/commands/pause/command.tsx b/src/cli/commands/pause/command.tsx
index aaaaf76a..a70bb17e 100644
--- a/src/cli/commands/pause/command.tsx
+++ b/src/cli/commands/pause/command.tsx
@@ -6,24 +6,25 @@ import type { Command } from '@commander-js/extra-typings';
 import { Text, render } from 'ink';
 import React from 'react';
 
-export const registerPause = (program: Command) => {
-  const pauseCmd = program.command('pause').description(COMMAND_DESCRIPTIONS.pause);
+function registerOnlineEvalSubcommand(parent: Command, action: 'pause' | 'resume') {
+  const description = action === 'pause' ? 'Pause a deployed online eval config' : 'Resume a paused online eval config';
+  const pastTense = action === 'pause' ? 'Paused' : 'Resumed';
 
-  pauseCmd
+  parent
     .command('online-eval')
-    .description('Pause a deployed online eval config')
+    .description(description)
     .argument('<name>', 'Online eval config name')
     .option('--json', 'Output as JSON')
     .action(async (name: string, cliOptions: { json?: boolean }) => {
       requireProject();
 
       try {
-        const result = await handlePauseResume({ name, json: cliOptions.json }, 'pause');
+        const result = await handlePauseResume({ name, json: cliOptions.json }, action);
 
         if (cliOptions.json) {
           console.log(JSON.stringify(result));
         } else if (result.success) {
-          console.log(`Paused online eval config "${name}" (status: ${result.executionStatus})`);
+          console.log(`${pastTense} online eval config "${name}" (status: ${result.executionStatus})`);
         } else {
           render(<Text color="red">{result.error}</Text>);
         }
@@ -38,4 +39,14 @@ export const registerPause = (program: Command) => {
         process.exit(1);
       }
     });
+}
+
+export const registerPause = (program: Command) => {
+  const pauseCmd = program.command('pause').description(COMMAND_DESCRIPTIONS.pause);
+  registerOnlineEvalSubcommand(pauseCmd, 'pause');
+};
+
+export const registerResume = (program: Command) => {
+  const resumeCmd = program.command('resume').description(COMMAND_DESCRIPTIONS.resume);
+  registerOnlineEvalSubcommand(resumeCmd, 'resume');
 };
diff --git a/src/cli/commands/resume/command.tsx b/src/cli/commands/resume/command.tsx
index 49abcca0..15b214f0 100644
--- a/src/cli/commands/resume/command.tsx
+++ b/src/cli/commands/resume/command.tsx
@@ -1,41 +1 @@
-import { getErrorMessage } from '../../errors';
-import { handlePauseResume } from '../../operations/eval';
-import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
-import { requireProject } from '../../tui/guards';
-import type { Command } from '@commander-js/extra-typings';
-import { Text, render } from 'ink';
-import React from 'react';
-
-export const registerResume = (program: Command) => {
-  const resumeCmd = program.command('resume').description(COMMAND_DESCRIPTIONS.resume);
-
-  resumeCmd
-    .command('online-eval')
-    .description('Resume a paused online eval config')
-    .argument('<name>', 'Online eval config name')
-    .option('--json', 'Output as JSON')
-    .action(async (name: string, cliOptions: { json?: boolean }) => {
-      requireProject();
-
-      try {
-        const result = await handlePauseResume({ name, json: cliOptions.json }, 'resume');
-
-        if (cliOptions.json) {
-          console.log(JSON.stringify(result));
-        } else if (result.success) {
-          console.log(`Resumed online eval config "${name}" (status: ${result.executionStatus})`);
-        } else {
-          render(<Text color="red">{result.error}</Text>);
-        }
-
-        process.exit(result.success ? 0 : 1);
-      } catch (error) {
-        if (cliOptions.json) {
-          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-        } else {
-          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-        }
-        process.exit(1);
-      }
-    });
-};
+export { registerResume } from '../pause/command';
diff --git a/src/cli/operations/eval/get-eval-run.ts b/src/cli/operations/eval/get-eval-run.ts
index 724e0ede..6f592887 100644
--- a/src/cli/operations/eval/get-eval-run.ts
+++ b/src/cli/operations/eval/get-eval-run.ts
@@ -1,3 +1,4 @@
+import { getErrorMessage } from '../../errors';
 import { loadEvalRun } from './storage';
 import type { EvalRunResult, GetEvalRunOptions } from './types';
 
@@ -12,6 +13,6 @@ export function handleGetEvalRun(options: GetEvalRunOptions): GetEvalRunResult {
     const run = loadEvalRun(options.runId);
     return { success: true, run };
   } catch (err) {
-    return { success: false, error: (err as Error).message };
+    return { success: false, error: getErrorMessage(err) };
   }
 }
diff --git a/src/cli/operations/eval/list-eval-runs.ts b/src/cli/operations/eval/list-eval-runs.ts
index 53676a9e..66b0ed52 100644
--- a/src/cli/operations/eval/list-eval-runs.ts
+++ b/src/cli/operations/eval/list-eval-runs.ts
@@ -1,3 +1,4 @@
+import { getErrorMessage } from '../../errors';
 import { listEvalRuns } from './storage';
 import type { EvalRunResult, ListEvalRunsOptions } from './types';
 
@@ -21,6 +22,6 @@ export function handleListEvalRuns(options: ListEvalRunsOptions): ListEvalRunsRe
 
     return { success: true, runs };
   } catch (err) {
-    return { success: false, error: (err as Error).message };
+    return { success: false, error: getErrorMessage(err) };
   }
 }
diff --git a/src/cli/operations/eval/logs-eval.ts b/src/cli/operations/eval/logs-eval.ts
index 3446d5f9..0e60fa0e 100644
--- a/src/cli/operations/eval/logs-eval.ts
+++ b/src/cli/operations/eval/logs-eval.ts
@@ -1,5 +1,6 @@
 import { parseTimeString } from '../../../lib/utils';
 import { searchLogs, streamLogs } from '../../aws/cloudwatch';
+import type { DeployedProjectConfig } from '../resolve-agent';
 import { loadDeployedProjectConfig, resolveAgent } from '../resolve-agent';
 
 export interface LogsEvalOptions {
@@ -28,11 +29,7 @@ function formatLogLine(event: { timestamp: number; message: string }, json: bool
  * Resolve the online eval config log group names for a given agent.
  * Online eval results are written to: /aws/bedrock-agentcore/evaluations/results/{onlineEvalConfigId}
  */
-function resolveEvalLogGroups(
-  context: ReturnType<typeof loadDeployedProjectConfig> extends Promise<infer T> ? T : never,
-  agentName: string,
-  targetName: string
-): string[] {
+function resolveEvalLogGroups(context: DeployedProjectConfig, agentName: string, targetName: string): string[] {
   const { project, deployedState } = context;
   const targetResources = deployedState.targets[targetName]?.resources;
 
diff --git a/src/cli/operations/eval/run-eval.ts b/src/cli/operations/eval/run-eval.ts
index c440b7aa..d925df3f 100644
--- a/src/cli/operations/eval/run-eval.ts
+++ b/src/cli/operations/eval/run-eval.ts
@@ -8,6 +8,7 @@ import type { EvalEvaluatorResult, EvalRunResult, EvalSessionScore, RunEvalOptio
 import { CloudWatchLogsClient, GetQueryResultsCommand, StartQueryCommand } from '@aws-sdk/client-cloudwatch-logs';
 import type { ResultField } from '@aws-sdk/client-cloudwatch-logs';
 import type { DocumentType } from '@smithy/types';
+import { writeFileSync } from 'fs';
 
 const SPANS_LOG_GROUP = 'aws/spans';
 
@@ -162,6 +163,11 @@ function isRelevantForEval(doc: Record<string, unknown>): boolean {
   return false;
 }
 
+/** Sanitize a value for single-quoted CloudWatch Insights strings by removing quotes and backslashes. */
+function sanitizeQueryValue(value: string): string {
+  return value.replace(/['\\]/g, '');
+}
+
 interface SessionSpans {
   sessionId: string;
   spans: DocumentType[];
@@ -195,7 +201,7 @@ async function fetchSessionSpans(
     SPANS_LOG_GROUP,
     `fields @message, attributes.session.id as sessionId, traceId
      | parse resource.attributes.cloud.resource_id "runtime/*/" as parsedAgentId
-     | filter parsedAgentId = '${runtimeId}'
+     | filter parsedAgentId = '${sanitizeQueryValue(runtimeId)}'
      | sort startTimeUnixNano asc
      | limit 10000`,
     startTimeSec,
@@ -237,7 +243,7 @@ async function fetchSessionSpans(
 
   // 2. Query runtime logs from the agent's log group for the trace IDs found
   if (traceIds.size > 0) {
-    const traceFilter = [...traceIds].map(t => `'${t}'`).join(', ');
+    const traceFilter = [...traceIds].map(t => `'${sanitizeQueryValue(t)}'`).join(', ');
     let logRows: ResultField[][] = [];
     try {
       logRows = await executeQuery(
@@ -381,7 +387,13 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
   };
 
   // Save to disk
-  const filePath = options.output ?? saveEvalRun(run);
+  let filePath: string;
+  if (options.output) {
+    writeFileSync(options.output, JSON.stringify(run, null, 2));
+    filePath = options.output;
+  } else {
+    filePath = saveEvalRun(run);
+  }
 
   return { success: true, run, filePath };
 }

From 056ce7a4838b38fd71484ef429edf33cba66f6b6 Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Thu, 12 Mar 2026 00:38:28 -0400
Subject: [PATCH 4/9] tests: add test coverage for evals

---
 .../aws/__tests__/agentcore-control.test.ts   |  90 +++-
 .../aws/__tests__/agentcore-evaluate.test.ts  | 235 +++++++++
 .../eval/__tests__/get-eval-run.test.ts       |  62 +++
 .../eval/__tests__/list-eval-runs.test.ts     |  99 ++++
 .../eval/__tests__/logs-eval.test.ts          | 209 ++++++++
 .../eval/__tests__/pause-resume.test.ts       | 122 +++++
 .../eval/__tests__/run-eval.test.ts           | 482 ++++++++++++++++++
 .../operations/eval/__tests__/storage.test.ts | 149 ++++++
 .../__tests__/EvaluatorPrimitive.test.ts      | 233 +++++++++
 .../OnlineEvalConfigPrimitive.test.ts         | 211 ++++++++
 .../screens/evaluator/__tests__/types.test.ts |  99 ++++
 .../primitives/__tests__/evaluator.test.ts    | 158 ++++++
 .../__tests__/online-eval-config.test.ts      |  90 ++++
 13 files changed, 2238 insertions(+), 1 deletion(-)
 create mode 100644 src/cli/aws/__tests__/agentcore-evaluate.test.ts
 create mode 100644 src/cli/operations/eval/__tests__/get-eval-run.test.ts
 create mode 100644 src/cli/operations/eval/__tests__/list-eval-runs.test.ts
 create mode 100644 src/cli/operations/eval/__tests__/logs-eval.test.ts
 create mode 100644 src/cli/operations/eval/__tests__/pause-resume.test.ts
 create mode 100644 src/cli/operations/eval/__tests__/run-eval.test.ts
 create mode 100644 src/cli/operations/eval/__tests__/storage.test.ts
 create mode 100644 src/cli/primitives/__tests__/EvaluatorPrimitive.test.ts
 create mode 100644 src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts
 create mode 100644 src/cli/tui/screens/evaluator/__tests__/types.test.ts
 create mode 100644 src/schema/schemas/primitives/__tests__/evaluator.test.ts
 create mode 100644 src/schema/schemas/primitives/__tests__/online-eval-config.test.ts

diff --git a/src/cli/aws/__tests__/agentcore-control.test.ts b/src/cli/aws/__tests__/agentcore-control.test.ts
index 9ec6bae3..b4d629a6 100644
--- a/src/cli/aws/__tests__/agentcore-control.test.ts
+++ b/src/cli/aws/__tests__/agentcore-control.test.ts
@@ -1,4 +1,4 @@
-import { getAgentRuntimeStatus } from '../agentcore-control.js';
+import { getAgentRuntimeStatus, updateOnlineEvalExecutionStatus } from '../agentcore-control.js';
 import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 const { mockSend } = vi.hoisted(() => ({
@@ -12,6 +12,9 @@ vi.mock('@aws-sdk/client-bedrock-agentcore-control', () => ({
   GetAgentRuntimeCommand: class {
     constructor(public input: unknown) {}
   },
+  UpdateOnlineEvaluationConfigCommand: class {
+    constructor(public input: unknown) {}
+  },
 }));
 
 vi.mock('../account', () => ({
@@ -56,3 +59,88 @@ describe('getAgentRuntimeStatus', () => {
     );
   });
 });
+
+describe('updateOnlineEvalExecutionStatus', () => {
+  beforeEach(() => {
+    vi.clearAllMocks(); // reset recorded mock calls between tests
+  });
+
+  it('sends DISABLED to pause and returns result', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'cfg-123',
+      executionStatus: 'DISABLED',
+      status: 'ACTIVE',
+    });
+
+    const result = await updateOnlineEvalExecutionStatus({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-123',
+      executionStatus: 'DISABLED',
+    });
+
+    expect(result.configId).toBe('cfg-123'); // response id is surfaced as `configId`
+    expect(result.executionStatus).toBe('DISABLED');
+    expect(result.status).toBe('ACTIVE');
+  });
+
+  it('sends ENABLED to resume', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'cfg-456',
+      executionStatus: 'ENABLED',
+      status: 'ACTIVE',
+    });
+
+    const result = await updateOnlineEvalExecutionStatus({
+      region: 'us-west-2',
+      onlineEvaluationConfigId: 'cfg-456',
+      executionStatus: 'ENABLED',
+    });
+
+    expect(result.configId).toBe('cfg-456');
+    expect(result.executionStatus).toBe('ENABLED');
+  });
+
+  it('passes correct params in command', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'cfg-789',
+      executionStatus: 'DISABLED',
+      status: 'ACTIVE',
+    });
+
+    await updateOnlineEvalExecutionStatus({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-789',
+      executionStatus: 'DISABLED',
+    });
+
+    const command = mockSend.mock.calls[0]![0]; // first argument of the first send() call
+    expect(command.input.onlineEvaluationConfigId).toBe('cfg-789');
+    expect(command.input.executionStatus).toBe('DISABLED');
+  });
+
+  it('falls back to input values when response fields are undefined', async () => {
+    mockSend.mockResolvedValue({}); // response carrying none of the expected fields
+
+    const result = await updateOnlineEvalExecutionStatus({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-fallback',
+      executionStatus: 'ENABLED',
+    });
+
+    expect(result.configId).toBe('cfg-fallback');
+    expect(result.executionStatus).toBe('ENABLED');
+    expect(result.status).toBe('UNKNOWN'); // placeholder expected when the response has no status
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('Throttling'));
+
+    await expect(
+      updateOnlineEvalExecutionStatus({
+        region: 'us-east-1',
+        onlineEvaluationConfigId: 'cfg-err',
+        executionStatus: 'DISABLED',
+      })
+    ).rejects.toThrow('Throttling');
+  });
+});
diff --git a/src/cli/aws/__tests__/agentcore-evaluate.test.ts b/src/cli/aws/__tests__/agentcore-evaluate.test.ts
new file mode 100644
index 00000000..30eafffd
--- /dev/null
+++ b/src/cli/aws/__tests__/agentcore-evaluate.test.ts
@@ -0,0 +1,235 @@
+import { evaluate } from '../agentcore.js';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { mockSend } = vi.hoisted(() => ({
+  mockSend: vi.fn(),
+}));
+
+vi.mock('@aws-sdk/client-bedrock-agentcore', () => ({
+  BedrockAgentCoreClient: class {
+    send = mockSend;
+  },
+  EvaluateCommand: class {
+    constructor(public input: unknown) {}
+  },
+}));
+
+vi.mock('../account', () => ({
+  getCredentialProvider: vi.fn().mockReturnValue({}),
+}));
+
+describe('evaluate', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('sends evaluatorId and sessionSpans in the command', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 4.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [{ traceId: 't1', spanId: 's1' }],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluatorId).toBe('eval-123');
+    expect(command.input.evaluationInput.sessionSpans).toEqual([{ traceId: 't1', spanId: 's1' }]);
+  });
+
+  it('includes spanIds target when targetSpanIds is provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+      targetSpanIds: ['span-1', 'span-2'],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toEqual({ spanIds: ['span-1', 'span-2'] });
+  });
+
+  it('includes traceIds target when targetTraceIds is provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+      targetTraceIds: ['trace-1'],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toEqual({ traceIds: ['trace-1'] });
+  });
+
+  it('prefers spanIds over traceIds when both are provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+      targetSpanIds: ['span-1'],
+      targetTraceIds: ['trace-1'],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toEqual({ spanIds: ['span-1'] });
+  });
+
+  it('omits evaluationTarget when neither targetSpanIds nor targetTraceIds provided', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [{ value: 3.0 }],
+    });
+
+    await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluationTarget).toBeUndefined();
+  });
+
+  it('throws when evaluationResults is undefined', async () => {
+    mockSend.mockResolvedValue({ evaluationResults: undefined });
+
+    await expect(evaluate({ region: 'us-east-1', evaluatorId: 'eval-123', sessionSpans: [] })).rejects.toThrow(
+      'No evaluation results returned'
+    );
+  });
+
+  it('maps response with spanContext correctly', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          evaluatorArn: 'arn:aws:evaluator/eval-123',
+          evaluatorId: 'eval-123',
+          evaluatorName: 'MyEval',
+          explanation: 'Good quality',
+          value: 4.5,
+          label: 'Excellent',
+          errorMessage: undefined,
+          errorCode: undefined,
+          context: {
+            spanContext: {
+              sessionId: 'sess-1',
+              traceId: 'trace-1',
+              spanId: 'span-1',
+            },
+          },
+          tokenUsage: {
+            inputTokens: 100,
+            outputTokens: 50,
+            totalTokens: 150,
+          },
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    expect(result.evaluationResults).toHaveLength(1);
+    const r = result.evaluationResults[0]!;
+    expect(r.evaluatorArn).toBe('arn:aws:evaluator/eval-123');
+    expect(r.value).toBe(4.5);
+    expect(r.explanation).toBe('Good quality');
+    expect(r.context).toEqual({ sessionId: 'sess-1', traceId: 'trace-1', spanId: 'span-1' });
+    expect(r.tokenUsage).toEqual({ inputTokens: 100, outputTokens: 50, totalTokens: 150 });
+  });
+
+  it('handles response without spanContext', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          value: 3.0,
+          context: undefined,
+          tokenUsage: undefined,
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    const r = result.evaluationResults[0]!;
+    expect(r.context).toBeUndefined();
+    expect(r.tokenUsage).toBeUndefined();
+  });
+
+  it('defaults token usage values to 0 when partially undefined', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          value: 3.0,
+          tokenUsage: {
+            inputTokens: undefined,
+            outputTokens: 25,
+            totalTokens: undefined,
+          },
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    expect(result.evaluationResults[0]!.tokenUsage).toEqual({
+      inputTokens: 0,
+      outputTokens: 25,
+      totalTokens: 0,
+    });
+  });
+
+  it('maps error results correctly', async () => {
+    mockSend.mockResolvedValue({
+      evaluationResults: [
+        {
+          value: 0,
+          errorMessage: 'Prompt template missing required field',
+          errorCode: 'TEMPLATE_ERROR',
+        },
+      ],
+    });
+
+    const result = await evaluate({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+      sessionSpans: [],
+    });
+
+    const r = result.evaluationResults[0]!;
+    expect(r.errorMessage).toBe('Prompt template missing required field');
+    expect(r.errorCode).toBe('TEMPLATE_ERROR');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('AccessDeniedException'));
+
+    await expect(evaluate({ region: 'us-east-1', evaluatorId: 'eval-123', sessionSpans: [] })).rejects.toThrow(
+      'AccessDeniedException'
+    );
+  });
+});
diff --git a/src/cli/operations/eval/__tests__/get-eval-run.test.ts b/src/cli/operations/eval/__tests__/get-eval-run.test.ts
new file mode 100644
index 00000000..d019835b
--- /dev/null
+++ b/src/cli/operations/eval/__tests__/get-eval-run.test.ts
@@ -0,0 +1,62 @@
+import { handleGetEvalRun } from '../get-eval-run.js';
+import type { EvalRunResult } from '../types.js';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+const mockLoadEvalRun = vi.fn();
+
+vi.mock('../storage', () => ({
+  loadEvalRun: (...args: unknown[]) => mockLoadEvalRun(...args),
+}));
+
+const sampleRun: EvalRunResult = {
+  runId: 'run_abc',
+  timestamp: '2025-01-15T10:00:00.000Z',
+  agent: 'test-agent',
+  evaluators: ['Builtin.GoalSuccessRate'],
+  lookbackDays: 7,
+  sessionCount: 5,
+  results: [
+    {
+      evaluator: 'Builtin.GoalSuccessRate',
+      aggregateScore: 0.9,
+      sessionScores: [{ sessionId: 's1', value: 0.9 }],
+    },
+  ],
+};
+
+describe('handleGetEvalRun', () => {
+  afterEach(() => vi.clearAllMocks());
+
+  it('returns the run on success', () => {
+    mockLoadEvalRun.mockReturnValue(sampleRun);
+
+    const result = handleGetEvalRun({ runId: 'run_abc' });
+
+    expect(result.success).toBe(true);
+    expect(result.run).toEqual(sampleRun);
+    expect(mockLoadEvalRun).toHaveBeenCalledWith('run_abc');
+  });
+
+  it('returns error when run is not found', () => {
+    mockLoadEvalRun.mockImplementation(() => {
+      throw new Error('Eval run "run_missing" not found');
+    });
+
+    const result = handleGetEvalRun({ runId: 'run_missing' });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('run_missing');
+    expect(result.run).toBeUndefined();
+  });
+
+  it('handles non-Error thrown values via getErrorMessage', () => {
+    mockLoadEvalRun.mockImplementation(() => {
+      // Throw a bare string (not an Error) so the non-Error branch is actually exercised.
+      throw 'string error'; // eslint-disable-line @typescript-eslint/only-throw-error
+    });
+    const result = handleGetEvalRun({ runId: 'run_bad' });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('string error');
+  });
+});
diff --git a/src/cli/operations/eval/__tests__/list-eval-runs.test.ts b/src/cli/operations/eval/__tests__/list-eval-runs.test.ts
new file mode 100644
index 00000000..52c68ee7
--- /dev/null
+++ b/src/cli/operations/eval/__tests__/list-eval-runs.test.ts
@@ -0,0 +1,99 @@
+import { handleListEvalRuns } from '../list-eval-runs.js';
+import type { EvalRunResult } from '../types.js';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+const mockListEvalRuns = vi.fn();
+
+vi.mock('../storage', () => ({
+  listEvalRuns: () => mockListEvalRuns(),
+}));
+
+/**
+ * Test fixture: builds a minimal EvalRunResult.
+ * Only `agent` and `runId` vary; every other field is a fixed placeholder
+ * (one builtin evaluator, seven-day lookback, three sessions, no results).
+ */
+function makeRun(agent: string, runId: string): EvalRunResult {
+  const timestamp = '2025-01-15T10:00:00.000Z';
+  const evaluators = ['Builtin.GoalSuccessRate'];
+
+  return { runId, timestamp, agent, evaluators, lookbackDays: 7, sessionCount: 3, results: [] };
+}
+
+describe('handleListEvalRuns', () => {
+  afterEach(() => vi.clearAllMocks());
+
+  it('returns all runs when no filters specified', () => {
+    const runs = [makeRun('agent-a', 'run_1'), makeRun('agent-b', 'run_2')];
+    mockListEvalRuns.mockReturnValue(runs);
+
+    const result = handleListEvalRuns({});
+
+    expect(result.success).toBe(true);
+    expect(result.runs).toHaveLength(2);
+  });
+
+  it('filters by agent name', () => {
+    const runs = [makeRun('agent-a', 'run_1'), makeRun('agent-b', 'run_2'), makeRun('agent-a', 'run_3')];
+    mockListEvalRuns.mockReturnValue(runs);
+
+    const result = handleListEvalRuns({ agent: 'agent-a' });
+
+    expect(result.success).toBe(true);
+    expect(result.runs).toHaveLength(2);
+    expect(result.runs!.every(r => r.agent === 'agent-a')).toBe(true);
+  });
+
+  it('limits the number of results', () => {
+    const runs = [makeRun('a', 'run_1'), makeRun('a', 'run_2'), makeRun('a', 'run_3')];
+    mockListEvalRuns.mockReturnValue(runs);
+
+    const result = handleListEvalRuns({ limit: 2 });
+
+    expect(result.success).toBe(true);
+    expect(result.runs).toHaveLength(2);
+  });
+
+  it('applies agent filter before limit', () => {
+    const runs = [makeRun('a', 'run_1'), makeRun('b', 'run_2'), makeRun('a', 'run_3'), makeRun('a', 'run_4')];
+    mockListEvalRuns.mockReturnValue(runs);
+
+    const result = handleListEvalRuns({ agent: 'a', limit: 2 });
+
+    expect(result.runs).toHaveLength(2);
+    expect(result.runs![0]!.runId).toBe('run_1');
+    expect(result.runs![1]!.runId).toBe('run_3');
+  });
+
+  it('returns empty array when no runs exist', () => {
+    mockListEvalRuns.mockReturnValue([]);
+
+    const result = handleListEvalRuns({});
+
+    expect(result.success).toBe(true);
+    expect(result.runs).toEqual([]);
+  });
+
+  it('returns error when storage throws', () => {
+    mockListEvalRuns.mockImplementation(() => {
+      throw new Error('disk error');
+    });
+
+    const result = handleListEvalRuns({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('disk error');
+    expect(result.runs).toBeUndefined();
+  });
+
+  it('handles non-Error thrown values', () => {
+    mockListEvalRuns.mockImplementation(() => {
+      // Throw a bare number (not an Error) so the non-Error branch is actually exercised.
+      throw 42; // eslint-disable-line @typescript-eslint/only-throw-error
+    });
+    const result = handleListEvalRuns({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('42');
+  });
+});
diff --git a/src/cli/operations/eval/__tests__/logs-eval.test.ts b/src/cli/operations/eval/__tests__/logs-eval.test.ts
new file mode 100644
index 00000000..70c88252
--- /dev/null
+++ b/src/cli/operations/eval/__tests__/logs-eval.test.ts
@@ -0,0 +1,209 @@
+import { handleLogsEval } from '../logs-eval.js';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+const mockLoadDeployedProjectConfig = vi.fn();
+const mockResolveAgent = vi.fn();
+const mockSearchLogs = vi.fn();
+const mockStreamLogs = vi.fn();
+
+vi.mock('../../resolve-agent', () => ({
+  loadDeployedProjectConfig: () => mockLoadDeployedProjectConfig(),
+  resolveAgent: (...args: unknown[]) => mockResolveAgent(...args),
+}));
+
+vi.mock('../../../aws/cloudwatch', () => ({
+  searchLogs: (...args: unknown[]) => mockSearchLogs(...args),
+  streamLogs: (...args: unknown[]) => mockStreamLogs(...args),
+}));
+
+vi.mock('../../../../lib/utils', () => ({
+  parseTimeString: (s: string) => (s === '1h' ? Date.now() - 3_600_000 : Date.now()),
+}));
+
+function makeContext({
+  agentName = 'my-agent', // listed in project.agents and keyed under deployed resources.agents
+  onlineEvalConfigs = [{ name: 'eval-config', agents: ['my-agent'] }] as { name: string; agents: string[] }[],
+  deployedConfigId = 'cfg-123', // pass '' to leave the deployed onlineEvalConfigs map empty
+} = {}) {
+  return {
+    project: {
+      agents: [{ name: agentName }],
+      onlineEvalConfigs,
+    },
+    awsTargets: [{ name: 'dev', region: 'us-east-1', account: '111222333444' }],
+    deployedState: {
+      targets: {
+        dev: {
+          resources: {
+            agents: {
+              [agentName]: {
+                runtimeId: 'rt-123',
+                runtimeArn: `arn:aws:bedrock:us-east-1:111222333444:agent-runtime/rt-123`,
+                roleArn: 'arn:aws:iam::111222333444:role/test',
+              },
+            },
+            onlineEvalConfigs: deployedConfigId
+              ? {
+                  'eval-config': {
+                    onlineEvaluationConfigId: deployedConfigId,
+                    onlineEvaluationConfigArn: `arn:aws:bedrock:us-east-1:111222333444:online-evaluation-config/${deployedConfigId}`,
+                  },
+                }
+              : {}, // falsy deployedConfigId: nothing is recorded as deployed
+          },
+        },
+      },
+    },
+  };
+}
+
+/**
+ * Test fixture: a successful agent-resolution result for `agentName`.
+ * The target name, region, account ID and runtime ID are fixed placeholders
+ * (target `dev` in `us-east-1`, runtime `rt-123`).
+ *
+ * @param agentName - Name reported in the resolved agent; defaults to `my-agent`.
+ */
+function makeResolvedAgent(agentName = 'my-agent') {
+  const agent = { agentName, targetName: 'dev', region: 'us-east-1', accountId: '111222333444', runtimeId: 'rt-123' };
+
+  return { success: true as const, agent };
+}
+
+describe('handleLogsEval', () => {
+  afterEach(() => vi.clearAllMocks());
+
+  it('returns error when agent resolution fails', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue({});
+    mockResolveAgent.mockReturnValue({ success: false, error: 'No agents defined' });
+
+    const result = await handleLogsEval({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('No agents defined');
+  });
+
+  it('returns error when no online eval configs exist for the agent', async () => {
+    const ctx = makeContext({ onlineEvalConfigs: [] });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    const result = await handleLogsEval({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('No deployed online eval configs found');
+  });
+
+  it('returns error when online eval configs exist but none are deployed', async () => {
+    const ctx = makeContext({ deployedConfigId: '' });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    const result = await handleLogsEval({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('No deployed online eval configs found');
+  });
+
+  it('searches logs with time range when --since is specified', async () => {
+    const ctx = makeContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    async function* emptyGenerator() {
+      // no events
+    }
+    mockSearchLogs.mockReturnValue(emptyGenerator());
+
+    const result = await handleLogsEval({ since: '1h' });
+
+    expect(result.success).toBe(true);
+    expect(mockSearchLogs).toHaveBeenCalledWith(
+      expect.objectContaining({
+        logGroupName: '/aws/bedrock-agentcore/evaluations/results/cfg-123',
+        region: 'us-east-1',
+      })
+    );
+    expect(mockStreamLogs).not.toHaveBeenCalled();
+  });
+
+  it('streams logs by default when no time range is specified', async () => {
+    const ctx = makeContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    async function* emptyGenerator() {
+      // no events
+    }
+    mockStreamLogs.mockReturnValue(emptyGenerator());
+
+    // eslint-disable-next-line @typescript-eslint/no-empty-function
+    const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
+
+    const result = await handleLogsEval({});
+
+    expect(result.success).toBe(true);
+    expect(mockStreamLogs).toHaveBeenCalledWith(
+      expect.objectContaining({
+        logGroupName: '/aws/bedrock-agentcore/evaluations/results/cfg-123',
+        region: 'us-east-1',
+      })
+    );
+    expect(mockSearchLogs).not.toHaveBeenCalled();
+
+    consoleSpy.mockRestore();
+  });
+
+  it('skips ResourceNotFoundException during search', async () => {
+    const ctx = makeContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    // eslint-disable-next-line require-yield, @typescript-eslint/require-await
+    async function* throwNotFound(): AsyncGenerator<never> {
+      const err = new Error('Log group not found');
+      (err as Error & { name: string }).name = 'ResourceNotFoundException';
+      throw err;
+    }
+    mockSearchLogs.mockReturnValue(throwNotFound());
+
+    const result = await handleLogsEval({ since: '1h' });
+
+    expect(result.success).toBe(true);
+  });
+
+  it('resolves correct log group path from deployed config', async () => {
+    const ctx = makeContext({ deployedConfigId: 'my-custom-config-id' });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    async function* emptyGenerator() {
+      // no events
+    }
+    mockSearchLogs.mockReturnValue(emptyGenerator());
+
+    await handleLogsEval({ since: '1h' });
+
+    expect(mockSearchLogs).toHaveBeenCalledWith(
+      expect.objectContaining({
+        logGroupName: '/aws/bedrock-agentcore/evaluations/results/my-custom-config-id',
+      })
+    );
+  });
+
+  it('only resolves configs that match the target agent', async () => {
+    const ctx = makeContext({
+      agentName: 'my-agent',
+      onlineEvalConfigs: [
+        { name: 'eval-config', agents: ['other-agent'] }, // doesn't match
+      ],
+    });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    const result = await handleLogsEval({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('No deployed online eval configs found');
+  });
+});
diff --git a/src/cli/operations/eval/__tests__/pause-resume.test.ts b/src/cli/operations/eval/__tests__/pause-resume.test.ts
new file mode 100644
index 00000000..085f7326
--- /dev/null
+++ b/src/cli/operations/eval/__tests__/pause-resume.test.ts
@@ -0,0 +1,122 @@
+import { handlePauseResume } from '../pause-resume.js';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+const mockLoadDeployedProjectConfig = vi.fn();
+const mockUpdateOnlineEvalExecutionStatus = vi.fn();
+
+vi.mock('../../resolve-agent', () => ({
+  loadDeployedProjectConfig: () => mockLoadDeployedProjectConfig(),
+}));
+
+vi.mock('../../../aws/agentcore-control', () => ({
+  updateOnlineEvalExecutionStatus: (...args: unknown[]) => mockUpdateOnlineEvalExecutionStatus(...args),
+}));
+
+/**
+ * Test fixture: a deployed-project context with one AWS target and one
+ * deployed online eval config (`configName` -> `configId`) under that target.
+ *
+ * The account ID is a fixed placeholder. `awsTargets` and `deployedState`
+ * share `targetName`, so a test can alter one side to simulate a mismatch.
+ */
+function makeContext(configName: string, configId: string, targetName = 'dev', region = 'us-east-1') {
+  const deployedConfig = {
+    onlineEvaluationConfigId: configId,
+    onlineEvaluationConfigArn: `arn:aws:bedrock:${region}:123456789012:online-evaluation-config/${configId}`,
+  };
+  const resources = { onlineEvalConfigs: { [configName]: deployedConfig } };
+
+  return {
+    project: {},
+    awsTargets: [{ name: targetName, region, account: '123456789012' }],
+    deployedState: { targets: { [targetName]: { resources } } },
+  };
+}
+
+describe('handlePauseResume', () => {
+  afterEach(() => vi.clearAllMocks());
+
+  it('pauses an online eval config', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue(makeContext('my-config', 'cfg-123'));
+    mockUpdateOnlineEvalExecutionStatus.mockResolvedValue({
+      configId: 'cfg-123',
+      executionStatus: 'DISABLED',
+      status: 'ACTIVE',
+    });
+
+    const result = await handlePauseResume({ name: 'my-config' }, 'pause');
+
+    expect(result.success).toBe(true);
+    expect(result.executionStatus).toBe('DISABLED');
+    expect(mockUpdateOnlineEvalExecutionStatus).toHaveBeenCalledWith({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-123',
+      executionStatus: 'DISABLED',
+    });
+  });
+
+  it('resumes an online eval config', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue(makeContext('my-config', 'cfg-123'));
+    mockUpdateOnlineEvalExecutionStatus.mockResolvedValue({
+      configId: 'cfg-123',
+      executionStatus: 'ENABLED',
+      status: 'ACTIVE',
+    });
+
+    const result = await handlePauseResume({ name: 'my-config' }, 'resume');
+
+    expect(result.success).toBe(true);
+    expect(result.executionStatus).toBe('ENABLED');
+    expect(mockUpdateOnlineEvalExecutionStatus).toHaveBeenCalledWith({
+      region: 'us-east-1',
+      onlineEvaluationConfigId: 'cfg-123',
+      executionStatus: 'ENABLED',
+    });
+  });
+
+  it('returns error when no deployed targets exist', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue({
+      project: {},
+      awsTargets: [],
+      deployedState: { targets: {} },
+    });
+
+    const result = await handlePauseResume({ name: 'my-config' }, 'pause');
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('No deployed targets found');
+  });
+
+  it('returns error when config name is not found in deployed state', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue(makeContext('other-config', 'cfg-999'));
+
+    const result = await handlePauseResume({ name: 'missing-config' }, 'pause');
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('missing-config');
+    expect(result.error).toContain('not found');
+  });
+
+  it('returns error when target config is missing from aws-targets', async () => {
+    const context = makeContext('my-config', 'cfg-123');
+    // Remove the target from awsTargets but keep it in deployedState
+    context.awsTargets = [];
+    mockLoadDeployedProjectConfig.mockResolvedValue(context);
+
+    const result = await handlePauseResume({ name: 'my-config' }, 'pause');
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Target config');
+    expect(result.error).toContain('not found');
+  });
+
+  it('returns error when the SDK call fails', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue(makeContext('my-config', 'cfg-123'));
+    mockUpdateOnlineEvalExecutionStatus.mockRejectedValue(new Error('Service unavailable'));
+
+    const result = await handlePauseResume({ name: 'my-config' }, 'pause');
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('Service unavailable');
+  });
+});
diff --git a/src/cli/operations/eval/__tests__/run-eval.test.ts b/src/cli/operations/eval/__tests__/run-eval.test.ts
new file mode 100644
index 00000000..9b19ea0e
--- /dev/null
+++ b/src/cli/operations/eval/__tests__/run-eval.test.ts
@@ -0,0 +1,482 @@
+import { handleRunEval } from '../run-eval.js';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+// ─── Mocks ────────────────────────────────────────────────────────────────────
+
+const mockResolveAgent = vi.fn();
+const mockLoadDeployedProjectConfig = vi.fn();
+const mockEvaluate = vi.fn();
+const mockSaveEvalRun = vi.fn();
+const mockGenerateRunId = vi.fn();
+const mockSend = vi.fn();
+const mockGetCredentialProvider = vi.fn().mockReturnValue({});
+const mockWriteFileSync = vi.fn();
+
+vi.mock('../../resolve-agent', () => ({
+  loadDeployedProjectConfig: () => mockLoadDeployedProjectConfig(),
+  resolveAgent: (...args: unknown[]) => mockResolveAgent(...args),
+}));
+
+vi.mock('../../../aws/agentcore', () => ({
+  evaluate: (...args: unknown[]) => mockEvaluate(...args),
+}));
+
+vi.mock('../../../aws', () => ({
+  getCredentialProvider: () => mockGetCredentialProvider(),
+}));
+
+vi.mock('../storage', () => ({
+  generateRunId: () => mockGenerateRunId(),
+  saveEvalRun: (...args: unknown[]) => mockSaveEvalRun(...args),
+}));
+
+vi.mock('fs', async importOriginal => {
+  const original = await importOriginal<typeof import('fs')>();
+  return {
+    ...original,
+    writeFileSync: (...args: unknown[]) => mockWriteFileSync(...args),
+  };
+});
+
+vi.mock('@aws-sdk/client-cloudwatch-logs', () => ({
+  CloudWatchLogsClient: class {
+    send = mockSend;
+  },
+  StartQueryCommand: class {
+    constructor(public input: unknown) {}
+  },
+  GetQueryResultsCommand: class {
+    constructor(public input: unknown) {}
+  },
+}));
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
+function makeDeployedContext({
+  agentName = 'my-agent', // listed in project.agents and keyed under deployed resources.agents
+  runtimeId = 'rt-123', // used for runtimeId and as the tail of runtimeArn
+  evaluators = {} as Record<string, { evaluatorId: string }>, // deployed evaluators, keyed by evaluator name
+} = {}) {
+  return {
+    project: {
+      agents: [{ name: agentName }],
+      onlineEvalConfigs: [], // this fixture defines no online eval configs
+    },
+    awsTargets: [{ name: 'dev', region: 'us-east-1', account: '111222333444' }],
+    deployedState: {
+      targets: {
+        dev: {
+          resources: {
+            agents: {
+              [agentName]: {
+                runtimeId,
+                runtimeArn: `arn:aws:bedrock:us-east-1:111222333444:agent-runtime/${runtimeId}`,
+                roleArn: 'arn:aws:iam::111222333444:role/test',
+              },
+            },
+            evaluators,
+          },
+        },
+      },
+    },
+  };
+}
+
+/**
+ * Test fixture: one query-result row of `{ field, value }` cells; `@message` holds the JSON-encoded span.
+ */
+function makeOtelSpanRow(sessionId: string, traceId: string, spanBody: Record<string, unknown> = {}) {
+  const span = { scope: { name: 'strands.telemetry.tracer' }, body: spanBody, traceId };
+  const cells: [string, string][] = [
+    ['@message', JSON.stringify(span)],
+    ['sessionId', sessionId],
+    ['traceId', traceId],
+  ];
+  return cells.map(([field, value]) => ({ field, value }));
+}
+
+/**
+ * Stubs `mockSend` to emulate CloudWatch Logs query commands for a test.
+ * Commands whose input has a `queryString` (StartQuery) get sequential IDs `q-1`, `q-2`, ...
+ * Every other command (GetQueryResults) resolves as `Complete` at once: with `spanRows`
+ * while exactly one query has been started, and with `runtimeLogRows` otherwise.
+ */
+function setupCloudWatchToReturn(spanRows: unknown[][], runtimeLogRows: unknown[][] = []) {
+  let startedQueries = 0;
+  const complete = (results: unknown[][]) => Promise.resolve({ status: 'Complete', results });
+
+  mockSend.mockImplementation((cmd: { input: unknown }) => {
+    if ('queryString' in (cmd.input as Record<string, unknown>)) {
+      startedQueries += 1;
+      return Promise.resolve({ queryId: `q-${startedQueries}` });
+    }
+    return complete(startedQueries === 1 ? spanRows : runtimeLogRows);
+  });
+}
+
+describe('handleRunEval', () => {
+  beforeEach(() => {
+    mockGenerateRunId.mockReturnValue('run_test-123');
+    mockSaveEvalRun.mockReturnValue('/tmp/eval-results/run_test-123.json');
+  });
+
+  afterEach(() => vi.clearAllMocks());
+
+  // ─── Context resolution ───────────────────────────────────────────────────
+
+  it('returns error when agent resolution fails', async () => {
+    mockLoadDeployedProjectConfig.mockResolvedValue({});
+    mockResolveAgent.mockReturnValue({ success: false, error: 'No agents defined' });
+
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBe('No agents defined');
+  });
+
+  it('returns error when a custom evaluator is not found in deployed state', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const result = await handleRunEval({ evaluator: ['MissingEval'], days: 7 });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('MissingEval');
+    expect(result.error).toContain('not found in deployed state');
+  });
+
+  it('resolves builtin evaluators without deployed state lookup', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    // No spans found — will return before calling evaluate
+    setupCloudWatchToReturn([]);
+
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    // Fails because no spans, but NOT because evaluator wasn't found
+    expect(result.error).toContain('No session spans found');
+  });
+
+  it('resolves custom evaluator name to deployed evaluator ID', async () => {
+    const ctx = makeDeployedContext({
+      evaluators: { MyCustomEval: { evaluatorId: 'eval-custom-id' } },
+    });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const spanRows = [makeOtelSpanRow('session-1', 'trace-1')];
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 'session-1' } } }],
+    });
+
+    const result = await handleRunEval({ evaluator: ['MyCustomEval'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(mockEvaluate).toHaveBeenCalledWith(expect.objectContaining({ evaluatorId: 'eval-custom-id' }));
+  });
+
+  it('extracts evaluator ID from ARN when --evaluator-arn is passed', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const spanRows = [makeOtelSpanRow('session-1', 'trace-1')];
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 'session-1' } } }],
+    });
+
+    const result = await handleRunEval({
+      evaluator: [],
+      evaluatorArn: ['arn:aws:bedrock:us-east-1:123:evaluator/my-eval-id'],
+      days: 7,
+    });
+
+    expect(result.success).toBe(true);
+    expect(mockEvaluate).toHaveBeenCalledWith(expect.objectContaining({ evaluatorId: 'my-eval-id' }));
+  });
+
+  // ─── No sessions ──────────────────────────────────────────────────────────
+
+  it('returns error when no session spans are found', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([]);
+
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('No session spans found');
+    expect(result.error).toContain('my-agent');
+  });
+
+  // ─── Successful evaluation ────────────────────────────────────────────────
+
+  it('runs evaluation across sessions and computes aggregate score', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const spanRows = [makeOtelSpanRow('session-1', 'trace-1'), makeOtelSpanRow('session-2', 'trace-2')];
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate
+      .mockResolvedValueOnce({
+        evaluationResults: [
+          {
+            value: 4.0,
+            context: { spanContext: { sessionId: 'session-1', traceId: 'trace-1' } },
+            tokenUsage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
+          },
+        ],
+      })
+      .mockResolvedValueOnce({
+        evaluationResults: [
+          {
+            value: 2.0,
+            context: { spanContext: { sessionId: 'session-2', traceId: 'trace-2' } },
+            tokenUsage: { inputTokens: 80, outputTokens: 40, totalTokens: 120 },
+          },
+        ],
+      });
+
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(result.run).toBeDefined();
+    expect(result.run!.sessionCount).toBe(2);
+    expect(result.run!.results).toHaveLength(1);
+
+    const evalResult = result.run!.results[0]!;
+    expect(evalResult.aggregateScore).toBe(3.0); // (4 + 2) / 2
+    expect(evalResult.sessionScores).toHaveLength(2);
+    expect(evalResult.tokenUsage).toEqual({ inputTokens: 180, outputTokens: 90, totalTokens: 270 });
+  });
+
+  it('excludes errored sessions from aggregate score', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const spanRows = [makeOtelSpanRow('session-1', 'trace-1')];
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [
+        { value: 5.0, context: { spanContext: { sessionId: 's1' } } },
+        { value: 0, errorMessage: 'something failed', context: { spanContext: { sessionId: 's2' } } },
+      ],
+    });
+
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    expect(result.success).toBe(true);
+    const evalResult = result.run!.results[0]!;
+    // Only the non-errored session (value 5.0) should be in the aggregate
+    expect(evalResult.aggregateScore).toBe(5.0);
+    expect(evalResult.sessionScores).toHaveLength(2);
+  });
+
+  // ─── Output handling ──────────────────────────────────────────────────────
+
+  it('saves to default location when no output option', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(mockSaveEvalRun).toHaveBeenCalled();
+    expect(mockWriteFileSync).not.toHaveBeenCalled();
+    expect(result.filePath).toBe('/tmp/eval-results/run_test-123.json');
+  });
+
+  it('writes to custom output path when --output is specified', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({
+      evaluator: ['Builtin.GoalSuccessRate'],
+      days: 7,
+      output: '/tmp/my-output.json',
+    });
+
+    expect(result.success).toBe(true);
+    expect(mockWriteFileSync).toHaveBeenCalledWith('/tmp/my-output.json', expect.any(String));
+    expect(mockSaveEvalRun).not.toHaveBeenCalled();
+    expect(result.filePath).toBe('/tmp/my-output.json');
+  });
+
+  // ─── Multiple evaluators ─────────────────────────────────────────────────
+
+  it('runs multiple evaluators and returns separate results for each', async () => {
+    const ctx = makeDeployedContext({
+      evaluators: { CustomEval: { evaluatorId: 'eval-custom' } },
+    });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+
+    mockEvaluate
+      .mockResolvedValueOnce({
+        evaluationResults: [{ value: 0.9, context: { spanContext: { sessionId: 's1' } } }],
+      })
+      .mockResolvedValueOnce({
+        evaluationResults: [{ value: 4.5, context: { spanContext: { sessionId: 's1' } } }],
+      });
+
+    const result = await handleRunEval({
+      evaluator: ['Builtin.GoalSuccessRate', 'CustomEval'],
+      days: 7,
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.run!.results).toHaveLength(2);
+    expect(result.run!.results[0]!.evaluator).toBe('Builtin.GoalSuccessRate');
+    expect(result.run!.results[0]!.aggregateScore).toBe(0.9);
+    expect(result.run!.results[1]!.evaluator).toBe('CustomEval');
+    expect(result.run!.results[1]!.aggregateScore).toBe(4.5);
+  });
+
+  // ─── Query sanitization ───────────────────────────────────────────────────
+
+  it('sanitizes runtimeId in CloudWatch query to prevent injection', async () => {
+    const ctx = makeDeployedContext({ runtimeId: "rt-123'; DROP TABLE" });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: "rt-123'; DROP TABLE",
+      },
+    });
+
+    setupCloudWatchToReturn([]);
+
+    await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    // Verify the StartQueryCommand was called with sanitized runtimeId (no single quotes)
+    const startQueryCall = mockSend.mock.calls.find((call: unknown[]) => {
+      const input = (call[0] as { input?: { queryString?: string } }).input;
+      return input?.queryString !== undefined;
+    });
+    expect(startQueryCall).toBeDefined();
+    const queryString = (startQueryCall![0] as { input: { queryString: string } }).input.queryString;
+    expect(queryString).not.toContain("'rt-123'; DROP TABLE'");
+    expect(queryString).toContain('rt-123; DROP TABLE');
+  });
+});
diff --git a/src/cli/operations/eval/__tests__/storage.test.ts b/src/cli/operations/eval/__tests__/storage.test.ts
new file mode 100644
index 00000000..db56e34b
--- /dev/null
+++ b/src/cli/operations/eval/__tests__/storage.test.ts
@@ -0,0 +1,149 @@
+import { generateRunId, listEvalRuns, loadEvalRun, saveEvalRun } from '../storage.js';
+import type { EvalRunResult } from '../types.js';
+// Use real fs via a temp directory
+import { existsSync, mkdirSync, rmSync } from 'fs';
+import { tmpdir } from 'os';
+import { join } from 'path';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+const mockFindConfigRoot = vi.fn();
+
+vi.mock('../../../../lib', () => ({
+  findConfigRoot: () => mockFindConfigRoot(),
+}));
+
+function makeTmpDir(): string {
+  const dir = join(tmpdir(), `eval-storage-test-${Date.now()}-${Math.random().toString(36).slice(2)}`);
+  mkdirSync(dir, { recursive: true });
+  return dir;
+}
+
+function makeRunResult(overrides: Partial<EvalRunResult> = {}): EvalRunResult {
+  return {
+    runId: overrides.runId ?? `run_${Date.now()}`,
+    timestamp: '2025-01-15T10:00:00.000Z',
+    agent: 'test-agent',
+    evaluators: ['Builtin.GoalSuccessRate'],
+    lookbackDays: 7,
+    sessionCount: 3,
+    results: [
+      {
+        evaluator: 'Builtin.GoalSuccessRate',
+        aggregateScore: 0.85,
+        sessionScores: [{ sessionId: 's1', value: 0.85 }],
+      },
+    ],
+    ...overrides,
+  };
+}
+
+describe('storage', () => {
+  let tmpDir: string;
+
+  beforeEach(() => {
+    tmpDir = makeTmpDir();
+    mockFindConfigRoot.mockReturnValue(tmpDir);
+  });
+
+  afterEach(() => {
+    if (existsSync(tmpDir)) {
+      rmSync(tmpDir, { recursive: true, force: true });
+    }
+    vi.clearAllMocks();
+  });
+
+  describe('generateRunId', () => {
+    it('returns a string starting with run_', () => {
+      const id = generateRunId();
+      expect(id).toMatch(/^run_[0-9a-f-]+$/);
+    });
+
+    it('generates unique IDs', () => {
+      const ids = new Set(Array.from({ length: 100 }, () => generateRunId()));
+      expect(ids.size).toBe(100);
+    });
+  });
+
+  describe('saveEvalRun', () => {
+    it('creates eval-results directory and writes JSON file', () => {
+      const run = makeRunResult({ runId: 'run_save-test' });
+      const filePath = saveEvalRun(run);
+
+      expect(filePath).toContain('eval-results');
+      expect(filePath).toContain('run_save-test.json');
+      expect(existsSync(filePath)).toBe(true);
+    });
+
+    it('writes valid JSON that can be read back', () => {
+      const run = makeRunResult({ runId: 'run_roundtrip' });
+      saveEvalRun(run);
+      const loaded = loadEvalRun('run_roundtrip');
+      expect(loaded).toEqual(run);
+    });
+  });
+
+  describe('loadEvalRun', () => {
+    it('loads a previously saved run', () => {
+      const run = makeRunResult({ runId: 'run_load-test', agent: 'my-agent' });
+      saveEvalRun(run);
+
+      const loaded = loadEvalRun('run_load-test');
+      expect(loaded.agent).toBe('my-agent');
+      expect(loaded.results).toHaveLength(1);
+    });
+
+    it('throws for a non-existent run ID', () => {
+      expect(() => loadEvalRun('run_does-not-exist')).toThrow('Eval run "run_does-not-exist" not found');
+    });
+  });
+
+  describe('listEvalRuns', () => {
+    it('returns empty array when eval-results dir does not exist', () => {
+      // Point to a dir with no eval-results subdirectory
+      const emptyDir = makeTmpDir();
+      mockFindConfigRoot.mockReturnValue(emptyDir);
+
+      expect(listEvalRuns()).toEqual([]);
+
+      rmSync(emptyDir, { recursive: true, force: true });
+    });
+
+    it('returns saved runs', () => {
+      saveEvalRun(makeRunResult({ runId: 'run_aaa' }));
+      saveEvalRun(makeRunResult({ runId: 'run_bbb' }));
+
+      const runs = listEvalRuns();
+      expect(runs).toHaveLength(2);
+    });
+
+    it('returns runs in reverse sorted order (newest first)', () => {
+      saveEvalRun(makeRunResult({ runId: 'run_aaa' }));
+      saveEvalRun(makeRunResult({ runId: 'run_zzz' }));
+      saveEvalRun(makeRunResult({ runId: 'run_mmm' }));
+
+      const runs = listEvalRuns();
+      expect(runs.map(r => r.runId)).toEqual(['run_zzz', 'run_mmm', 'run_aaa']);
+    });
+
+    it('ignores files that do not match the naming pattern', async () => {
+      saveEvalRun(makeRunResult({ runId: 'run_valid' }));
+
+      // Write a file that doesn't match the pattern
+      const resultsDir = join(tmpDir, 'eval-results');
+      const { writeFileSync } = await import('fs');
+      writeFileSync(join(resultsDir, 'notes.txt'), 'not a run');
+      writeFileSync(join(resultsDir, 'other.json'), '{}');
+
+      const runs = listEvalRuns();
+      expect(runs).toHaveLength(1);
+      expect(runs[0]!.runId).toBe('run_valid');
+    });
+  });
+
+  describe('error when no config root', () => {
+    it('throws when findConfigRoot returns null', () => {
+      mockFindConfigRoot.mockReturnValue(null);
+      expect(() => saveEvalRun(makeRunResult())).toThrow('No agentcore project found');
+    });
+  });
+});
diff --git a/src/cli/primitives/__tests__/EvaluatorPrimitive.test.ts b/src/cli/primitives/__tests__/EvaluatorPrimitive.test.ts
new file mode 100644
index 00000000..6cca7305
--- /dev/null
+++ b/src/cli/primitives/__tests__/EvaluatorPrimitive.test.ts
@@ -0,0 +1,233 @@
+import type { EvaluatorConfig } from '../../../schema';
+import { EvaluatorPrimitive } from '../EvaluatorPrimitive.js';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+const mockReadProjectSpec = vi.fn();
+const mockWriteProjectSpec = vi.fn();
+
+vi.mock('../../../lib/index.js', () => ({
+  ConfigIO: class {
+    readProjectSpec = mockReadProjectSpec;
+    writeProjectSpec = mockWriteProjectSpec;
+  },
+  findConfigRoot: () => '/fake/root',
+}));
+
+const validConfig: EvaluatorConfig = {
+  llmAsAJudge: {
+    model: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+    instructions: 'Evaluate quality. Context: {context}',
+    ratingScale: {
+      numerical: [
+        { value: 1, label: 'Poor', definition: 'Fails' },
+        { value: 5, label: 'Excellent', definition: 'Perfect' },
+      ],
+    },
+  },
+};
+
+function makeProject(
+  evaluators: { name: string }[] = [],
+  onlineEvalConfigs: { name: string; evaluators: string[] }[] = []
+) {
+  return {
+    name: 'TestProject',
+    version: 1,
+    agents: [],
+    memories: [],
+    credentials: [],
+    evaluators,
+    onlineEvalConfigs,
+  };
+}
+
+const primitive = new EvaluatorPrimitive();
+
+describe('EvaluatorPrimitive', () => {
+  afterEach(() => vi.clearAllMocks());
+
+  it('has correct kind, label, and article', () => {
+    expect(primitive.kind).toBe('evaluator');
+    expect(primitive.label).toBe('Evaluator');
+    // eslint-disable-next-line @typescript-eslint/dot-notation
+    expect(primitive['article']).toBe('an');
+  });
+
+  describe('add', () => {
+    it('adds evaluator to project spec and returns success', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      const result = await primitive.add({
+        name: 'MyEval',
+        level: 'SESSION',
+        config: validConfig,
+      });
+
+      expect(result.success).toBe(true);
+      expect(result).toHaveProperty('evaluatorName', 'MyEval');
+
+      const writtenSpec = mockWriteProjectSpec.mock.calls[0]![0];
+      expect(writtenSpec.evaluators).toHaveLength(1);
+      expect(writtenSpec.evaluators[0].name).toBe('MyEval');
+      expect(writtenSpec.evaluators[0].type).toBe('CustomEvaluator');
+      expect(writtenSpec.evaluators[0].level).toBe('SESSION');
+    });
+
+    it('includes description when provided', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      await primitive.add({
+        name: 'DescEval',
+        level: 'TRACE',
+        description: 'My description',
+        config: validConfig,
+      });
+
+      const writtenSpec = mockWriteProjectSpec.mock.calls[0]![0];
+      expect(writtenSpec.evaluators[0].description).toBe('My description');
+    });
+
+    it('returns error when evaluator name already exists', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'Existing' }]));
+
+      const result = await primitive.add({
+        name: 'Existing',
+        level: 'SESSION',
+        config: validConfig,
+      });
+
+      expect(result).toEqual(
+        expect.objectContaining({ success: false, error: expect.stringContaining('already exists') })
+      );
+    });
+
+    it('returns error when readProjectSpec fails', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('disk read error'));
+
+      const result = await primitive.add({
+        name: 'NewEval',
+        level: 'SESSION',
+        config: validConfig,
+      });
+
+      expect(result).toEqual(expect.objectContaining({ success: false, error: 'disk read error' }));
+    });
+  });
+
+  describe('remove', () => {
+    it('removes evaluator from project spec', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'EvalA' }, { name: 'EvalB' }]));
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      const result = await primitive.remove('EvalA');
+
+      expect(result.success).toBe(true);
+      const writtenSpec = mockWriteProjectSpec.mock.calls[0]![0];
+      expect(writtenSpec.evaluators).toHaveLength(1);
+      expect(writtenSpec.evaluators[0].name).toBe('EvalB');
+    });
+
+    it('returns error when evaluator not found', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+
+      const result = await primitive.remove('NonExistent');
+
+      expect(result.success).toBe(false);
+      if (!result.success) {
+        expect(result.error).toContain('NonExistent');
+        expect(result.error).toContain('not found');
+      }
+    });
+
+    it('blocks removal when referenced by online eval configs', async () => {
+      mockReadProjectSpec.mockResolvedValue(
+        makeProject([{ name: 'UsedEval' }], [{ name: 'MyOnlineConfig', evaluators: ['UsedEval'] }])
+      );
+
+      const result = await primitive.remove('UsedEval');
+
+      expect(result.success).toBe(false);
+      if (!result.success) {
+        expect(result.error).toContain('referenced by online eval config');
+        expect(result.error).toContain('MyOnlineConfig');
+      }
+      expect(mockWriteProjectSpec).not.toHaveBeenCalled();
+    });
+
+    it('returns error when readProjectSpec fails', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('io error'));
+
+      const result = await primitive.remove('Whatever');
+
+      expect(result.success).toBe(false);
+      if (!result.success) {
+        expect(result.error).toBe('io error');
+      }
+    });
+  });
+
+  describe('previewRemove', () => {
+    it('returns preview with schema changes', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'EvalA' }]));
+
+      const preview = await primitive.previewRemove('EvalA');
+
+      expect(preview.summary[0]).toContain('Removing evaluator: EvalA');
+      expect(preview.schemaChanges).toHaveLength(1);
+      expect(preview.schemaChanges[0]!.file).toBe('agentcore/agentcore.json');
+      expect((preview.schemaChanges[0]!.after as { evaluators: unknown[] }).evaluators).toHaveLength(0);
+    });
+
+    it('throws when evaluator not found', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+
+      await expect(primitive.previewRemove('Missing')).rejects.toThrow('not found');
+    });
+
+    it('warns when evaluator is referenced by online eval configs', async () => {
+      mockReadProjectSpec.mockResolvedValue(
+        makeProject([{ name: 'UsedEval' }], [{ name: 'Config1', evaluators: ['UsedEval'] }])
+      );
+
+      const preview = await primitive.previewRemove('UsedEval');
+
+      const blocked = preview.summary.find(s => s.includes('Blocked'));
+      expect(blocked).toBeDefined();
+      expect(blocked).toContain('Config1');
+    });
+  });
+
+  describe('getRemovable', () => {
+    it('returns evaluator names', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'A' }, { name: 'B' }]));
+
+      const result = await primitive.getRemovable();
+
+      expect(result).toEqual([{ name: 'A' }, { name: 'B' }]);
+    });
+
+    it('returns empty array on error', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('fail'));
+
+      expect(await primitive.getRemovable()).toEqual([]);
+    });
+  });
+
+  describe('getAllNames', () => {
+    it('returns evaluator names as strings', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'X' }, { name: 'Y' }]));
+
+      const result = await primitive.getAllNames();
+
+      expect(result).toEqual(['X', 'Y']);
+    });
+
+    it('returns empty array on error', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('fail'));
+
+      expect(await primitive.getAllNames()).toEqual([]);
+    });
+  });
+});
diff --git a/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts b/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts
new file mode 100644
index 00000000..ca6c6bc6
--- /dev/null
+++ b/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts
@@ -0,0 +1,211 @@
+import { OnlineEvalConfigPrimitive } from '../OnlineEvalConfigPrimitive.js';
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+const mockReadProjectSpec = vi.fn();
+const mockWriteProjectSpec = vi.fn();
+
+vi.mock('../../../lib/index.js', () => ({
+  ConfigIO: class {
+    readProjectSpec = mockReadProjectSpec;
+    writeProjectSpec = mockWriteProjectSpec;
+  },
+  findConfigRoot: () => '/fake/root',
+}));
+
+function makeProject(
+  onlineEvalConfigs: { name: string; agents: string[]; evaluators: string[] }[] = [],
+  evaluators: { name: string }[] = []
+) {
+  return {
+    name: 'TestProject',
+    version: 1,
+    agents: [],
+    memories: [],
+    credentials: [],
+    evaluators,
+    onlineEvalConfigs,
+  };
+}
+
+const primitive = new OnlineEvalConfigPrimitive();
+
+describe('OnlineEvalConfigPrimitive', () => {
+  afterEach(() => vi.clearAllMocks());
+
+  it('has correct kind, label, and article', () => {
+    expect(primitive.kind).toBe('online-eval');
+    expect(primitive.label).toBe('Online Eval Config');
+    // eslint-disable-next-line @typescript-eslint/dot-notation
+    expect(primitive['article']).toBe('an');
+  });
+
+  describe('add', () => {
+    it('adds config to project spec and returns success', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      const result = await primitive.add({
+        name: 'MyConfig',
+        agents: ['agent1'],
+        evaluators: ['Builtin.GoalSuccessRate'],
+        samplingRate: 10,
+      });
+
+      expect(result.success).toBe(true);
+      expect(result).toHaveProperty('configName', 'MyConfig');
+
+      const writtenSpec = mockWriteProjectSpec.mock.calls[0]![0];
+      expect(writtenSpec.onlineEvalConfigs).toHaveLength(1);
+      const config = writtenSpec.onlineEvalConfigs[0];
+      expect(config.type).toBe('OnlineEvaluationConfig');
+      expect(config.name).toBe('MyConfig');
+      expect(config.agents).toEqual(['agent1']);
+      expect(config.evaluators).toEqual(['Builtin.GoalSuccessRate']);
+      expect(config.samplingRate).toBe(10);
+    });
+
+    it('supports multiple agents and evaluators', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      const result = await primitive.add({
+        name: 'MultiConfig',
+        agents: ['agent1', 'agent2'],
+        evaluators: ['Builtin.GoalSuccessRate', 'CustomEval'],
+        samplingRate: 50,
+      });
+
+      expect(result.success).toBe(true);
+      const config = mockWriteProjectSpec.mock.calls[0]![0].onlineEvalConfigs[0];
+      expect(config.agents).toEqual(['agent1', 'agent2']);
+      expect(config.evaluators).toEqual(['Builtin.GoalSuccessRate', 'CustomEval']);
+    });
+
+    it('returns error when config name already exists', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'Existing', agents: ['a'], evaluators: ['e'] }]));
+
+      const result = await primitive.add({
+        name: 'Existing',
+        agents: ['a'],
+        evaluators: ['e'],
+        samplingRate: 10,
+      });
+
+      expect(result).toEqual(
+        expect.objectContaining({ success: false, error: expect.stringContaining('already exists') })
+      );
+    });
+
+    it('returns error when readProjectSpec fails', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('no project'));
+
+      const result = await primitive.add({
+        name: 'New',
+        agents: ['a'],
+        evaluators: ['e'],
+        samplingRate: 10,
+      });
+
+      expect(result).toEqual(expect.objectContaining({ success: false, error: 'no project' }));
+    });
+  });
+
+  describe('remove', () => {
+    it('removes config from project spec', async () => {
+      mockReadProjectSpec.mockResolvedValue(
+        makeProject([
+          { name: 'ConfigA', agents: ['a'], evaluators: ['e'] },
+          { name: 'ConfigB', agents: ['b'], evaluators: ['f'] },
+        ])
+      );
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      const result = await primitive.remove('ConfigA');
+
+      expect(result.success).toBe(true);
+      const writtenSpec = mockWriteProjectSpec.mock.calls[0]![0];
+      expect(writtenSpec.onlineEvalConfigs).toHaveLength(1);
+      expect(writtenSpec.onlineEvalConfigs[0].name).toBe('ConfigB');
+    });
+
+    it('returns error when config not found', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+
+      const result = await primitive.remove('NonExistent');
+
+      expect(result.success).toBe(false);
+      if (!result.success) {
+        expect(result.error).toContain('NonExistent');
+        expect(result.error).toContain('not found');
+      }
+    });
+
+    it('returns error when readProjectSpec fails', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('io error'));
+
+      const result = await primitive.remove('Whatever');
+
+      expect(result.success).toBe(false);
+      if (!result.success) {
+        expect(result.error).toBe('io error');
+      }
+    });
+  });
+
+  describe('previewRemove', () => {
+    it('returns preview with summary including agents and evaluators', async () => {
+      mockReadProjectSpec.mockResolvedValue(
+        makeProject([{ name: 'Config1', agents: ['agentA', 'agentB'], evaluators: ['Builtin.X', 'CustomY'] }])
+      );
+
+      const preview = await primitive.previewRemove('Config1');
+
+      expect(preview.summary).toContain('Removing online eval config: Config1');
+      expect(preview.summary).toContain('Monitors agents: agentA, agentB');
+      expect(preview.summary).toContain('Uses evaluators: Builtin.X, CustomY');
+      expect(preview.schemaChanges).toHaveLength(1);
+      expect((preview.schemaChanges[0]!.after as { onlineEvalConfigs: unknown[] }).onlineEvalConfigs).toHaveLength(0);
+    });
+
+    it('throws when config not found', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+
+      await expect(primitive.previewRemove('Missing')).rejects.toThrow('not found');
+    });
+  });
+
+  describe('getRemovable', () => {
+    it('returns config names', async () => {
+      mockReadProjectSpec.mockResolvedValue(
+        makeProject([
+          { name: 'C1', agents: ['a'], evaluators: ['e'] },
+          { name: 'C2', agents: ['b'], evaluators: ['f'] },
+        ])
+      );
+
+      const result = await primitive.getRemovable();
+
+      expect(result).toEqual([{ name: 'C1' }, { name: 'C2' }]);
+    });
+
+    it('returns empty array on error', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('fail'));
+
+      expect(await primitive.getRemovable()).toEqual([]);
+    });
+  });
+
+  describe('getAllNames', () => {
+    it('returns config names as strings', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'X', agents: ['a'], evaluators: ['e'] }]));
+
+      expect(await primitive.getAllNames()).toEqual(['X']);
+    });
+
+    it('returns empty array on error', async () => {
+      mockReadProjectSpec.mockRejectedValue(new Error('fail'));
+
+      expect(await primitive.getAllNames()).toEqual([]);
+    });
+  });
+});
diff --git a/src/cli/tui/screens/evaluator/__tests__/types.test.ts b/src/cli/tui/screens/evaluator/__tests__/types.test.ts
new file mode 100644
index 00000000..3d807c01
--- /dev/null
+++ b/src/cli/tui/screens/evaluator/__tests__/types.test.ts
@@ -0,0 +1,99 @@
+import { DEFAULT_INSTRUCTIONS, DEFAULT_MODEL, LEVEL_PLACEHOLDERS, validateInstructionPlaceholders } from '../types.js';
+import { describe, expect, it } from 'vitest';
+
+describe('LEVEL_PLACEHOLDERS', () => {
+  it('defines placeholders for all three levels', () => {
+    expect(LEVEL_PLACEHOLDERS).toHaveProperty('SESSION');
+    expect(LEVEL_PLACEHOLDERS).toHaveProperty('TRACE');
+    expect(LEVEL_PLACEHOLDERS).toHaveProperty('TOOL_CALL');
+  });
+
+  it('SESSION and TRACE share context and trajectory placeholders', () => {
+    expect(LEVEL_PLACEHOLDERS.SESSION).toContain('context');
+    expect(LEVEL_PLACEHOLDERS.TRACE).toContain('context');
+    expect(LEVEL_PLACEHOLDERS.SESSION).toContain('actual_trajectory');
+    expect(LEVEL_PLACEHOLDERS.TRACE).toContain('actual_trajectory');
+  });
+
+  it('TOOL_CALL has tool-specific placeholders', () => {
+    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_name');
+    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_input');
+    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_output');
+  });
+});
+
+describe('DEFAULT_INSTRUCTIONS', () => {
+  it('each default instruction passes its own level validation', () => {
+    for (const level of ['SESSION', 'TRACE', 'TOOL_CALL'] as const) {
+      const result = validateInstructionPlaceholders(DEFAULT_INSTRUCTIONS[level], level);
+      expect(result).toBe(true);
+    }
+  });
+
+  it('SESSION default uses {context}', () => {
+    expect(DEFAULT_INSTRUCTIONS.SESSION).toContain('{context}');
+  });
+
+  it('TOOL_CALL default uses {tool_name}, {tool_input}, {tool_output}', () => {
+    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_name}');
+    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_input}');
+    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_output}');
+  });
+});
+
+describe('DEFAULT_MODEL', () => {
+  it('is a Claude Sonnet model ID', () => {
+    expect(DEFAULT_MODEL).toContain('anthropic');
+    expect(DEFAULT_MODEL).toContain('sonnet');
+  });
+});
+
+describe('validateInstructionPlaceholders', () => {
+  it('returns true when at least one valid placeholder is present for SESSION', () => {
+    expect(validateInstructionPlaceholders('Check {context} now', 'SESSION')).toBe(true);
+    expect(validateInstructionPlaceholders('See {available_tools}', 'SESSION')).toBe(true);
+    expect(validateInstructionPlaceholders('Trajectory: {actual_trajectory}', 'SESSION')).toBe(true);
+  });
+
+  it('returns true when at least one valid placeholder is present for TOOL_CALL', () => {
+    expect(validateInstructionPlaceholders('Tool: {tool_name}', 'TOOL_CALL')).toBe(true);
+    expect(validateInstructionPlaceholders('Output: {tool_output}', 'TOOL_CALL')).toBe(true);
+  });
+
+  it('returns error string when no valid placeholders are present', () => {
+    const result = validateInstructionPlaceholders('No placeholders here', 'SESSION');
+    expect(typeof result).toBe('string');
+    expect(result).toContain('must contain at least one placeholder');
+  });
+
+  it('accepts the shared {context} placeholder for TOOL_CALL level', () => {
+    const result = validateInstructionPlaceholders('Check {context} now', 'TOOL_CALL');
+    // {context} is valid at TOOL_CALL level as well as SESSION, so validation passes
+    expect(result).toBe(true);
+  });
+
+  it('rejects TOOL_CALL-level placeholders for SESSION level', () => {
+    const result = validateInstructionPlaceholders('Tool: {tool_name}', 'SESSION');
+    expect(typeof result).toBe('string');
+    expect(result).toContain('must contain at least one placeholder');
+  });
+
+  it('does not match partial placeholder names', () => {
+    // {contexts} should not match {context} since includes checks for the exact {placeholder} token
+    const result = validateInstructionPlaceholders('Extra: {contexts}', 'SESSION');
+    expect(typeof result).toBe('string');
+  });
+
+  it('handles multiple placeholders — at least one valid is enough', () => {
+    const result = validateInstructionPlaceholders('{unknown_thing} and {context}', 'SESSION');
+    expect(result).toBe(true);
+  });
+
+  it('returns descriptive error listing allowed placeholders', () => {
+    const result = validateInstructionPlaceholders('nothing', 'TOOL_CALL');
+    expect(typeof result).toBe('string');
+    expect(result as string).toContain('{tool_name}');
+    expect(result as string).toContain('{tool_input}');
+    expect(result as string).toContain('{tool_output}');
+  });
+});
diff --git a/src/schema/schemas/primitives/__tests__/evaluator.test.ts b/src/schema/schemas/primitives/__tests__/evaluator.test.ts
new file mode 100644
index 00000000..9147c5cf
--- /dev/null
+++ b/src/schema/schemas/primitives/__tests__/evaluator.test.ts
@@ -0,0 +1,158 @@
+import {
+  CategoricalRatingSchema,
+  EvaluationLevelSchema,
+  EvaluatorConfigSchema,
+  EvaluatorNameSchema,
+  NumericalRatingSchema,
+  RatingScaleSchema,
+} from '../evaluator';
+import { describe, expect, it } from 'vitest';
+
+describe('EvaluationLevelSchema', () => {
+  it.each(['SESSION', 'TRACE', 'TOOL_CALL'])('accepts %s', level => {
+    expect(EvaluationLevelSchema.safeParse(level).success).toBe(true);
+  });
+
+  it.each(['session', 'INVALID', '', 'SPAN'])('rejects %s', level => {
+    expect(EvaluationLevelSchema.safeParse(level).success).toBe(false);
+  });
+});
+
+describe('EvaluatorNameSchema', () => {
+  it('accepts valid names', () => {
+    expect(EvaluatorNameSchema.safeParse('MyEval').success).toBe(true);
+    expect(EvaluatorNameSchema.safeParse('eval_1').success).toBe(true);
+    expect(EvaluatorNameSchema.safeParse('A').success).toBe(true);
+  });
+
+  it('rejects empty string', () => {
+    expect(EvaluatorNameSchema.safeParse('').success).toBe(false);
+  });
+
+  it('rejects names starting with a number', () => {
+    expect(EvaluatorNameSchema.safeParse('1eval').success).toBe(false);
+  });
+
+  it('rejects names starting with underscore', () => {
+    expect(EvaluatorNameSchema.safeParse('_eval').success).toBe(false);
+  });
+
+  it('rejects names with special characters', () => {
+    expect(EvaluatorNameSchema.safeParse('my-eval').success).toBe(false);
+    expect(EvaluatorNameSchema.safeParse('my eval').success).toBe(false);
+    expect(EvaluatorNameSchema.safeParse('my.eval').success).toBe(false);
+  });
+
+  it('rejects names longer than 48 characters', () => {
+    const longName = 'A' + 'a'.repeat(48);
+    expect(longName.length).toBe(49);
+    expect(EvaluatorNameSchema.safeParse(longName).success).toBe(false);
+  });
+
+  it('accepts names exactly 48 characters', () => {
+    const name = 'A' + 'a'.repeat(47);
+    expect(name.length).toBe(48);
+    expect(EvaluatorNameSchema.safeParse(name).success).toBe(true);
+  });
+});
+
+describe('NumericalRatingSchema', () => {
+  it('accepts valid numerical rating', () => {
+    const result = NumericalRatingSchema.safeParse({ value: 1, label: 'Poor', definition: 'Fails expectations' });
+    expect(result.success).toBe(true);
+  });
+
+  it('rejects non-integer value', () => {
+    const result = NumericalRatingSchema.safeParse({ value: 1.5, label: 'Ok', definition: 'Decent' });
+    expect(result.success).toBe(false);
+  });
+
+  it('rejects empty label', () => {
+    const result = NumericalRatingSchema.safeParse({ value: 1, label: '', definition: 'Test' });
+    expect(result.success).toBe(false);
+  });
+
+  it('rejects empty definition', () => {
+    const result = NumericalRatingSchema.safeParse({ value: 1, label: 'Test', definition: '' });
+    expect(result.success).toBe(false);
+  });
+});
+
+describe('CategoricalRatingSchema', () => {
+  it('accepts valid categorical rating', () => {
+    const result = CategoricalRatingSchema.safeParse({ label: 'Pass', definition: 'Meets criteria' });
+    expect(result.success).toBe(true);
+  });
+
+  it('rejects empty label', () => {
+    expect(CategoricalRatingSchema.safeParse({ label: '', definition: 'Test' }).success).toBe(false);
+  });
+});
+
+describe('RatingScaleSchema', () => {
+  it('accepts numerical-only scale', () => {
+    const result = RatingScaleSchema.safeParse({
+      numerical: [
+        { value: 1, label: 'Bad', definition: 'Poor' },
+        { value: 2, label: 'Good', definition: 'Nice' },
+      ],
+    });
+    expect(result.success).toBe(true);
+  });
+
+  it('accepts categorical-only scale', () => {
+    const result = RatingScaleSchema.safeParse({
+      categorical: [
+        { label: 'Pass', definition: 'Good' },
+        { label: 'Fail', definition: 'Bad' },
+      ],
+    });
+    expect(result.success).toBe(true);
+  });
+
+  it('rejects scale with both numerical and categorical', () => {
+    const result = RatingScaleSchema.safeParse({
+      numerical: [{ value: 1, label: 'Bad', definition: 'Poor' }],
+      categorical: [{ label: 'Pass', definition: 'Good' }],
+    });
+    expect(result.success).toBe(false);
+  });
+
+  it('rejects scale with neither numerical nor categorical', () => {
+    const result = RatingScaleSchema.safeParse({});
+    expect(result.success).toBe(false);
+  });
+});
+
+describe('EvaluatorConfigSchema', () => {
+  const validConfig = {
+    llmAsAJudge: {
+      model: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+      instructions: 'Evaluate the quality. Context: {context}',
+      ratingScale: {
+        numerical: [
+          { value: 1, label: 'Poor', definition: 'Fails' },
+          { value: 5, label: 'Excellent', definition: 'Perfect' },
+        ],
+      },
+    },
+  };
+
+  it('accepts valid evaluator config', () => {
+    expect(EvaluatorConfigSchema.safeParse(validConfig).success).toBe(true);
+  });
+
+  it('rejects missing model', () => {
+    const config = { llmAsAJudge: { ...validConfig.llmAsAJudge, model: '' } };
+    expect(EvaluatorConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects missing instructions', () => {
+    const config = { llmAsAJudge: { ...validConfig.llmAsAJudge, instructions: '' } };
+    expect(EvaluatorConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects missing llmAsAJudge key', () => {
+    expect(EvaluatorConfigSchema.safeParse({}).success).toBe(false);
+  });
+});
diff --git a/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts b/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
new file mode 100644
index 00000000..9381539e
--- /dev/null
+++ b/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
@@ -0,0 +1,90 @@
+import { OnlineEvalConfigNameSchema, OnlineEvalConfigSchema } from '../online-eval-config';
+import { describe, expect, it } from 'vitest';
+
+describe('OnlineEvalConfigNameSchema', () => {
+  it('accepts valid names', () => {
+    expect(OnlineEvalConfigNameSchema.safeParse('MyConfig').success).toBe(true);
+    expect(OnlineEvalConfigNameSchema.safeParse('config_1').success).toBe(true);
+  });
+
+  it('rejects empty string', () => {
+    expect(OnlineEvalConfigNameSchema.safeParse('').success).toBe(false);
+  });
+
+  it('rejects names starting with a number', () => {
+    expect(OnlineEvalConfigNameSchema.safeParse('1config').success).toBe(false);
+  });
+
+  it('rejects names with hyphens', () => {
+    expect(OnlineEvalConfigNameSchema.safeParse('my-config').success).toBe(false);
+  });
+
+  it('rejects names longer than 48 characters', () => {
+    const longName = 'A' + 'a'.repeat(48);
+    expect(OnlineEvalConfigNameSchema.safeParse(longName).success).toBe(false);
+  });
+});
+
+describe('OnlineEvalConfigSchema', () => {
+  const validConfig = {
+    type: 'OnlineEvaluationConfig' as const,
+    name: 'TestConfig',
+    agents: ['agent1'],
+    evaluators: ['Builtin.GoalSuccessRate'],
+    samplingRate: 10,
+  };
+
+  it('accepts valid config', () => {
+    expect(OnlineEvalConfigSchema.safeParse(validConfig).success).toBe(true);
+  });
+
+  it('accepts multiple agents and evaluators', () => {
+    const config = { ...validConfig, agents: ['a1', 'a2'], evaluators: ['Builtin.X', 'CustomEval'] };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
+  });
+
+  it('rejects wrong type literal', () => {
+    const config = { ...validConfig, type: 'WrongType' };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects empty agents array', () => {
+    const config = { ...validConfig, agents: [] };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects empty evaluators array', () => {
+    const config = { ...validConfig, evaluators: [] };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects sampling rate below 0.01', () => {
+    const config = { ...validConfig, samplingRate: 0.001 };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects sampling rate above 100', () => {
+    const config = { ...validConfig, samplingRate: 101 };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('accepts minimum sampling rate of 0.01', () => {
+    const config = { ...validConfig, samplingRate: 0.01 };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
+  });
+
+  it('accepts maximum sampling rate of 100', () => {
+    const config = { ...validConfig, samplingRate: 100 };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
+  });
+
+  it('rejects empty string in agents array', () => {
+    const config = { ...validConfig, agents: [''] };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('rejects empty string in evaluators array', () => {
+    const config = { ...validConfig, evaluators: [''] };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+});

From 173f83d965542e96156e2c08b14fa374d71c67b5 Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Thu, 12 Mar 2026 01:08:55 -0400
Subject: [PATCH 5/9] add support for running evals on agents created outside
 the cli

---
 .../aws/__tests__/agentcore-control.test.ts   | 285 +++++++++++++++++-
 src/cli/aws/agentcore-control.ts              | 194 ++++++++++++
 src/cli/commands/run/command.tsx              |  10 +-
 .../eval/__tests__/run-eval.test.ts           | 126 ++++++++
 src/cli/operations/eval/run-eval.ts           | 132 ++++++--
 src/cli/operations/eval/types.ts              |   7 +
 6 files changed, 732 insertions(+), 22 deletions(-)

diff --git a/src/cli/aws/__tests__/agentcore-control.test.ts b/src/cli/aws/__tests__/agentcore-control.test.ts
index b4d629a6..75a5b769 100644
--- a/src/cli/aws/__tests__/agentcore-control.test.ts
+++ b/src/cli/aws/__tests__/agentcore-control.test.ts
@@ -1,4 +1,11 @@
-import { getAgentRuntimeStatus, updateOnlineEvalExecutionStatus } from '../agentcore-control.js';
+import {
+  getAgentRuntimeStatus,
+  getEvaluator,
+  getOnlineEvaluationConfig,
+  listEvaluators,
+  listOnlineEvaluationConfigs,
+  updateOnlineEvalExecutionStatus,
+} from '../agentcore-control.js';
 import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 const { mockSend } = vi.hoisted(() => ({
@@ -12,6 +19,18 @@ vi.mock('@aws-sdk/client-bedrock-agentcore-control', () => ({
   GetAgentRuntimeCommand: class {
     constructor(public input: unknown) {}
   },
+  GetEvaluatorCommand: class {
+    constructor(public input: unknown) {}
+  },
+  GetOnlineEvaluationConfigCommand: class {
+    constructor(public input: unknown) {}
+  },
+  ListEvaluatorsCommand: class {
+    constructor(public input: unknown) {}
+  },
+  ListOnlineEvaluationConfigsCommand: class {
+    constructor(public input: unknown) {}
+  },
   UpdateOnlineEvaluationConfigCommand: class {
     constructor(public input: unknown) {}
   },
@@ -60,6 +79,130 @@ describe('getAgentRuntimeStatus', () => {
   });
 });
 
+describe('getEvaluator', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns evaluator details', async () => {
+    mockSend.mockResolvedValue({
+      evaluatorId: 'eval-123',
+      evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-123',
+      evaluatorName: 'my-evaluator',
+      level: 'SESSION',
+      status: 'ACTIVE',
+      description: 'A test evaluator',
+    });
+
+    const result = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-123' });
+    expect(result.evaluatorId).toBe('eval-123');
+    expect(result.evaluatorName).toBe('my-evaluator');
+    expect(result.level).toBe('SESSION');
+    expect(result.status).toBe('ACTIVE');
+    expect(result.description).toBe('A test evaluator');
+  });
+
+  it('throws when no evaluatorId in response', async () => {
+    mockSend.mockResolvedValue({ evaluatorId: undefined });
+
+    await expect(getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-missing' })).rejects.toThrow(
+      'No evaluator found for ID eval-missing'
+    );
+  });
+
+  it('passes correct evaluatorId in command', async () => {
+    mockSend.mockResolvedValue({
+      evaluatorId: 'eval-abc',
+      evaluatorName: 'test',
+      level: 'TRACE',
+      status: 'ACTIVE',
+    });
+
+    await getEvaluator({ region: 'us-west-2', evaluatorId: 'eval-abc' });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.evaluatorId).toBe('eval-abc');
+  });
+
+  it('defaults level to SESSION when undefined', async () => {
+    mockSend.mockResolvedValue({
+      evaluatorId: 'eval-no-level',
+      level: undefined,
+      status: 'ACTIVE',
+    });
+
+    const result = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-no-level' });
+    expect(result.level).toBe('SESSION');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('AccessDenied'));
+
+    await expect(getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-err' })).rejects.toThrow('AccessDenied');
+  });
+});
+
+describe('listEvaluators', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns evaluator list', async () => {
+    mockSend.mockResolvedValue({
+      evaluators: [
+        {
+          evaluatorId: 'eval-1',
+          evaluatorArn: 'arn:1',
+          evaluatorName: 'builtin-help',
+          evaluatorType: 'Builtin',
+          level: 'SESSION',
+          status: 'ACTIVE',
+        },
+        {
+          evaluatorId: 'eval-2',
+          evaluatorArn: 'arn:2',
+          evaluatorName: 'custom-tone',
+          evaluatorType: 'Custom',
+          level: 'TRACE',
+          status: 'ACTIVE',
+          description: 'Tone checker',
+        },
+      ],
+      nextToken: 'page2',
+    });
+
+    const result = await listEvaluators({ region: 'us-east-1', maxResults: 10 });
+    expect(result.evaluators).toHaveLength(2);
+    expect(result.evaluators[0]!.evaluatorType).toBe('Builtin');
+    expect(result.evaluators[1]!.description).toBe('Tone checker');
+    expect(result.nextToken).toBe('page2');
+  });
+
+  it('returns empty list when no evaluators', async () => {
+    mockSend.mockResolvedValue({ evaluators: undefined });
+
+    const result = await listEvaluators({ region: 'us-east-1' });
+    expect(result.evaluators).toEqual([]);
+    expect(result.nextToken).toBeUndefined();
+  });
+
+  it('passes maxResults and nextToken in command', async () => {
+    mockSend.mockResolvedValue({ evaluators: [] });
+
+    await listEvaluators({ region: 'us-east-1', maxResults: 5, nextToken: 'tok-1' });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.maxResults).toBe(5);
+    expect(command.input.nextToken).toBe('tok-1');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('Throttling'));
+
+    await expect(listEvaluators({ region: 'us-east-1' })).rejects.toThrow('Throttling');
+  });
+});
+
 describe('updateOnlineEvalExecutionStatus', () => {
   beforeEach(() => {
     vi.clearAllMocks();
@@ -144,3 +287,143 @@ describe('updateOnlineEvalExecutionStatus', () => {
     ).rejects.toThrow('Throttling');
   });
 });
+
+describe('getOnlineEvaluationConfig', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns config details with output log group', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-123',
+      onlineEvaluationConfigArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:online-eval/oec-123',
+      onlineEvaluationConfigName: 'my-online-eval',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+      description: 'Production eval',
+      outputConfig: {
+        cloudWatchConfig: { logGroupName: '/aws/bedrock-agentcore/evaluations/oec-123' },
+      },
+    });
+
+    const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-123' });
+    expect(result.configId).toBe('oec-123');
+    expect(result.configName).toBe('my-online-eval');
+    expect(result.status).toBe('ACTIVE');
+    expect(result.executionStatus).toBe('ENABLED');
+    expect(result.description).toBe('Production eval');
+    expect(result.outputLogGroupName).toBe('/aws/bedrock-agentcore/evaluations/oec-123');
+  });
+
+  it('throws when no configId in response', async () => {
+    mockSend.mockResolvedValue({ onlineEvaluationConfigId: undefined });
+
+    await expect(getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-missing' })).rejects.toThrow(
+      'No online evaluation config found for ID oec-missing'
+    );
+  });
+
+  it('returns failureReason when present', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-fail',
+      onlineEvaluationConfigName: 'broken-eval',
+      status: 'CREATE_FAILED',
+      executionStatus: 'DISABLED',
+      failureReason: 'IAM role not found',
+    });
+
+    const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-fail' });
+    expect(result.status).toBe('CREATE_FAILED');
+    expect(result.failureReason).toBe('IAM role not found');
+  });
+
+  it('handles missing outputConfig', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-no-output',
+      status: 'CREATING',
+      executionStatus: 'DISABLED',
+    });
+
+    const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-no-output' });
+    expect(result.outputLogGroupName).toBeUndefined();
+  });
+
+  it('passes correct configId in command', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigId: 'oec-abc',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+    });
+
+    await getOnlineEvaluationConfig({ region: 'us-west-2', configId: 'oec-abc' });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.onlineEvaluationConfigId).toBe('oec-abc');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('ResourceNotFoundException'));
+
+    await expect(getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-err' })).rejects.toThrow(
+      'ResourceNotFoundException'
+    );
+  });
+});
+
+describe('listOnlineEvaluationConfigs', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('returns config list', async () => {
+    mockSend.mockResolvedValue({
+      onlineEvaluationConfigs: [
+        {
+          onlineEvaluationConfigId: 'oec-1',
+          onlineEvaluationConfigArn: 'arn:1',
+          onlineEvaluationConfigName: 'eval-prod',
+          status: 'ACTIVE',
+          executionStatus: 'ENABLED',
+        },
+        {
+          onlineEvaluationConfigId: 'oec-2',
+          onlineEvaluationConfigArn: 'arn:2',
+          onlineEvaluationConfigName: 'eval-staging',
+          status: 'ACTIVE',
+          executionStatus: 'DISABLED',
+        },
+      ],
+      nextToken: 'next-page',
+    });
+
+    const result = await listOnlineEvaluationConfigs({ region: 'us-east-1', maxResults: 20 });
+    expect(result.configs).toHaveLength(2);
+    expect(result.configs[0]!.configName).toBe('eval-prod');
+    expect(result.configs[1]!.executionStatus).toBe('DISABLED');
+    expect(result.nextToken).toBe('next-page');
+  });
+
+  it('returns empty list when no configs', async () => {
+    mockSend.mockResolvedValue({ onlineEvaluationConfigs: undefined });
+
+    const result = await listOnlineEvaluationConfigs({ region: 'us-east-1' });
+    expect(result.configs).toEqual([]);
+    expect(result.nextToken).toBeUndefined();
+  });
+
+  it('passes maxResults and nextToken in command', async () => {
+    mockSend.mockResolvedValue({ onlineEvaluationConfigs: [] });
+
+    await listOnlineEvaluationConfigs({ region: 'us-east-1', maxResults: 10, nextToken: 'tok-abc' });
+
+    const command = mockSend.mock.calls[0]![0];
+    expect(command.input.maxResults).toBe(10);
+    expect(command.input.nextToken).toBe('tok-abc');
+  });
+
+  it('propagates SDK errors', async () => {
+    mockSend.mockRejectedValue(new Error('InternalServerError'));
+
+    await expect(listOnlineEvaluationConfigs({ region: 'us-east-1' })).rejects.toThrow('InternalServerError');
+  });
+});
diff --git a/src/cli/aws/agentcore-control.ts b/src/cli/aws/agentcore-control.ts
index 4ef9ff64..1222a619 100644
--- a/src/cli/aws/agentcore-control.ts
+++ b/src/cli/aws/agentcore-control.ts
@@ -2,6 +2,10 @@ import { getCredentialProvider } from './account';
 import {
   BedrockAgentCoreControlClient,
   GetAgentRuntimeCommand,
+  GetEvaluatorCommand,
+  GetOnlineEvaluationConfigCommand,
+  ListEvaluatorsCommand,
+  ListOnlineEvaluationConfigsCommand,
   UpdateOnlineEvaluationConfigCommand,
 } from '@aws-sdk/client-bedrock-agentcore-control';
 
@@ -40,6 +44,98 @@ export async function getAgentRuntimeStatus(options: GetAgentRuntimeStatusOption
   };
 }
 
+// ============================================================================
+// Evaluator
+// ============================================================================
+
+export interface GetEvaluatorOptions {
+  region: string;
+  evaluatorId: string;
+}
+
+export interface GetEvaluatorResult {
+  evaluatorId: string;
+  evaluatorArn: string;
+  evaluatorName: string;
+  level: string;
+  status: string;
+  description?: string;
+}
+
+/** Fetches a single evaluator by ID; throws when the response carries no evaluatorId. */
+export async function getEvaluator(options: GetEvaluatorOptions): Promise<GetEvaluatorResult> {
+  const { region, evaluatorId: requestedId } = options;
+  const client = new BedrockAgentCoreControlClient({ region, credentials: getCredentialProvider() });
+
+  const found = await client.send(new GetEvaluatorCommand({ evaluatorId: requestedId }));
+  const { evaluatorId, evaluatorArn, evaluatorName, level, status, description } = found;
+
+  // A response without an ID is reported as "not found".
+  if (!evaluatorId) {
+    throw new Error(`No evaluator found for ID ${requestedId}`);
+  }
+
+  // Optional response fields are replaced by fixed fallbacks so the result is fully populated.
+  const result: GetEvaluatorResult = {
+    evaluatorId,
+    evaluatorArn: evaluatorArn ?? '',
+    evaluatorName: evaluatorName ?? '',
+    level: level ?? 'SESSION',
+    status: status ?? 'UNKNOWN',
+    description,
+  };
+
+  return result;
+}
+
+export interface ListEvaluatorsOptions {
+  region: string;
+  maxResults?: number;
+  nextToken?: string;
+}
+
+export interface EvaluatorSummary {
+  evaluatorId: string;
+  evaluatorArn: string;
+  evaluatorName: string;
+  evaluatorType: string;
+  level?: string;
+  status: string;
+  description?: string;
+}
+
+export interface ListEvaluatorsResult {
+  evaluators: EvaluatorSummary[];
+  nextToken?: string;
+}
+
+/** Lists one page of evaluators; a response without a list yields an empty array. */
+export async function listEvaluators(options: ListEvaluatorsOptions): Promise<ListEvaluatorsResult> {
+  const { region, maxResults, nextToken } = options;
+  const client = new BedrockAgentCoreControlClient({ region, credentials: getCredentialProvider() });
+
+  const page = await client.send(new ListEvaluatorsCommand({ maxResults, nextToken }));
+
+  // Fill in fallbacks for fields missing from the response; level and description stay optional.
+  const evaluators: EvaluatorSummary[] = [];
+  for (const item of page.evaluators ?? []) {
+    evaluators.push({
+      evaluatorId: item.evaluatorId ?? '',
+      evaluatorArn: item.evaluatorArn ?? '',
+      evaluatorName: item.evaluatorName ?? '',
+      evaluatorType: item.evaluatorType ?? 'Custom',
+      level: item.level,
+      status: item.status ?? 'UNKNOWN',
+      description: item.description,
+    });
+  }
+
+  return {
+    evaluators,
+    nextToken: page.nextToken,
+  };
+}
+
 // ============================================================================
 // Online Eval Config
 // ============================================================================
@@ -82,3 +178,101 @@ export async function updateOnlineEvalExecutionStatus(
     status: response.status ?? 'UNKNOWN',
   };
 }
+
+export interface GetOnlineEvalConfigOptions {
+  region: string;
+  configId: string;
+}
+
+export interface GetOnlineEvalConfigResult {
+  configId: string;
+  configArn: string;
+  configName: string;
+  status: string;
+  executionStatus: string;
+  description?: string;
+  failureReason?: string;
+  outputLogGroupName?: string;
+}
+
+/** Fetches one online evaluation config by ID; throws when the response has no config ID. */
+export async function getOnlineEvaluationConfig(
+  options: GetOnlineEvalConfigOptions
+): Promise<GetOnlineEvalConfigResult> {
+  const { region, configId: requestedId } = options;
+  const client = new BedrockAgentCoreControlClient({ region, credentials: getCredentialProvider() });
+
+  const command = new GetOnlineEvaluationConfigCommand({ onlineEvaluationConfigId: requestedId });
+  const found = await client.send(command);
+
+  // A response without an ID is reported as "not found".
+  const configId = found.onlineEvaluationConfigId;
+  if (!configId) {
+    throw new Error(`No online evaluation config found for ID ${requestedId}`);
+  }
+
+  // Optional response fields get fixed fallbacks; the log group is only set when
+  // the response includes a CloudWatch output config.
+  const result: GetOnlineEvalConfigResult = {
+    configId,
+    configArn: found.onlineEvaluationConfigArn ?? '',
+    configName: found.onlineEvaluationConfigName ?? '',
+    status: found.status ?? 'UNKNOWN',
+    executionStatus: found.executionStatus ?? 'UNKNOWN',
+    description: found.description,
+    failureReason: found.failureReason,
+    outputLogGroupName: found.outputConfig?.cloudWatchConfig?.logGroupName,
+  };
+
+  return result;
+}
+
+export interface ListOnlineEvalConfigsOptions {
+  region: string;
+  maxResults?: number;
+  nextToken?: string;
+}
+
+export interface OnlineEvalConfigSummary {
+  configId: string;
+  configArn: string;
+  configName: string;
+  status: string;
+  executionStatus: string;
+  description?: string;
+  failureReason?: string;
+}
+
+export interface ListOnlineEvalConfigsResult {
+  configs: OnlineEvalConfigSummary[];
+  nextToken?: string;
+}
+
+/** Lists one page of online evaluation configs; a response without a list yields an empty array. */
+export async function listOnlineEvaluationConfigs(
+  options: ListOnlineEvalConfigsOptions
+): Promise<ListOnlineEvalConfigsResult> {
+  const { region, maxResults, nextToken } = options;
+  const client = new BedrockAgentCoreControlClient({ region, credentials: getCredentialProvider() });
+
+  const page = await client.send(new ListOnlineEvaluationConfigsCommand({ maxResults, nextToken }));
+
+  // Fill in fallbacks for fields missing from the response; description and failureReason stay optional.
+  const configs: OnlineEvalConfigSummary[] = [];
+  for (const item of page.onlineEvaluationConfigs ?? []) {
+    configs.push({
+      configId: item.onlineEvaluationConfigId ?? '',
+      configArn: item.onlineEvaluationConfigArn ?? '',
+      configName: item.onlineEvaluationConfigName ?? '',
+      status: item.status ?? 'UNKNOWN',
+      executionStatus: item.executionStatus ?? 'UNKNOWN',
+      description: item.description,
+      failureReason: item.failureReason,
+    });
+  }
+
+  return {
+    configs,
+    nextToken: page.nextToken,
+  };
+}
diff --git a/src/cli/commands/run/command.tsx b/src/cli/commands/run/command.tsx
index b63cfbab..076e054c 100644
--- a/src/cli/commands/run/command.tsx
+++ b/src/cli/commands/run/command.tsx
@@ -33,21 +33,27 @@ export const registerRun = (program: Command) => {
     .command('eval')
     .description('Run on-demand evaluation of agent traces')
     .option('-a, --agent <name>', 'Agent to evaluate')
+    .option('--agent-arn <arn>', 'Agent runtime ARN (bypasses project config)')
     .option('-e, --evaluator <names...>', 'Evaluator name(s) or Builtin.* IDs')
     .option('--evaluator-arn <arns...>', 'Evaluator ARN(s) to use directly')
+    .option('--region <region>', 'AWS region (optional with --agent-arn: defaults to the ARN region; inferred otherwise)')
     .option('--days <days>', 'Lookback window in days', '7')
     .option('--output <path>', 'Custom output file path for results')
     .option('--json', 'Output as JSON')
     .action(
       async (cliOptions: {
         agent?: string;
+        agentArn?: string;
         evaluator?: string[];
         evaluatorArn?: string[];
+        region?: string;
         days: string;
         output?: string;
         json?: boolean;
       }) => {
-        requireProject();
+        if (!cliOptions.agentArn) {
+          requireProject();
+        }
 
         if (!cliOptions.evaluator && !cliOptions.evaluatorArn) {
           const error = 'At least one --evaluator or --evaluator-arn is required';
@@ -61,8 +67,10 @@ export const registerRun = (program: Command) => {
 
         const options: RunEvalOptions = {
           agent: cliOptions.agent,
+          agentArn: cliOptions.agentArn,
           evaluator: cliOptions.evaluator ?? [],
           evaluatorArn: cliOptions.evaluatorArn,
+          region: cliOptions.region,
           days: parseInt(cliOptions.days, 10),
           output: cliOptions.output,
           json: cliOptions.json,
diff --git a/src/cli/operations/eval/__tests__/run-eval.test.ts b/src/cli/operations/eval/__tests__/run-eval.test.ts
index 9b19ea0e..4c5e2c83 100644
--- a/src/cli/operations/eval/__tests__/run-eval.test.ts
+++ b/src/cli/operations/eval/__tests__/run-eval.test.ts
@@ -449,6 +449,132 @@ describe('handleRunEval', () => {
     expect(result.run!.results[1]!.aggregateScore).toBe(4.5);
   });
 
+  // ─── ARN mode ─────────────────────────────────────────────────────────────
+
+  it('resolves context from agent runtime ARN without project config', async () => {
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/rt-arn-test',
+      evaluator: ['Builtin.Helpfulness'],
+      evaluatorArn: [],
+      days: 3,
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.run!.agent).toBe('rt-arn-test');
+    expect(mockLoadDeployedProjectConfig).not.toHaveBeenCalled();
+    expect(mockResolveAgent).not.toHaveBeenCalled();
+  });
+
+  it('uses --region override in ARN mode', async () => {
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/rt-region-test',
+      evaluator: ['Builtin.Helpfulness'],
+      region: 'eu-west-1',
+      days: 7,
+    });
+
+    expect(result.success).toBe(true);
+    // Should not load project config
+    expect(mockLoadDeployedProjectConfig).not.toHaveBeenCalled();
+  });
+
+  it('resolves evaluator ARNs in ARN mode', async () => {
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 5.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/rt-abc',
+      evaluator: [],
+      evaluatorArn: ['arn:aws:bedrock-agentcore:us-east-1:123456789012:evaluator/eval-xyz'],
+      days: 7,
+    });
+
+    expect(result.success).toBe(true);
+    expect(mockEvaluate).toHaveBeenCalledWith(expect.objectContaining({ evaluatorId: 'eval-xyz' }));
+  });
+
+  it('returns error for invalid ARN format', async () => {
+    const result = await handleRunEval({
+      agentArn: 'not-an-arn',
+      evaluator: ['Builtin.Helpfulness'],
+      days: 7,
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Invalid agent runtime ARN');
+  });
+
+  it('rejects custom evaluator names in ARN mode', async () => {
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/rt-abc',
+      evaluator: ['MyCustomEval'],
+      days: 7,
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('cannot be resolved in ARN mode');
+  });
+
+  it('saves to cwd in ARN mode when no --output is specified', async () => {
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/rt-save-test',
+      evaluator: ['Builtin.Helpfulness'],
+      days: 7,
+    });
+
+    expect(result.success).toBe(true);
+    // Should write to cwd, not call saveEvalRun (which requires a project)
+    expect(mockSaveEvalRun).not.toHaveBeenCalled();
+    expect(mockWriteFileSync).toHaveBeenCalledWith(expect.stringContaining('run_test-123.json'), expect.any(String));
+    expect(result.filePath).toContain('run_test-123.json');
+  });
+
+  it('saves to --output path in ARN mode', async () => {
+    setupCloudWatchToReturn([makeOtelSpanRow('s1', 't1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 's1' } } }],
+    });
+
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/rt-output-test',
+      evaluator: ['Builtin.Helpfulness'],
+      days: 7,
+      output: '/tmp/custom-eval.json',
+    });
+
+    expect(result.success).toBe(true);
+    expect(mockWriteFileSync).toHaveBeenCalledWith('/tmp/custom-eval.json', expect.any(String));
+    expect(result.filePath).toBe('/tmp/custom-eval.json');
+  });
+
+  it('returns error when no evaluators in ARN mode', async () => {
+    const result = await handleRunEval({
+      agentArn: 'arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/rt-abc',
+      evaluator: [],
+      days: 7,
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('No evaluators specified');
+  });
+
   // ─── Query sanitization ───────────────────────────────────────────────────
 
   it('sanitizes runtimeId in CloudWatch query to prevent injection', async () => {
diff --git a/src/cli/operations/eval/run-eval.ts b/src/cli/operations/eval/run-eval.ts
index d925df3f..c9775f1e 100644
--- a/src/cli/operations/eval/run-eval.ts
+++ b/src/cli/operations/eval/run-eval.ts
@@ -9,6 +9,7 @@ import { CloudWatchLogsClient, GetQueryResultsCommand, StartQueryCommand } from
 import type { ResultField } from '@aws-sdk/client-cloudwatch-logs';
 import type { DocumentType } from '@smithy/types';
 import { writeFileSync } from 'fs';
+import { join } from 'path';
 
 const SPANS_LOG_GROUP = 'aws/spans';
 
@@ -19,18 +20,97 @@ const SUPPORTED_SCOPES = new Set([
 ]);
 
 interface ResolvedEvalContext {
-  agentName: string;
+  agentLabel: string;
   region: string;
-  accountId: string;
   runtimeId: string;
   runtimeLogGroup: string;
   evaluatorIds: string[];
+  evaluatorLabels: string[];
 }
 
-function resolveEvalContext(
-  context: DeployedProjectConfig,
-  options: RunEvalOptions
-): { success: true; ctx: ResolvedEvalContext } | { success: false; error: string } {
+type ResolveResult = { success: true; ctx: ResolvedEvalContext } | { success: false; error: string };
+
+/**
+ * Resolve evaluator IDs from ARN strings or raw IDs.
+ * Returns the extracted evaluator ID (last segment of ARN, or the value as-is).
+ */
+function resolveEvaluatorArns(arns: string[]): string[] {
+  return arns.map(arnOrId => {
+    const arnMatch = /evaluator\/(.+)$/.exec(arnOrId);
+    return arnMatch ? arnMatch[1]! : arnOrId;
+  });
+}
+
+/**
+ * ARN mode: resolve context directly from an agent runtime ARN.
+ * No project config needed.
+ */
+function resolveFromArn(options: RunEvalOptions): ResolveResult {
+  const arn = options.agentArn!;
+
+  // Parse ARN: arn:aws:bedrock-agentcore:<region>:<account>:runtime/<runtimeId>
+  const arnParts = arn.split(':');
+  if (arnParts.length < 6) {
+    return { success: false, error: `Invalid agent runtime ARN: ${arn}` };
+  }
+
+  const region = options.region ?? arnParts[3];
+  if (!region) {
+    return { success: false, error: 'Could not determine region from ARN. Use --region to specify.' };
+  }
+
+  const resourcePart = arnParts.slice(5).join(':');
+  const runtimeMatch = /runtime\/(.+)$/.exec(resourcePart);
+  if (!runtimeMatch) {
+    return { success: false, error: `Could not extract runtime ID from ARN: ${arn}` };
+  }
+  const runtimeId = runtimeMatch[1]!;
+
+  // In ARN mode, evaluators must come from --evaluator-arn or Builtin.* names
+  const evaluatorIds: string[] = [];
+  const evaluatorLabels: string[] = [];
+
+  for (const evalName of options.evaluator) {
+    if (evalName.startsWith('Builtin.')) {
+      evaluatorIds.push(evalName);
+      evaluatorLabels.push(evalName);
+    } else {
+      return {
+        success: false,
+        error: `Custom evaluator "${evalName}" cannot be resolved in ARN mode. Use --evaluator-arn with an evaluator ARN or ID, or use Builtin.* evaluators.`,
+      };
+    }
+  }
+
+  if (options.evaluatorArn) {
+    const resolved = resolveEvaluatorArns(options.evaluatorArn);
+    evaluatorIds.push(...resolved);
+    evaluatorLabels.push(...options.evaluatorArn);
+  }
+
+  if (evaluatorIds.length === 0) {
+    return { success: false, error: 'No evaluators specified. Use -e/--evaluator with Builtin.* or --evaluator-arn.' };
+  }
+
+  const runtimeLogGroup = `/aws/bedrock-agentcore/runtimes/${runtimeId}-${DEFAULT_ENDPOINT_NAME}`;
+
+  return {
+    success: true,
+    ctx: {
+      agentLabel: runtimeId,
+      region,
+      runtimeId,
+      runtimeLogGroup,
+      evaluatorIds,
+      evaluatorLabels,
+    },
+  };
+}
+
+/**
+ * Project mode: resolve context from agentcore.json + deployed-state.json.
+ */
+function resolveFromProject(context: DeployedProjectConfig, options: RunEvalOptions): ResolveResult {
   const agentResult = resolveAgent(context, { agent: options.agent });
   if (!agentResult.success) {
     return agentResult;
@@ -41,11 +121,13 @@ function resolveEvalContext(
 
   // Resolve evaluator names to IDs
   const evaluatorIds: string[] = [];
+  const evaluatorLabels: string[] = [];
   const targetResources = context.deployedState.targets[agent.targetName]?.resources;
 
   for (const evalName of options.evaluator) {
     if (evalName.startsWith('Builtin.')) {
       evaluatorIds.push(evalName);
+      evaluatorLabels.push(evalName);
       continue;
     }
 
@@ -57,14 +139,14 @@ function resolveEvalContext(
       };
     }
     evaluatorIds.push(deployedEval.evaluatorId);
+    evaluatorLabels.push(evalName);
   }
 
-  // Also add any direct ARNs/IDs — extract ID from ARN if full ARN is passed
+  // Also add any direct ARNs/IDs
   if (options.evaluatorArn) {
-    for (const arnOrId of options.evaluatorArn) {
-      const arnMatch = /evaluator\/(.+)$/.exec(arnOrId);
-      evaluatorIds.push(arnMatch ? arnMatch[1]! : arnOrId);
-    }
+    const resolved = resolveEvaluatorArns(options.evaluatorArn);
+    evaluatorIds.push(...resolved);
+    evaluatorLabels.push(...options.evaluatorArn);
   }
 
   if (evaluatorIds.length === 0) {
@@ -74,12 +156,12 @@ function resolveEvalContext(
   return {
     success: true,
     ctx: {
-      agentName: agent.agentName,
+      agentLabel: agent.agentName,
       region: agent.region,
-      accountId: agent.accountId,
       runtimeId: agent.runtimeId,
       runtimeLogGroup,
       evaluatorIds,
+      evaluatorLabels,
     },
   };
 }
@@ -307,8 +389,14 @@ export interface RunEvalResult {
 }
 
 export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalResult> {
-  const context = await loadDeployedProjectConfig();
-  const resolution = resolveEvalContext(context, options);
+  let resolution: ResolveResult;
+
+  if (options.agentArn) {
+    resolution = resolveFromArn(options);
+  } else {
+    const context = await loadDeployedProjectConfig();
+    resolution = resolveFromProject(context, options);
+  }
 
   if (!resolution.success) {
     return { success: false, error: resolution.error };
@@ -322,17 +410,16 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
   if (sessions.length === 0) {
     return {
       success: false,
-      error: `No session spans found for agent "${ctx.agentName}" in the last ${options.days} day(s). Has the agent been invoked?`,
+      error: `No session spans found for agent "${ctx.agentLabel}" in the last ${options.days} day(s). Has the agent been invoked?`,
     };
   }
 
   // Run each evaluator against each session
   const results: EvalEvaluatorResult[] = [];
-  const allEvaluatorNames = [...options.evaluator, ...(options.evaluatorArn ?? [])];
 
   for (let i = 0; i < ctx.evaluatorIds.length; i++) {
     const evaluatorId = ctx.evaluatorIds[i]!;
-    const evaluatorName = allEvaluatorNames[i] ?? evaluatorId;
+    const evaluatorName = ctx.evaluatorLabels[i] ?? evaluatorId;
 
     const sessionScores: EvalSessionScore[] = [];
     let totalInputTokens = 0;
@@ -379,8 +466,8 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
   const run: EvalRunResult = {
     runId: generateRunId(),
     timestamp: new Date().toISOString(),
-    agent: ctx.agentName,
-    evaluators: allEvaluatorNames,
+    agent: ctx.agentLabel,
+    evaluators: ctx.evaluatorLabels,
     lookbackDays: options.days,
     sessionCount: sessions.length,
     results,
@@ -391,6 +478,11 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
   if (options.output) {
     writeFileSync(options.output, JSON.stringify(run, null, 2));
     filePath = options.output;
+  } else if (options.agentArn) {
+    // ARN mode may not have a project directory — save to cwd
+    const fallbackPath = join(process.cwd(), `${run.runId}.json`);
+    writeFileSync(fallbackPath, JSON.stringify(run, null, 2));
+    filePath = fallbackPath;
   } else {
     filePath = saveEvalRun(run);
   }
diff --git a/src/cli/operations/eval/types.ts b/src/cli/operations/eval/types.ts
index 73447aba..e78c3b92 100644
--- a/src/cli/operations/eval/types.ts
+++ b/src/cli/operations/eval/types.ts
@@ -34,9 +34,16 @@ export interface EvalRunResult {
 
 /** Options for running an eval */
 export interface RunEvalOptions {
+  /** Agent name (project mode) */
   agent?: string;
+  /** Evaluator names or Builtin.* IDs (resolved via project deployed state) */
   evaluator: string[];
+  /** Evaluator ARN(s) or IDs passed directly */
   evaluatorArn?: string[];
+  /** Agent runtime ARN (ARN mode — bypasses project config) */
+  agentArn?: string;
+  /** AWS region (required with --agent-arn, inferred otherwise) */
+  region?: string;
   days: number;
   output?: string;
   json?: boolean;

From c64d4e46b1f3018a14341e0341247c725691464b Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Thu, 12 Mar 2026 01:50:47 -0400
Subject: [PATCH 6/9] feat: add eval discovery commands, status enrichment, and
 schema updates

---
 src/cli/aws/agentcore-control.ts              |  19 +-
 src/cli/commands/eval/command.tsx             | 239 ++++++++++++++
 src/cli/commands/run/command.tsx              |   6 +
 .../commands/status/__tests__/action.test.ts  | 212 +++++++++++-
 src/cli/commands/status/action.ts             |  77 +++++
 .../eval/__tests__/logs-eval.test.ts          |  96 +++++-
 .../eval/__tests__/run-eval.test.ts           | 302 +++++++++++++++++-
 src/cli/operations/eval/logs-eval.ts          |  50 ++-
 src/cli/operations/eval/run-eval.ts           | 154 +++++++--
 src/cli/operations/eval/types.ts              |   4 +
 src/cli/tui/screens/online-eval/types.ts      |   2 +
 .../__tests__/online-eval-config.test.ts      |  19 ++
 .../schemas/primitives/online-eval-config.ts  |   4 +
 13 files changed, 1145 insertions(+), 39 deletions(-)

diff --git a/src/cli/aws/agentcore-control.ts b/src/cli/aws/agentcore-control.ts
index 1222a619..070f2b25 100644
--- a/src/cli/aws/agentcore-control.ts
+++ b/src/cli/aws/agentcore-control.ts
@@ -148,6 +148,13 @@ export interface UpdateOnlineEvalStatusOptions {
   executionStatus: OnlineEvalExecutionStatus;
 }
 
+export interface UpdateOnlineEvalOptions {
+  region: string;
+  onlineEvaluationConfigId: string;
+  executionStatus?: OnlineEvalExecutionStatus;
+  description?: string;
+}
+
 export interface UpdateOnlineEvalStatusResult {
   configId: string;
   executionStatus: string;
@@ -160,6 +167,13 @@ export interface UpdateOnlineEvalStatusResult {
 export async function updateOnlineEvalExecutionStatus(
   options: UpdateOnlineEvalStatusOptions
 ): Promise<UpdateOnlineEvalStatusResult> {
+  return updateOnlineEvalConfig(options);
+}
+
+/**
+ * Update an online evaluation config with any supported fields.
+ */
+export async function updateOnlineEvalConfig(options: UpdateOnlineEvalOptions): Promise<UpdateOnlineEvalStatusResult> {
   const client = new BedrockAgentCoreControlClient({
     region: options.region,
     credentials: getCredentialProvider(),
@@ -167,14 +181,15 @@ export async function updateOnlineEvalExecutionStatus(
 
   const command = new UpdateOnlineEvaluationConfigCommand({
     onlineEvaluationConfigId: options.onlineEvaluationConfigId,
-    executionStatus: options.executionStatus,
+    ...(options.executionStatus && { executionStatus: options.executionStatus }),
+    ...(options.description !== undefined && { description: options.description }),
   });
 
   const response = await client.send(command);
 
   return {
     configId: response.onlineEvaluationConfigId ?? options.onlineEvaluationConfigId,
-    executionStatus: response.executionStatus ?? options.executionStatus,
+    executionStatus: response.executionStatus ?? options.executionStatus ?? 'UNKNOWN',
     status: response.status ?? 'UNKNOWN',
   };
 }
diff --git a/src/cli/commands/eval/command.tsx b/src/cli/commands/eval/command.tsx
index 4184b1cb..0e0cf0b0 100644
--- a/src/cli/commands/eval/command.tsx
+++ b/src/cli/commands/eval/command.tsx
@@ -1,3 +1,12 @@
+import {
+  getEvaluator,
+  getOnlineEvaluationConfig,
+  listEvaluators,
+  listOnlineEvaluationConfigs,
+  updateOnlineEvalConfig,
+} from '../../aws/agentcore-control';
+import type { OnlineEvalExecutionStatus } from '../../aws/agentcore-control';
+import { detectRegion } from '../../aws/region';
 import { getErrorMessage } from '../../errors';
 import { handleGetEvalRun, handleListEvalRuns } from '../../operations/eval';
 import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
@@ -135,4 +144,234 @@ export const registerEval = (program: Command) => {
         }
       }
     );
+
+  // ── WI-7: Evaluator Discovery Commands ──────────────────────────────
+
+  evalCmd
+    .command('list-evaluators')
+    .description('List available evaluators (built-in and custom)')
+    .option('--region <region>', 'AWS region')
+    .option('--max-results <n>', 'Maximum number of results')
+    .option('--json', 'Output as JSON')
+    .action(async (cliOptions: { region?: string; maxResults?: string; json?: boolean }) => {
+      try {
+        const region = cliOptions.region ?? (await detectRegion()).region;
+        const result = await listEvaluators({
+          region,
+          maxResults: cliOptions.maxResults ? parseInt(cliOptions.maxResults, 10) : undefined,
+        });
+
+        if (cliOptions.json) {
+          console.log(JSON.stringify(result));
+          return;
+        }
+
+        if (result.evaluators.length === 0) {
+          console.log('No evaluators found.');
+          return;
+        }
+
+        console.log(`\n${'ID'.padEnd(45)} ${'Name'.padEnd(30)} ${'Type'.padEnd(10)} ${'Level'.padEnd(12)} Status`);
+        console.log('─'.repeat(110));
+
+        for (const e of result.evaluators) {
+          console.log(
+            `${e.evaluatorId.padEnd(45)} ${e.evaluatorName.padEnd(30)} ${e.evaluatorType.padEnd(10)} ${(e.level ?? '—').padEnd(12)} ${e.status}`
+          );
+        }
+        console.log('');
+      } catch (error) {
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+        } else {
+          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+
+  evalCmd
+    .command('get-evaluator')
+    .description('Get details of a specific evaluator')
+    .argument('<evaluatorId>', 'Evaluator ID')
+    .option('--region <region>', 'AWS region')
+    .option('--json', 'Output as JSON')
+    .action(async (evaluatorId: string, cliOptions: { region?: string; json?: boolean }) => {
+      try {
+        const region = cliOptions.region ?? (await detectRegion()).region;
+        const result = await getEvaluator({ region, evaluatorId });
+
+        if (cliOptions.json) {
+          console.log(JSON.stringify(result));
+          return;
+        }
+
+        console.log(`\nEvaluator: ${result.evaluatorName}`);
+        console.log(`ID: ${result.evaluatorId}`);
+        console.log(`ARN: ${result.evaluatorArn}`);
+        console.log(`Level: ${result.level}`);
+        console.log(`Status: ${result.status}`);
+        if (result.description) {
+          console.log(`Description: ${result.description}`);
+        }
+        console.log('');
+      } catch (error) {
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+        } else {
+          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+
+  // ── WI-8: Online Eval Config Inspection Commands ────────────────────
+
+  evalCmd
+    .command('list-online')
+    .description('List online evaluation configs')
+    .option('--region <region>', 'AWS region')
+    .option('--max-results <n>', 'Maximum number of results')
+    .option('--json', 'Output as JSON')
+    .action(async (cliOptions: { region?: string; maxResults?: string; json?: boolean }) => {
+      try {
+        const region = cliOptions.region ?? (await detectRegion()).region;
+        const result = await listOnlineEvaluationConfigs({
+          region,
+          maxResults: cliOptions.maxResults ? parseInt(cliOptions.maxResults, 10) : undefined,
+        });
+
+        if (cliOptions.json) {
+          console.log(JSON.stringify(result));
+          return;
+        }
+
+        if (result.configs.length === 0) {
+          console.log('No online eval configs found.');
+          return;
+        }
+
+        console.log(`\n${'ID'.padEnd(50)} ${'Name'.padEnd(30)} ${'Status'.padEnd(18)} Execution`);
+        console.log('─'.repeat(115));
+
+        for (const c of result.configs) {
+          const failSuffix = c.failureReason ? ` (${c.failureReason})` : '';
+          console.log(
+            `${c.configId.padEnd(50)} ${c.configName.padEnd(30)} ${c.status.padEnd(18)} ${c.executionStatus}${failSuffix}`
+          );
+        }
+        console.log('');
+      } catch (error) {
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+        } else {
+          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+
+  evalCmd
+    .command('get-online')
+    .description('Get details of a specific online evaluation config')
+    .argument('<configId>', 'Online evaluation config ID')
+    .option('--region <region>', 'AWS region')
+    .option('--json', 'Output as JSON')
+    .action(async (configId: string, cliOptions: { region?: string; json?: boolean }) => {
+      try {
+        const region = cliOptions.region ?? (await detectRegion()).region;
+        const result = await getOnlineEvaluationConfig({ region, configId });
+
+        if (cliOptions.json) {
+          console.log(JSON.stringify(result));
+          return;
+        }
+
+        console.log(`\nOnline Eval Config: ${result.configName}`);
+        console.log(`ID: ${result.configId}`);
+        console.log(`ARN: ${result.configArn}`);
+        console.log(`Status: ${result.status}`);
+        console.log(`Execution: ${result.executionStatus}`);
+        if (result.description) {
+          console.log(`Description: ${result.description}`);
+        }
+        if (result.failureReason) {
+          console.log(`Failure: ${result.failureReason}`);
+        }
+        if (result.outputLogGroupName) {
+          console.log(`Log Group: ${result.outputLogGroupName}`);
+        }
+        console.log('');
+      } catch (error) {
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+        } else {
+          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+        }
+        process.exit(1);
+      }
+    });
+
+  // ── WI-9: Online Eval Config Update ─────────────────────────────────
+
+  evalCmd
+    .command('update-online')
+    .description('Update a deployed online evaluation config')
+    .argument('<configId>', 'Online evaluation config ID')
+    .option('--status <status>', 'Set execution status (ENABLED or DISABLED)')
+    .option('--description <text>', 'Set config description')
+    .option('--region <region>', 'AWS region')
+    .option('--json', 'Output as JSON')
+    .action(
+      async (
+        configId: string,
+        cliOptions: { status?: string; description?: string; region?: string; json?: boolean }
+      ) => {
+        try {
+          if (!cliOptions.status && cliOptions.description === undefined) {
+            const error = 'At least one of --status or --description is required';
+            if (cliOptions.json) {
+              console.log(JSON.stringify({ success: false, error }));
+            } else {
+              render(<Text color="red">{error}</Text>);
+            }
+            process.exit(1);
+          }
+
+          if (cliOptions.status && !['ENABLED', 'DISABLED'].includes(cliOptions.status)) {
+            const error = `Invalid status "${cliOptions.status}". Must be ENABLED or DISABLED.`;
+            if (cliOptions.json) {
+              console.log(JSON.stringify({ success: false, error }));
+            } else {
+              render(<Text color="red">{error}</Text>);
+            }
+            process.exit(1);
+          }
+
+          const region = cliOptions.region ?? (await detectRegion()).region;
+          const result = await updateOnlineEvalConfig({
+            region,
+            onlineEvaluationConfigId: configId,
+            executionStatus: cliOptions.status as OnlineEvalExecutionStatus | undefined,
+            description: cliOptions.description,
+          });
+
+          if (cliOptions.json) {
+            console.log(JSON.stringify(result));
+            return;
+          }
+
+          console.log(`Updated online eval config "${configId}"`);
+          console.log(`  Status: ${result.status}`);
+          console.log(`  Execution: ${result.executionStatus}`);
+        } catch (error) {
+          if (cliOptions.json) {
+            console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
+          } else {
+            render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
+          }
+          process.exit(1);
+        }
+      }
+    );
 };
diff --git a/src/cli/commands/run/command.tsx b/src/cli/commands/run/command.tsx
index 076e054c..bf0400e3 100644
--- a/src/cli/commands/run/command.tsx
+++ b/src/cli/commands/run/command.tsx
@@ -37,6 +37,8 @@ export const registerRun = (program: Command) => {
     .option('-e, --evaluator <names...>', 'Evaluator name(s) or Builtin.* IDs')
     .option('--evaluator-arn <arns...>', 'Evaluator ARN(s) to use directly')
     .option('--region <region>', 'AWS region (required with --agent-arn, inferred otherwise)')
+    .option('-s, --session-id <id>', 'Evaluate a specific session only')
+    .option('-t, --trace-id <id>', 'Evaluate a specific trace only')
     .option('--days <days>', 'Lookback window in days', '7')
     .option('--output <path>', 'Custom output file path for results')
     .option('--json', 'Output as JSON')
@@ -47,6 +49,8 @@ export const registerRun = (program: Command) => {
         evaluator?: string[];
         evaluatorArn?: string[];
         region?: string;
+        sessionId?: string;
+        traceId?: string;
         days: string;
         output?: string;
         json?: boolean;
@@ -71,6 +75,8 @@ export const registerRun = (program: Command) => {
           evaluator: cliOptions.evaluator ?? [],
           evaluatorArn: cliOptions.evaluatorArn,
           region: cliOptions.region,
+          sessionId: cliOptions.sessionId,
+          traceId: cliOptions.traceId,
           days: parseInt(cliOptions.days, 10),
           output: cliOptions.output,
           json: cliOptions.json,
diff --git a/src/cli/commands/status/__tests__/action.test.ts b/src/cli/commands/status/__tests__/action.test.ts
index 603f9574..eeb6310a 100644
--- a/src/cli/commands/status/__tests__/action.test.ts
+++ b/src/cli/commands/status/__tests__/action.test.ts
@@ -1,6 +1,32 @@
 import type { AgentCoreMcpSpec, AgentCoreProjectSpec, DeployedResourceState } from '../../../../schema/index.js';
-import { computeResourceStatuses } from '../action.js';
-import { describe, expect, it } from 'vitest';
+import { computeResourceStatuses, handleProjectStatus } from '../action.js';
+import type { StatusContext } from '../action.js';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+const mockGetAgentRuntimeStatus = vi.fn();
+const mockGetEvaluator = vi.fn();
+const mockGetOnlineEvaluationConfig = vi.fn();
+
+vi.mock('../../../aws', () => ({
+  getAgentRuntimeStatus: (...args: unknown[]) => mockGetAgentRuntimeStatus(...args),
+}));
+
+vi.mock('../../../aws/agentcore-control', () => ({
+  getEvaluator: (...args: unknown[]) => mockGetEvaluator(...args),
+  getOnlineEvaluationConfig: (...args: unknown[]) => mockGetOnlineEvaluationConfig(...args),
+}));
+
+vi.mock('../../../logging', () => {
+  return {
+    ExecLogger: class {
+      startStep = vi.fn();
+      endStep = vi.fn();
+      log = vi.fn();
+      finalize = vi.fn();
+      getRelativeLogPath = vi.fn().mockReturnValue('logs/status.log');
+    },
+  };
+});
 
 const baseProject: AgentCoreProjectSpec = {
   name: 'test-project',
@@ -411,3 +437,185 @@ describe('computeResourceStatuses', () => {
     expect(deployedCred!.deploymentState).toBe('deployed');
   });
 });
+
+describe('handleProjectStatus — live enrichment', () => {
+  beforeEach(() => {
+    mockGetAgentRuntimeStatus.mockReset();
+    mockGetEvaluator.mockReset();
+    mockGetOnlineEvaluationConfig.mockReset();
+  });
+
+  afterEach(() => vi.clearAllMocks());
+
+  function makeContext(overrides: Partial<StatusContext> = {}): StatusContext {
+    return {
+      project: {
+        ...baseProject,
+        evaluators: [{ name: 'MyEval', level: 'SESSION', config: {} }],
+        onlineEvalConfigs: [{ name: 'MyConfig', agents: ['agent1'], evaluators: ['Builtin.Helpfulness'] }],
+      } as unknown as AgentCoreProjectSpec,
+      awsTargets: [{ name: 'dev', region: 'us-east-1', account: '123456789' }],
+      deployedState: {
+        targets: {
+          dev: {
+            resources: {
+              evaluators: {
+                MyEval: {
+                  evaluatorId: 'eval-123',
+                  evaluatorArn: 'arn:aws:bedrock:us-east-1:123456789:evaluator/eval-123',
+                },
+              },
+              onlineEvalConfigs: {
+                MyConfig: {
+                  onlineEvaluationConfigId: 'cfg-456',
+                  onlineEvaluationConfigArn: 'arn:aws:bedrock:us-east-1:123456789:online-evaluation-config/cfg-456',
+                },
+              },
+            },
+          },
+        },
+      },
+      ...overrides,
+    } as unknown as StatusContext;
+  }
+
+  it('enriches deployed evaluators with live status', async () => {
+    mockGetEvaluator.mockResolvedValue({
+      evaluatorId: 'eval-123',
+      evaluatorName: 'MyEval',
+      status: 'ACTIVE',
+      level: 'SESSION',
+    });
+    mockGetOnlineEvaluationConfig.mockResolvedValue({
+      configId: 'cfg-456',
+      configName: 'MyConfig',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+    });
+
+    const result = await handleProjectStatus(makeContext());
+
+    expect(result.success).toBe(true);
+
+    const evalEntry = result.resources.find(r => r.resourceType === 'evaluator' && r.name === 'MyEval');
+    expect(evalEntry).toBeDefined();
+    expect(evalEntry!.detail).toContain('ACTIVE');
+
+    expect(mockGetEvaluator).toHaveBeenCalledWith({
+      region: 'us-east-1',
+      evaluatorId: 'eval-123',
+    });
+  });
+
+  it('enriches deployed online eval configs with live status', async () => {
+    mockGetEvaluator.mockResolvedValue({
+      evaluatorId: 'eval-123',
+      evaluatorName: 'MyEval',
+      status: 'ACTIVE',
+      level: 'SESSION',
+    });
+    mockGetOnlineEvaluationConfig.mockResolvedValue({
+      configId: 'cfg-456',
+      configName: 'MyConfig',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+    });
+
+    const result = await handleProjectStatus(makeContext());
+
+    expect(result.success).toBe(true);
+
+    const configEntry = result.resources.find(r => r.resourceType === 'online-eval' && r.name === 'MyConfig');
+    expect(configEntry).toBeDefined();
+    expect(configEntry!.detail).toContain('ACTIVE');
+    expect(configEntry!.detail).toContain('ENABLED');
+
+    expect(mockGetOnlineEvaluationConfig).toHaveBeenCalledWith({
+      region: 'us-east-1',
+      configId: 'cfg-456',
+    });
+  });
+
+  it('sets error on evaluator when getEvaluator fails', async () => {
+    mockGetEvaluator.mockRejectedValue(new Error('AccessDenied'));
+    mockGetOnlineEvaluationConfig.mockResolvedValue({
+      configId: 'cfg-456',
+      configName: 'MyConfig',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+    });
+
+    const result = await handleProjectStatus(makeContext());
+
+    expect(result.success).toBe(true);
+
+    const evalEntry = result.resources.find(r => r.resourceType === 'evaluator' && r.name === 'MyEval');
+    expect(evalEntry).toBeDefined();
+    expect(evalEntry!.error).toBe('AccessDenied');
+  });
+
+  it('sets error on online eval config when getOnlineEvaluationConfig fails', async () => {
+    mockGetEvaluator.mockResolvedValue({
+      evaluatorId: 'eval-123',
+      evaluatorName: 'MyEval',
+      status: 'ACTIVE',
+      level: 'SESSION',
+    });
+    mockGetOnlineEvaluationConfig.mockRejectedValue(new Error('ResourceNotFound'));
+
+    const result = await handleProjectStatus(makeContext());
+
+    expect(result.success).toBe(true);
+
+    const configEntry = result.resources.find(r => r.resourceType === 'online-eval' && r.name === 'MyConfig');
+    expect(configEntry).toBeDefined();
+    expect(configEntry!.error).toBe('ResourceNotFound');
+  });
+
+  it('skips enrichment when no target config is found', async () => {
+    const ctx = makeContext({
+      awsTargets: [] as unknown as StatusContext['awsTargets'],
+      deployedState: {
+        targets: {
+          dev: {
+            resources: {
+              evaluators: {
+                MyEval: {
+                  evaluatorId: 'eval-123',
+                  evaluatorArn: 'arn:aws:bedrock:us-east-1:123456789:evaluator/eval-123',
+                },
+              },
+            },
+          },
+        },
+      } as unknown as StatusContext['deployedState'],
+    });
+
+    const result = await handleProjectStatus(ctx);
+
+    expect(result.success).toBe(true);
+    expect(mockGetEvaluator).not.toHaveBeenCalled();
+    expect(mockGetOnlineEvaluationConfig).not.toHaveBeenCalled();
+  });
+
+  it('does not enrich local-only evaluators', async () => {
+    const ctx = makeContext({
+      deployedState: {
+        targets: {
+          dev: {
+            resources: {},
+          },
+        },
+      } as unknown as StatusContext['deployedState'],
+    });
+
+    const result = await handleProjectStatus(ctx);
+
+    expect(result.success).toBe(true);
+
+    const evalEntry = result.resources.find(r => r.resourceType === 'evaluator' && r.name === 'MyEval');
+    expect(evalEntry).toBeDefined();
+    expect(evalEntry!.deploymentState).toBe('local-only');
+    expect(mockGetEvaluator).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/cli/commands/status/action.ts b/src/cli/commands/status/action.ts
index 0b00b144..49b3ebb1 100644
--- a/src/cli/commands/status/action.ts
+++ b/src/cli/commands/status/action.ts
@@ -7,6 +7,7 @@ import type {
   DeployedState,
 } from '../../../schema';
 import { getAgentRuntimeStatus } from '../../aws';
+import { getEvaluator, getOnlineEvaluationConfig } from '../../aws/agentcore-control';
 import { getErrorMessage } from '../../errors';
 import { ExecLogger } from '../../logging';
 import type { ResourceDeploymentState } from './constants';
@@ -262,6 +263,82 @@ export async function handleProjectStatus(
       const hasErrors = resources.some(r => r.error);
       logger.endStep(hasErrors ? 'error' : 'success');
     }
+
+    // Enrich deployed evaluators with the live status reported by getEvaluator
+    const evaluatorStates = targetResources?.evaluators ?? {};
+    const deployedEvaluators = resources.filter(
+      e => e.resourceType === 'evaluator' && e.deploymentState === 'deployed' && evaluatorStates[e.name]
+    );
+
+    if (deployedEvaluators.length > 0) {
+      const evaluatorCount = deployedEvaluators.length;
+      logger.startStep(`Fetch evaluator status (${evaluatorCount} evaluator${evaluatorCount !== 1 ? 's' : ''})`);
+
+      await Promise.all(
+        resources.map(async (entry, i) => {
+          if (entry.resourceType !== 'evaluator' || entry.deploymentState !== 'deployed') return;
+
+          const evalState = evaluatorStates[entry.name];
+          if (!evalState) return;
+
+          try {
+            const { status: liveStatus } = await getEvaluator({
+              region: targetConfig.region,
+              evaluatorId: evalState.evaluatorId,
+            });
+            // Guard against a missing detail so the label never renders as "undefined — <status>"
+            resources[i] = { ...entry, detail: entry.detail ? `${entry.detail} — ${liveStatus}` : liveStatus };
+            logger.log(`  ${entry.name}: ${liveStatus} (${evalState.evaluatorId})`);
+          } catch (error) {
+            const errorMsg = getErrorMessage(error);
+            resources[i] = { ...entry, error: errorMsg };
+            logger.log(`  ${entry.name}: ERROR - ${errorMsg}`, 'error');
+          }
+        })
+      );
+
+      const hasEvalErrors = resources.some(r => r.resourceType === 'evaluator' && r.error);
+      logger.endStep(hasEvalErrors ? 'error' : 'success');
+    }
+
+    // Enrich deployed online eval configs with live status
+    const onlineEvalStates = targetResources?.onlineEvalConfigs ?? {};
+    const deployedOnlineEvals = resources.filter(
+      e => e.resourceType === 'online-eval' && e.deploymentState === 'deployed' && onlineEvalStates[e.name]
+    );
+
+    if (deployedOnlineEvals.length > 0) {
+      logger.startStep(
+        `Fetch online eval status (${deployedOnlineEvals.length} config${deployedOnlineEvals.length !== 1 ? 's' : ''})`
+      );
+
+      await Promise.all(
+        resources.map(async (entry, i) => {
+          if (entry.resourceType !== 'online-eval' || entry.deploymentState !== 'deployed') return;
+
+          const configState = onlineEvalStates[entry.name];
+          if (!configState) return;
+
+          try {
+            const configResult = await getOnlineEvaluationConfig({
+              region: targetConfig.region,
+              configId: configState.onlineEvaluationConfigId,
+            });
+            const statusLabel = `${configResult.status} (${configResult.executionStatus})`;
+            const detail = entry.detail ? `${entry.detail} — ${statusLabel}` : statusLabel;
+            resources[i] = { ...entry, detail };
+            logger.log(`  ${entry.name}: ${statusLabel} (${configState.onlineEvaluationConfigId})`);
+          } catch (error) {
+            const errorMsg = getErrorMessage(error);
+            resources[i] = { ...entry, error: errorMsg };
+            logger.log(`  ${entry.name}: ERROR - ${errorMsg}`, 'error');
+          }
+        })
+      );
+
+      const hasOnlineEvalErrors = resources.some(r => r.resourceType === 'online-eval' && r.error);
+      logger.endStep(hasOnlineEvalErrors ? 'error' : 'success');
+    }
   }
 
   logger.finalize(true);
diff --git a/src/cli/operations/eval/__tests__/logs-eval.test.ts b/src/cli/operations/eval/__tests__/logs-eval.test.ts
index 70c88252..a512a1a0 100644
--- a/src/cli/operations/eval/__tests__/logs-eval.test.ts
+++ b/src/cli/operations/eval/__tests__/logs-eval.test.ts
@@ -1,8 +1,9 @@
 import { handleLogsEval } from '../logs-eval.js';
-import { afterEach, describe, expect, it, vi } from 'vitest';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 
 const mockLoadDeployedProjectConfig = vi.fn();
 const mockResolveAgent = vi.fn();
+const mockGetOnlineEvaluationConfig = vi.fn();
 const mockSearchLogs = vi.fn();
 const mockStreamLogs = vi.fn();
 
@@ -11,6 +12,10 @@ vi.mock('../../resolve-agent', () => ({
   resolveAgent: (...args: unknown[]) => mockResolveAgent(...args),
 }));
 
+vi.mock('../../../aws/agentcore-control', () => ({
+  getOnlineEvaluationConfig: (...args: unknown[]) => mockGetOnlineEvaluationConfig(...args),
+}));
+
 vi.mock('../../../aws/cloudwatch', () => ({
   searchLogs: (...args: unknown[]) => mockSearchLogs(...args),
   streamLogs: (...args: unknown[]) => mockStreamLogs(...args),
@@ -71,6 +76,19 @@ function makeResolvedAgent(agentName = 'my-agent') {
 }
 
 describe('handleLogsEval', () => {
+  beforeEach(() => {
+    // Default: API returns the convention-based log group name
+    mockGetOnlineEvaluationConfig.mockImplementation((opts: { configId: string }) =>
+      Promise.resolve({
+        configId: opts.configId,
+        configName: 'eval-config',
+        status: 'ACTIVE',
+        executionStatus: 'ENABLED',
+        outputLogGroupName: `/aws/bedrock-agentcore/evaluations/results/${opts.configId}`,
+      })
+    );
+  });
+
   afterEach(() => vi.clearAllMocks());
 
   it('returns error when agent resolution fails', async () => {
@@ -206,4 +224,80 @@ describe('handleLogsEval', () => {
     expect(result.success).toBe(false);
     expect(result.error).toContain('No deployed online eval configs found');
   });
+
+  it('uses log group name from API when available', async () => {
+    const ctx = makeContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    mockGetOnlineEvaluationConfig.mockResolvedValue({
+      configId: 'cfg-123',
+      configName: 'eval-config',
+      status: 'ACTIVE',
+      executionStatus: 'ENABLED',
+      outputLogGroupName: '/custom/log/group/from-api',
+    });
+
+    async function* emptyGenerator() {
+      // no events
+    }
+    mockSearchLogs.mockReturnValue(emptyGenerator());
+
+    await handleLogsEval({ since: '1h' });
+
+    expect(mockSearchLogs).toHaveBeenCalledWith(
+      expect.objectContaining({
+        logGroupName: '/custom/log/group/from-api',
+      })
+    );
+  });
+
+  it('falls back to convention-based log group when API call fails', async () => {
+    const ctx = makeContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    mockGetOnlineEvaluationConfig.mockRejectedValue(new Error('AccessDenied'));
+
+    async function* emptyGenerator() {
+      // no events
+    }
+    mockSearchLogs.mockReturnValue(emptyGenerator());
+
+    await handleLogsEval({ since: '1h' });
+
+    expect(mockSearchLogs).toHaveBeenCalledWith(
+      expect.objectContaining({
+        logGroupName: '/aws/bedrock-agentcore/evaluations/results/cfg-123',
+      })
+    );
+  });
+
+  it('surfaces failure reason from config in failed state', async () => {
+    const ctx = makeContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue(makeResolvedAgent());
+
+    mockGetOnlineEvaluationConfig.mockResolvedValue({
+      configId: 'cfg-123',
+      configName: 'eval-config',
+      status: 'CREATE_FAILED',
+      executionStatus: 'DISABLED',
+      failureReason: 'IAM role does not exist',
+      outputLogGroupName: '/aws/bedrock-agentcore/evaluations/results/cfg-123',
+    });
+
+    async function* emptyGenerator() {
+      // no events
+    }
+    mockSearchLogs.mockReturnValue(emptyGenerator());
+
+    // eslint-disable-next-line @typescript-eslint/no-empty-function
+    const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
+
+    await handleLogsEval({ since: '1h' });
+
+    expect(consoleSpy).toHaveBeenCalledWith(expect.stringContaining('IAM role does not exist'));
+    consoleSpy.mockRestore();
+  });
 });
diff --git a/src/cli/operations/eval/__tests__/run-eval.test.ts b/src/cli/operations/eval/__tests__/run-eval.test.ts
index 4c5e2c83..2495bcd6 100644
--- a/src/cli/operations/eval/__tests__/run-eval.test.ts
+++ b/src/cli/operations/eval/__tests__/run-eval.test.ts
@@ -6,6 +6,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 const mockResolveAgent = vi.fn();
 const mockLoadDeployedProjectConfig = vi.fn();
 const mockEvaluate = vi.fn();
+const mockGetEvaluator = vi.fn();
 const mockSaveEvalRun = vi.fn();
 const mockGenerateRunId = vi.fn();
 const mockSend = vi.fn();
@@ -21,6 +22,10 @@ vi.mock('../../../aws/agentcore', () => ({
   evaluate: (...args: unknown[]) => mockEvaluate(...args),
 }));
 
+vi.mock('../../../aws/agentcore-control', () => ({
+  getEvaluator: (...args: unknown[]) => mockGetEvaluator(...args),
+}));
+
 vi.mock('../../../aws', () => ({
   getCredentialProvider: () => mockGetCredentialProvider(),
 }));
@@ -95,6 +100,21 @@ function makeOtelSpanRow(sessionId: string, traceId: string, spanBody: Record<st
   ];
 }
 
+/**
+ * Build a CloudWatch result row whose `@message` is a serialized CLIENT span carrying `gen_ai.tool.name`.
+ */
+function makeToolCallSpanRow(sessionId: string, traceId: string, spanId: string, toolName: string) {
+  const span = {
+    scope: { name: 'strands.telemetry.tracer' },
+    traceId,
+    spanId,
+    kind: 'CLIENT',
+    attributes: { 'gen_ai.tool.name': toolName },
+  };
+  const toField = (field: string, value: string) => ({ field, value });
+  return [toField('@message', JSON.stringify(span)), toField('sessionId', sessionId), toField('traceId', traceId)];
+}
+
 function setupCloudWatchToReturn(spanRows: unknown[][], runtimeLogRows: unknown[][] = []) {
   let queryCount = 0;
   mockSend.mockImplementation((cmd: { input: unknown }) => {
@@ -575,6 +595,280 @@ describe('handleRunEval', () => {
     expect(result.error).toContain('No evaluators specified');
   });
 
+  // ─── Evaluator-level grouping ────────────────────────────────────────────
+
+  it('sends targetTraceIds for TRACE-level builtin evaluators', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const spanRows = [makeOtelSpanRow('session-1', 'trace-1'), makeOtelSpanRow('session-1', 'trace-2')];
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 'session-1', traceId: 'trace-1' } } }],
+    });
+
+    // Builtin.Helpfulness is TRACE-level
+    const result = await handleRunEval({ evaluator: ['Builtin.Helpfulness'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(mockEvaluate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        targetTraceIds: expect.arrayContaining(['trace-1', 'trace-2']),
+      })
+    );
+  });
+
+  it('does not send targetTraceIds for SESSION-level evaluators', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('session-1', 'trace-1')]);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 'session-1' } } }],
+    });
+
+    // Builtin.GoalSuccessRate is SESSION-level
+    const result = await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(mockEvaluate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        targetTraceIds: undefined,
+        targetSpanIds: undefined,
+      })
+    );
+  });
+
+  it('sends targetSpanIds for TOOL_CALL-level evaluators', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    const spanRows = [makeToolCallSpanRow('session-1', 'trace-1', 'span-tool-1', 'calculator')];
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 5.0, context: { spanContext: { sessionId: 'session-1', spanId: 'span-tool-1' } } }],
+    });
+
+    // Builtin.ToolSelectionAccuracy is TOOL_CALL-level
+    const result = await handleRunEval({ evaluator: ['Builtin.ToolSelectionAccuracy'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(mockEvaluate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        targetSpanIds: ['span-tool-1'],
+      })
+    );
+  });
+
+  it('fetches level from API for custom evaluators', async () => {
+    const ctx = makeDeployedContext({
+      evaluators: { MyTraceEval: { evaluatorId: 'eval-trace-custom' } },
+    });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    // Mock getEvaluator to return TRACE level for the custom evaluator
+    mockGetEvaluator.mockResolvedValue({
+      evaluatorId: 'eval-trace-custom',
+      evaluatorName: 'MyTraceEval',
+      level: 'TRACE',
+      status: 'ACTIVE',
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('session-1', 'trace-1')]);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 'session-1', traceId: 'trace-1' } } }],
+    });
+
+    const result = await handleRunEval({ evaluator: ['MyTraceEval'], days: 7 });
+
+    expect(result.success).toBe(true);
+    expect(mockGetEvaluator).toHaveBeenCalledWith(expect.objectContaining({ evaluatorId: 'eval-trace-custom' }));
+    expect(mockEvaluate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        targetTraceIds: ['trace-1'],
+      })
+    );
+  });
+
+  it('defaults to SESSION level when getEvaluator fails for custom evaluator', async () => {
+    const ctx = makeDeployedContext({
+      evaluators: { FailingEval: { evaluatorId: 'eval-failing' } },
+    });
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    mockGetEvaluator.mockRejectedValue(new Error('Not found'));
+
+    setupCloudWatchToReturn([makeOtelSpanRow('session-1', 'trace-1')]);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 'session-1' } } }],
+    });
+
+    const result = await handleRunEval({ evaluator: ['FailingEval'], days: 7 });
+
+    expect(result.success).toBe(true);
+    // Should default to SESSION (no target IDs)
+    expect(mockEvaluate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        targetTraceIds: undefined,
+        targetSpanIds: undefined,
+      })
+    );
+  });
+
+  // ─── Session/trace filtering ─────────────────────────────────────────────
+
+  /** Return the queryString of the first recorded mockSend call that carries one; throws if none was sent. */
+  function getFirstQueryString(): string {
+    const inputs = mockSend.mock.calls.map((c: unknown[]) => (c[0] as { input?: { queryString?: string } }).input);
+    const query = inputs.find(input => input?.queryString !== undefined)?.queryString;
+    if (query === undefined) throw new Error('Expected a CloudWatch query to be started, but none was recorded');
+    return query;
+  }
+
+  it('filters CloudWatch query by --session-id', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('session-abc', 'trace-1')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 4.0, context: { spanContext: { sessionId: 'session-abc' } } }],
+    });
+
+    const result = await handleRunEval({
+      evaluator: ['Builtin.GoalSuccessRate'],
+      days: 7,
+      sessionId: 'session-abc',
+    });
+
+    expect(result.success).toBe(true);
+    const query = getFirstQueryString();
+    expect(query).toContain("filter attributes.session.id = 'session-abc'");
+  });
+
+  it('filters CloudWatch query by --trace-id', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([makeOtelSpanRow('session-1', 'trace-xyz')]);
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 3.0, context: { spanContext: { sessionId: 'session-1', traceId: 'trace-xyz' } } }],
+    });
+
+    const result = await handleRunEval({
+      evaluator: ['Builtin.GoalSuccessRate'],
+      days: 7,
+      traceId: 'trace-xyz',
+    });
+
+    expect(result.success).toBe(true);
+    const query = getFirstQueryString();
+    expect(query).toContain("filter traceId = 'trace-xyz'");
+  });
+
+  it('sanitizes --session-id and --trace-id values', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    setupCloudWatchToReturn([]);
+
+    await handleRunEval({
+      evaluator: ['Builtin.GoalSuccessRate'],
+      days: 7,
+      sessionId: "sess'; DROP TABLE--",
+      traceId: "trace'; DROP TABLE--",
+    });
+
+    const query = getFirstQueryString();
+    expect(query).toContain("filter attributes.session.id = 'sess; DROP TABLE--'");
+    expect(query).toContain("filter traceId = 'trace; DROP TABLE--'");
+    expect(query).not.toContain("sess'");
+  });
+
   // ─── Query sanitization ───────────────────────────────────────────────────
 
   it('sanitizes runtimeId in CloudWatch query to prevent injection', async () => {
@@ -595,13 +889,7 @@ describe('handleRunEval', () => {
 
     await handleRunEval({ evaluator: ['Builtin.GoalSuccessRate'], days: 7 });
 
-    // Verify the StartQueryCommand was called with sanitized runtimeId (no single quotes)
-    const startQueryCall = mockSend.mock.calls.find((call: unknown[]) => {
-      const input = (call[0] as { input?: { queryString?: string } }).input;
-      return input?.queryString !== undefined;
-    });
-    expect(startQueryCall).toBeDefined();
-    const queryString = (startQueryCall![0] as { input: { queryString: string } }).input.queryString;
+    const queryString = getFirstQueryString();
     expect(queryString).not.toContain("'rt-123'; DROP TABLE'");
     expect(queryString).toContain('rt-123; DROP TABLE');
   });
diff --git a/src/cli/operations/eval/logs-eval.ts b/src/cli/operations/eval/logs-eval.ts
index 0e60fa0e..a4187674 100644
--- a/src/cli/operations/eval/logs-eval.ts
+++ b/src/cli/operations/eval/logs-eval.ts
@@ -1,4 +1,5 @@
 import { parseTimeString } from '../../../lib/utils';
+import { getOnlineEvaluationConfig } from '../../aws/agentcore-control';
 import { searchLogs, streamLogs } from '../../aws/cloudwatch';
 import type { DeployedProjectConfig } from '../resolve-agent';
 import { loadDeployedProjectConfig, resolveAgent } from '../resolve-agent';
@@ -25,26 +26,50 @@ function formatLogLine(event: { timestamp: number; message: string }, json: bool
   return `${ts}  ${event.message}`;
 }
 
+interface ResolvedLogGroup {
+  logGroupName: string;
+  configName: string;
+  failureReason?: string;
+}
+
 /**
  * Resolve the online eval config log group names for a given agent.
- * Online eval results are written to: /aws/bedrock-agentcore/evaluations/results/{onlineEvalConfigId}
+ * Prefers the API-reported outputLogGroupName; uses the conventional name when it is absent or the API call fails.
  */
-function resolveEvalLogGroups(context: DeployedProjectConfig, agentName: string, targetName: string): string[] {
+async function resolveEvalLogGroups(
+  context: DeployedProjectConfig,
+  agentName: string,
+  targetName: string,
+  region: string
+): Promise<ResolvedLogGroup[]> {
   const { project, deployedState } = context;
   const targetResources = deployedState.targets[targetName]?.resources;
 
   // Find online eval configs that monitor this agent
   const matchingConfigs = (project.onlineEvalConfigs ?? []).filter(c => c.agents.includes(agentName));
 
-  const logGroups: string[] = [];
+  const results: ResolvedLogGroup[] = [];
   for (const config of matchingConfigs) {
     const deployed = targetResources?.onlineEvalConfigs?.[config.name];
-    if (deployed?.onlineEvaluationConfigId) {
-      logGroups.push(`/aws/bedrock-agentcore/evaluations/results/${deployed.onlineEvaluationConfigId}`);
+    if (!deployed?.onlineEvaluationConfigId) continue;
+
+    const configId = deployed.onlineEvaluationConfigId;
+    const fallbackLogGroup = `/aws/bedrock-agentcore/evaluations/results/${configId}`;
+
+    try {
+      const apiConfig = await getOnlineEvaluationConfig({ region, configId });
+      results.push({
+        logGroupName: apiConfig.outputLogGroupName ?? fallbackLogGroup,
+        configName: config.name,
+        failureReason: apiConfig.failureReason,
+      });
+    } catch {
+      // API call failed — fall back to the convention-based name (no failureReason is available in this case)
+      results.push({ logGroupName: fallbackLogGroup, configName: config.name });
     }
   }
 
-  return logGroups;
+  return results;
 }
 
 export async function handleLogsEval(options: LogsEvalOptions): Promise<LogsEvalResult> {
@@ -57,15 +82,22 @@ export async function handleLogsEval(options: LogsEvalOptions): Promise<LogsEval
 
   const { agent } = agentResult;
 
-  const logGroups = resolveEvalLogGroups(context, agent.agentName, agent.targetName);
+  const resolvedLogGroups = await resolveEvalLogGroups(context, agent.agentName, agent.targetName, agent.region);
 
-  if (logGroups.length === 0) {
+  if (resolvedLogGroups.length === 0) {
     return {
       success: false,
       error: `No deployed online eval configs found for agent '${agent.agentName}'. Add one with 'agentcore add online-eval' and deploy.`,
     };
   }
 
+  // Surface failure reasons from configs that are in a failed state
+  for (const lg of resolvedLogGroups) {
+    if (lg.failureReason) {
+      console.error(`Warning: Online eval config '${lg.configName}' has a failure: ${lg.failureReason}`);
+    }
+  }
+
   const isJson = options.json ?? false;
   const isFollow = options.follow ?? (!options.since && !options.until);
 
@@ -75,7 +107,7 @@ export async function handleLogsEval(options: LogsEvalOptions): Promise<LogsEval
 
   try {
     // Query all matching log groups
-    for (const logGroupName of logGroups) {
+    for (const { logGroupName } of resolvedLogGroups) {
       if (!isFollow) {
         const startTimeMs = options.since ? parseTimeString(options.since) : Date.now() - 3_600_000;
         const endTimeMs = options.until ? parseTimeString(options.until) : Date.now();
diff --git a/src/cli/operations/eval/run-eval.ts b/src/cli/operations/eval/run-eval.ts
index c9775f1e..1391e859 100644
--- a/src/cli/operations/eval/run-eval.ts
+++ b/src/cli/operations/eval/run-eval.ts
@@ -1,5 +1,6 @@
 import { getCredentialProvider } from '../../aws';
 import { evaluate } from '../../aws/agentcore';
+import { getEvaluator } from '../../aws/agentcore-control';
 import { DEFAULT_ENDPOINT_NAME } from '../../constants';
 import type { DeployedProjectConfig } from '../resolve-agent';
 import { loadDeployedProjectConfig, resolveAgent } from '../resolve-agent';
@@ -166,6 +167,90 @@ function resolveFromProject(context: DeployedProjectConfig, options: RunEvalOpti
   };
 }
 
+type EvaluatorLevel = 'SESSION' | 'TRACE' | 'TOOL_CALL';
+
+const BUILTIN_EVALUATOR_LEVELS: Record<string, EvaluatorLevel> = {
+  'Builtin.GoalSuccessRate': 'SESSION',
+  'Builtin.Correctness': 'TRACE',
+  'Builtin.Faithfulness': 'TRACE',
+  'Builtin.Helpfulness': 'TRACE',
+  'Builtin.ResponseRelevance': 'TRACE',
+  'Builtin.Conciseness': 'TRACE',
+  'Builtin.Coherence': 'TRACE',
+  'Builtin.InstructionFollowing': 'TRACE',
+  'Builtin.Refusal': 'TRACE',
+  'Builtin.ToolSelectionAccuracy': 'TOOL_CALL',
+};
+
+/**
+ * Resolve the evaluation level for each evaluator.
+ *
+ * Known builtins use BUILTIN_EVALUATOR_LEVELS and unrecognized `Builtin.*` IDs default to SESSION.
+ * Any other ID is treated as a custom evaluator and looked up with getEvaluator; the lookups run
+ * concurrently, and a failed lookup or an unrecognized level falls back to SESSION.
+ *
+ * @param evaluatorIds - Evaluator IDs to resolve; repeated IDs are resolved once
+ * @param region - AWS region passed to getEvaluator
+ * @returns Map from evaluator ID to its resolved level
+ */
+async function resolveEvaluatorLevels(evaluatorIds: string[], region: string): Promise<Map<string, EvaluatorLevel>> {
+  const resolveLevel = async (id: string): Promise<EvaluatorLevel> => {
+    // Own-property check: IDs such as "constructor" must not match Object.prototype members
+    const isKnownBuiltin = Object.prototype.hasOwnProperty.call(BUILTIN_EVALUATOR_LEVELS, id);
+    if (isKnownBuiltin) return BUILTIN_EVALUATOR_LEVELS[id] ?? 'SESSION';
+    if (id.startsWith('Builtin.')) return 'SESSION';
+
+    try {
+      const { level } = await getEvaluator({ region, evaluatorId: id });
+      return level === 'TRACE' || level === 'TOOL_CALL' ? level : 'SESSION';
+    } catch {
+      // Best effort: when the level cannot be determined, default to SESSION (most permissive)
+      return 'SESSION';
+    }
+  };
+
+  // De-duplicate so each custom evaluator is fetched at most once
+  const uniqueIds = [...new Set(evaluatorIds)];
+  const entries = await Promise.all(uniqueIds.map(async id => [id, await resolveLevel(id)] as const));
+  return new Map(entries);
+}
+
+/**
+ * Collect the distinct trace IDs present on a session's spans.
+ * Spans without a truthy `traceId` are skipped; IDs keep first-seen order.
+ */
+function extractTraceIds(spans: DocumentType[]): string[] {
+  const uniqueTraceIds = new Set<string>();
+  spans.forEach(span => {
+    const { traceId } = span as Record<string, unknown>;
+    if (traceId) uniqueTraceIds.add(traceId as string);
+  });
+
+  return Array.from(uniqueTraceIds);
+}
+
+/**
+ * Collect the IDs of spans that represent tool calls within a session.
+ *
+ * A span is included when it has a truthy `spanId` and either its `kind` is 'CLIENT' or its
+ * attributes carry a truthy `gen_ai.tool.name` (falling back to `tool.name` when that is nullish).
+ * IDs are returned in span order and are not de-duplicated.
+ *
+ * @param spans - Spans belonging to a single session
+ * @returns Span IDs of the matching spans
+ */
+function extractToolCallSpanIds(spans: DocumentType[]): string[] {
+  return spans.flatMap(span => {
+    const { spanId, kind, attributes } = span as Record<string, unknown>;
+    if (!spanId) return [];
+
+    const attrs = attributes as Record<string, unknown> | undefined;
+    const toolName = attrs?.['gen_ai.tool.name'] ?? attrs?.['tool.name'];
+    const isToolCall = kind === 'CLIENT' || Boolean(toolName);
+    return isToolCall ? [spanId as string] : [];
+  });
+}
+
 /**
  * Execute a CloudWatch Logs Insights query and wait for results.
  */
@@ -255,18 +340,23 @@ interface SessionSpans {
   spans: DocumentType[];
 }
 
+interface FetchSpansOptions {
+  runtimeId: string;
+  runtimeLogGroup: string;
+  region: string;
+  lookbackDays: number;
+  sessionId?: string;
+  traceId?: string;
+}
+
 /**
  * Fetch OTel spans from the `aws/spans` log group and runtime logs from the agent's
  * log group, then group them by session.
  *
  * The Evaluate API requires spans from a single session per call.
  */
-async function fetchSessionSpans(
-  runtimeId: string,
-  runtimeLogGroup: string,
-  region: string,
-  lookbackDays: number
-): Promise<SessionSpans[]> {
+async function fetchSessionSpans(opts: FetchSpansOptions): Promise<SessionSpans[]> {
+  const { runtimeId, runtimeLogGroup, region, lookbackDays } = opts;
   const endTimeMs = Date.now();
   const startTimeMs = endTimeMs - lookbackDays * 24 * 60 * 60 * 1000;
   const startTimeSec = Math.floor(startTimeMs / 1000);
@@ -278,17 +368,20 @@ async function fetchSessionSpans(
   });
 
   // 1. Query proper OTel spans from the aws/spans log group
-  const spanRows = await executeQuery(
-    client,
-    SPANS_LOG_GROUP,
-    `fields @message, attributes.session.id as sessionId, traceId
+  let spanQuery = `fields @message, attributes.session.id as sessionId, traceId
      | parse resource.attributes.cloud.resource_id "runtime/*/" as parsedAgentId
-     | filter parsedAgentId = '${sanitizeQueryValue(runtimeId)}'
-     | sort startTimeUnixNano asc
-     | limit 10000`,
-    startTimeSec,
-    endTimeSec
-  );
+     | filter parsedAgentId = '${sanitizeQueryValue(runtimeId)}'`;
+
+  if (opts.sessionId) {
+    spanQuery += `\n     | filter attributes.session.id = '${sanitizeQueryValue(opts.sessionId)}'`;
+  }
+  if (opts.traceId) {
+    spanQuery += `\n     | filter traceId = '${sanitizeQueryValue(opts.traceId)}'`;
+  }
+
+  spanQuery += `\n     | sort startTimeUnixNano asc\n     | limit 10000`;
+
+  const spanRows = await executeQuery(client, SPANS_LOG_GROUP, spanQuery, startTimeSec, endTimeSec);
 
   // Group spans by session and collect trace IDs
   const sessionMap = new Map<string, DocumentType[]>();
@@ -405,7 +498,14 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
   const { ctx } = resolution;
 
   // Fetch spans grouped by session
-  const sessions = await fetchSessionSpans(ctx.runtimeId, ctx.runtimeLogGroup, ctx.region, options.days);
+  const sessions = await fetchSessionSpans({
+    runtimeId: ctx.runtimeId,
+    runtimeLogGroup: ctx.runtimeLogGroup,
+    region: ctx.region,
+    lookbackDays: options.days,
+    sessionId: options.sessionId,
+    traceId: options.traceId,
+  });
 
   if (sessions.length === 0) {
     return {
@@ -414,12 +514,16 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
     };
   }
 
-  // Run each evaluator against each session
+  // Resolve evaluator levels to determine how to send spans
+  const evaluatorLevels = await resolveEvaluatorLevels(ctx.evaluatorIds, ctx.region);
+
+  // Run each evaluator against each session with level-appropriate targeting
   const results: EvalEvaluatorResult[] = [];
 
   for (let i = 0; i < ctx.evaluatorIds.length; i++) {
     const evaluatorId = ctx.evaluatorIds[i]!;
     const evaluatorName = ctx.evaluatorLabels[i] ?? evaluatorId;
+    const level = evaluatorLevels.get(evaluatorId) ?? 'SESSION';
 
     const sessionScores: EvalSessionScore[] = [];
     let totalInputTokens = 0;
@@ -427,10 +531,24 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
     let totalTokens = 0;
 
     for (const session of sessions) {
+      // Build evaluation target based on evaluator level
+      let targetTraceIds: string[] | undefined;
+      let targetSpanIds: string[] | undefined;
+
+      if (level === 'TRACE') {
+        targetTraceIds = extractTraceIds(session.spans);
+        if (targetTraceIds.length === 0) continue;
+      } else if (level === 'TOOL_CALL') {
+        targetSpanIds = extractToolCallSpanIds(session.spans);
+        if (targetSpanIds.length === 0) continue;
+      }
+
       const response = await evaluate({
         region: ctx.region,
         evaluatorId,
         sessionSpans: session.spans,
+        targetTraceIds,
+        targetSpanIds,
       });
 
       for (const r of response.evaluationResults) {
diff --git a/src/cli/operations/eval/types.ts b/src/cli/operations/eval/types.ts
index e78c3b92..1f4c3438 100644
--- a/src/cli/operations/eval/types.ts
+++ b/src/cli/operations/eval/types.ts
@@ -44,6 +44,10 @@ export interface RunEvalOptions {
   agentArn?: string;
   /** AWS region (required with --agent-arn, inferred otherwise) */
   region?: string;
+  /** Filter to a specific session */
+  sessionId?: string;
+  /** Filter to a specific trace */
+  traceId?: string;
   days: number;
   output?: string;
   json?: boolean;
diff --git a/src/cli/tui/screens/online-eval/types.ts b/src/cli/tui/screens/online-eval/types.ts
index 12f4c151..62432cac 100644
--- a/src/cli/tui/screens/online-eval/types.ts
+++ b/src/cli/tui/screens/online-eval/types.ts
@@ -9,6 +9,8 @@ export interface AddOnlineEvalConfig {
   agents: string[];
   evaluators: string[];
   samplingRate: number;
+  description?: string;
+  enableOnCreate?: boolean;
 }
 
 export const ONLINE_EVAL_STEP_LABELS: Record<AddOnlineEvalStep, string> = {
diff --git a/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts b/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
index 9381539e..c141b463 100644
--- a/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
+++ b/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
@@ -87,4 +87,23 @@ describe('OnlineEvalConfigSchema', () => {
     const config = { ...validConfig, evaluators: [''] };
     expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
   });
+
+  it('accepts optional description field', () => {
+    const config = { ...validConfig, description: 'My eval config description' };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
+  });
+
+  it('rejects description longer than 200 characters', () => {
+    const config = { ...validConfig, description: 'x'.repeat(201) };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  });
+
+  it('accepts optional enableOnCreate field', () => {
+    const config = { ...validConfig, enableOnCreate: false };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
+  });
+
+  it('accepts config without description and enableOnCreate', () => {
+    expect(OnlineEvalConfigSchema.safeParse(validConfig).success).toBe(true);
+  });
 });
diff --git a/src/schema/schemas/primitives/online-eval-config.ts b/src/schema/schemas/primitives/online-eval-config.ts
index b90f6295..784c3c25 100644
--- a/src/schema/schemas/primitives/online-eval-config.ts
+++ b/src/schema/schemas/primitives/online-eval-config.ts
@@ -22,6 +22,10 @@ export const OnlineEvalConfigSchema = z.object({
   evaluators: z.array(z.string().min(1)).min(1, 'At least one evaluator is required'),
   /** Sampling rate as a percentage (0.01 to 100) */
   samplingRate: z.number().min(0.01).max(100),
+  /** Optional description for the online eval config */
+  description: z.string().max(200).optional(),
+  /** Whether to enable execution on create (default: true) */
+  enableOnCreate: z.boolean().optional(),
 });
 
 export type OnlineEvalConfig = z.infer<typeof OnlineEvalConfigSchema>;

From cc37874da16eeb7944bafbee0193556fe24d441e Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Thu, 12 Mar 2026 02:23:22 -0400
Subject: [PATCH 7/9] fix: add API limit of 10 on spanIds and TUI changes

---
 .../eval/__tests__/run-eval.test.ts           | 41 +++++++++
 src/cli/operations/eval/run-eval.ts           | 84 +++++++++++++------
 src/cli/tui/components/SelectList.tsx         |  2 +-
 .../screens/evaluator/AddEvaluatorScreen.tsx  |  7 +-
 src/schema/schemas/agentcore-project.ts       |  2 +-
 src/schema/schemas/primitives/evaluator.ts    | 10 ++-
 src/schema/schemas/primitives/index.ts        |  1 +
 7 files changed, 116 insertions(+), 31 deletions(-)

diff --git a/src/cli/operations/eval/__tests__/run-eval.test.ts b/src/cli/operations/eval/__tests__/run-eval.test.ts
index 2495bcd6..8a99d9f1 100644
--- a/src/cli/operations/eval/__tests__/run-eval.test.ts
+++ b/src/cli/operations/eval/__tests__/run-eval.test.ts
@@ -693,6 +693,47 @@ describe('handleRunEval', () => {
     );
   });
 
+  it('batches targetSpanIds into chunks of 10 for TOOL_CALL evaluators', async () => {
+    const ctx = makeDeployedContext();
+    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
+    mockResolveAgent.mockReturnValue({
+      success: true,
+      agent: {
+        agentName: 'my-agent',
+        targetName: 'dev',
+        region: 'us-east-1',
+        accountId: '111222333444',
+        runtimeId: 'rt-123',
+      },
+    });
+
+    // Create 12 tool call spans in one session
+    const spanRows = Array.from({ length: 12 }, (_, i) =>
+      makeToolCallSpanRow('session-1', 'trace-1', `span-tool-${i}`, `tool-${i}`)
+    );
+    setupCloudWatchToReturn(spanRows);
+
+    mockEvaluate.mockResolvedValue({
+      evaluationResults: [{ value: 5.0, context: { spanContext: { sessionId: 'session-1' } } }],
+    });
+
+    const result = await handleRunEval({ evaluator: ['Builtin.ToolSelectionAccuracy'], days: 7 });
+
+    expect(result.success).toBe(true);
+    // Should be called twice: first batch of 10, second batch of 2
+    expect(mockEvaluate).toHaveBeenCalledTimes(2);
+    expect(mockEvaluate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        targetSpanIds: expect.arrayContaining(['span-tool-0']) as string[],
+      })
+    );
+
+    const firstCallSpanIds = (mockEvaluate.mock.calls[0] as [{ targetSpanIds: string[] }])[0].targetSpanIds;
+    const secondCallSpanIds = (mockEvaluate.mock.calls[1] as [{ targetSpanIds: string[] }])[0].targetSpanIds;
+    expect(firstCallSpanIds).toHaveLength(10);
+    expect(secondCallSpanIds).toHaveLength(2);
+  });
+
   it('fetches level from API for custom evaluators', async () => {
     const ctx = makeDeployedContext({
       evaluators: { MyTraceEval: { evaluatorId: 'eval-trace-custom' } },
diff --git a/src/cli/operations/eval/run-eval.ts b/src/cli/operations/eval/run-eval.ts
index 1391e859..fc36efb7 100644
--- a/src/cli/operations/eval/run-eval.ts
+++ b/src/cli/operations/eval/run-eval.ts
@@ -239,18 +239,46 @@ function extractToolCallSpanIds(spans: DocumentType[]): string[] {
     const spanId = doc.spanId as string | undefined;
     if (!spanId) continue;
 
-    // Tool call spans have kind=CLIENT or attributes indicating tool usage
-    const kind = doc.kind as string | undefined;
+    // Tool call spans must have a tool name attribute — kind=CLIENT alone is too broad
     const attrs = doc.attributes as Record<string, unknown> | undefined;
-    const hasToolAttr = attrs?.['gen_ai.tool.name'] ?? attrs?.['tool.name'];
-
-    if (kind === 'CLIENT' || hasToolAttr) {
+    if (attrs?.['gen_ai.tool.name'] ?? attrs?.['tool.name']) {
       spanIds.push(spanId);
     }
   }
   return spanIds;
 }
 
+const EVALUATE_TARGET_BATCH_SIZE = 10;
+
+interface TargetIdBatch {
+  traceIds?: string[];
+  spanIds?: string[];
+}
+
+/**
+ * Split target IDs into batches of at most EVALUATE_TARGET_BATCH_SIZE (the Evaluate API's per-call cap).
+ * `spanIds` takes precedence: when it is provided, `traceIds` is ignored. An empty array yields no batches.
+ * When both are undefined (SESSION level), returns one empty batch so the caller makes a single untargeted call.
+ */
+function batchTargetIds(traceIds?: string[], spanIds?: string[]): TargetIdBatch[] {
+  if (spanIds) {
+    return chunk(spanIds, EVALUATE_TARGET_BATCH_SIZE).map(batch => ({ spanIds: batch }));
+  }
+  if (traceIds) {
+    return chunk(traceIds, EVALUATE_TARGET_BATCH_SIZE).map(batch => ({ traceIds: batch }));
+  }
+  // SESSION level — single call with no target IDs
+  return [{}];
+}
+
+/** Slice `arr` into consecutive runs of at most `size` items, in order. Throws RangeError when `size` is below 1. */
+function chunk<T>(arr: T[], size: number): T[][] {
+  if (!(size >= 1)) throw new RangeError(`chunk size must be >= 1, got ${size}`);
+  const batches: T[][] = [];
+  for (let i = 0; i < arr.length; i += size) batches.push(arr.slice(i, i + size));
+  return batches;
+}
+
 /**
  * Execute a CloudWatch Logs Insights query and wait for results.
  */
@@ -543,28 +571,34 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
         if (targetSpanIds.length === 0) continue;
       }
 
-      const response = await evaluate({
-        region: ctx.region,
-        evaluatorId,
-        sessionSpans: session.spans,
-        targetTraceIds,
-        targetSpanIds,
-      });
-
-      for (const r of response.evaluationResults) {
-        sessionScores.push({
-          sessionId: r.context?.sessionId ?? session.sessionId,
-          traceId: r.context?.traceId,
-          spanId: r.context?.spanId,
-          value: r.value ?? 0,
-          label: r.label,
-          explanation: r.explanation,
-          errorMessage: r.errorMessage,
+      // The Evaluate API limits targetSpanIds and targetTraceIds to 10 per call.
+      // Batch into chunks and merge results.
+      const batches = batchTargetIds(targetTraceIds, targetSpanIds);
+
+      for (const batch of batches) {
+        const response = await evaluate({
+          region: ctx.region,
+          evaluatorId,
+          sessionSpans: session.spans,
+          targetTraceIds: batch.traceIds,
+          targetSpanIds: batch.spanIds,
         });
 
-        totalInputTokens += r.tokenUsage?.inputTokens ?? 0;
-        totalOutputTokens += r.tokenUsage?.outputTokens ?? 0;
-        totalTokens += r.tokenUsage?.totalTokens ?? 0;
+        for (const r of response.evaluationResults) {
+          sessionScores.push({
+            sessionId: r.context?.sessionId ?? session.sessionId,
+            traceId: r.context?.traceId,
+            spanId: r.context?.spanId,
+            value: r.value ?? 0,
+            label: r.label,
+            explanation: r.explanation,
+            errorMessage: r.errorMessage,
+          });
+
+          totalInputTokens += r.tokenUsage?.inputTokens ?? 0;
+          totalOutputTokens += r.tokenUsage?.outputTokens ?? 0;
+          totalTokens += r.tokenUsage?.totalTokens ?? 0;
+        }
       }
     }
 
diff --git a/src/cli/tui/components/SelectList.tsx b/src/cli/tui/components/SelectList.tsx
index e69c19b0..45a41952 100644
--- a/src/cli/tui/components/SelectList.tsx
+++ b/src/cli/tui/components/SelectList.tsx
@@ -29,7 +29,7 @@ export function SelectList<T extends SelectableItem>(props: {
         const disabled = item.disabled ?? false;
         return (
           <Box key={item.id}>
-            <Text wrap="truncate">
+            <Text wrap="wrap">
               <Text color={selected && !disabled ? 'cyan' : undefined} dimColor={disabled}>
                 {selected ? '❯' : ' '}{' '}
               </Text>
diff --git a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
index 66731159..21ecb6fb 100644
--- a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
+++ b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
@@ -1,5 +1,5 @@
 import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
-import { EvaluatorNameSchema } from '../../../../schema';
+import { BedrockModelIdSchema, EvaluatorNameSchema } from '../../../../schema';
 import type { SelectableItem } from '../../components';
 import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardSelect } from '../../components';
 import { HELP_TEXT } from '../../constants';
@@ -89,8 +89,8 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
   const headerContent = <StepIndicator steps={wizard.steps} currentStep={wizard.step} labels={EVALUATOR_STEP_LABELS} />;
 
   return (
-    <Screen title="Add Evaluator" onExit={onExit} helpText={helpText} headerContent={headerContent}>
-      <Panel>
+    <Screen title="Add Evaluator" onExit={onExit} helpText={helpText} headerContent={headerContent} exitEnabled={false}>
+      <Panel fullWidth>
         {isNameStep && (
           <TextInput
             key="name"
@@ -119,6 +119,7 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
             initialValue={DEFAULT_MODEL}
             onSubmit={wizard.setModel}
             onCancel={() => wizard.goBack()}
+            schema={BedrockModelIdSchema}
           />
         )}
 
diff --git a/src/schema/schemas/agentcore-project.ts b/src/schema/schemas/agentcore-project.ts
index d367ffb8..ad515acb 100644
--- a/src/schema/schemas/agentcore-project.ts
+++ b/src/schema/schemas/agentcore-project.ts
@@ -21,7 +21,7 @@ export type { MemoryStrategy, MemoryStrategyType } from './primitives/memory';
 export type { OnlineEvalConfig } from './primitives/online-eval-config';
 export { OnlineEvalConfigSchema, OnlineEvalConfigNameSchema } from './primitives/online-eval-config';
 export type { EvaluationLevel, EvaluatorConfig, LlmAsAJudgeConfig, RatingScale } from './primitives/evaluator';
-export { EvaluatorNameSchema } from './primitives/evaluator';
+export { BedrockModelIdSchema, EvaluatorNameSchema } from './primitives/evaluator';
 
 // ============================================================================
 // Project Name Schema
diff --git a/src/schema/schemas/primitives/evaluator.ts b/src/schema/schemas/primitives/evaluator.ts
index 632cfd85..c6e29f7e 100644
--- a/src/schema/schemas/primitives/evaluator.ts
+++ b/src/schema/schemas/primitives/evaluator.ts
@@ -55,8 +55,16 @@ export type RatingScale = z.infer<typeof RatingScaleSchema>;
 // LLM-as-a-Judge Config
 // ============================================================================
 
+export const BedrockModelIdSchema = z
+  .string()
+  .min(1, 'Model ID is required')
+  .regex(
+    /^(arn:aws(-[a-z]+)?:bedrock:[a-z0-9-]+:(\d{12}:(inference-profile|foundation-model)|:foundation-model)\/[a-zA-Z0-9._:-]+|([a-z]{2}(-[a-z]+)?\.)?[a-z0-9]+\.[a-zA-Z0-9._-]+(:[0-9]+)?)$/,
+    'Must be a valid Bedrock model ID (e.g. us.anthropic.claude-sonnet-4-5-20250929-v1:0) or model ARN'
+  );
+
 export const LlmAsAJudgeConfigSchema = z.object({
-  model: z.string().min(1, 'Model ID is required'),
+  model: BedrockModelIdSchema,
   instructions: z.string().min(1, 'Evaluation instructions are required'),
   ratingScale: RatingScaleSchema,
 });
diff --git a/src/schema/schemas/primitives/index.ts b/src/schema/schemas/primitives/index.ts
index 7b29a435..f68fa3e4 100644
--- a/src/schema/schemas/primitives/index.ts
+++ b/src/schema/schemas/primitives/index.ts
@@ -15,6 +15,7 @@ export type {
   CategoricalRating,
 } from './evaluator';
 export {
+  BedrockModelIdSchema,
   EvaluationLevelSchema,
   EvaluatorConfigSchema,
   EvaluatorNameSchema,

From 2acb37811c8660b7049bd20cbb2bf2dbd3291d1f Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Thu, 12 Mar 2026 02:43:36 -0400
Subject: [PATCH 8/9] feat: add evals in resourcegraph

---
 src/cli/tui/components/ResourceGraph.tsx | 55 ++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/cli/tui/components/ResourceGraph.tsx b/src/cli/tui/components/ResourceGraph.tsx
index cbea8692..d3445350 100644
--- a/src/cli/tui/components/ResourceGraph.tsx
+++ b/src/cli/tui/components/ResourceGraph.tsx
@@ -105,6 +105,8 @@ export function ResourceGraph({ project, mcp, agentName, resourceStatuses }: Res
   const agents = agentName ? allAgents.filter(a => a.name === agentName) : allAgents;
   const memories = project.memories ?? [];
   const credentials = project.credentials ?? [];
+  const evaluators = project.evaluators ?? [];
+  const onlineEvalConfigs = project.onlineEvalConfigs ?? [];
   const gateways = mcp?.agentCoreGateways ?? [];
   const mcpRuntimeTools = mcp?.mcpRuntimeTools ?? [];
   const unassignedTargets = mcp?.unassignedTargets ?? [];
@@ -130,6 +132,8 @@ export function ResourceGraph({ project, mcp, agentName, resourceStatuses }: Res
     agents.length > 0 ||
     memories.length > 0 ||
     credentials.length > 0 ||
+    evaluators.length > 0 ||
+    onlineEvalConfigs.length > 0 ||
     gateways.length > 0 ||
     mcpRuntimeTools.length > 0 ||
     unassignedTargets.length > 0 ||
@@ -211,6 +215,55 @@ export function ResourceGraph({ project, mcp, agentName, resourceStatuses }: Res
         </Box>
       )}
 
+      {/* Evaluators */}
+      {evaluators.length > 0 && (
+        <Box flexDirection="column">
+          <SectionHeader>Evaluators</SectionHeader>
+          {evaluators.map(evaluator => {
+            const rsEntry = statusMap.get(`evaluator:${evaluator.name}`);
+            const evalStatus = rsEntry?.error ? 'error' : undefined;
+            const evalStatusColor = rsEntry?.error ? 'red' : undefined;
+            return (
+              <ResourceRow
+                key={evaluator.name}
+                icon={ICONS.evaluator}
+                color="cyan"
+                name={evaluator.name}
+                detail={rsEntry?.detail ?? `${evaluator.level} — LLM-as-a-Judge`}
+                status={evalStatus}
+                statusColor={evalStatusColor}
+                deploymentState={rsEntry?.deploymentState}
+                identifier={rsEntry?.identifier}
+              />
+            );
+          })}
+        </Box>
+      )}
+
+      {/* Online Eval Configs */}
+      {onlineEvalConfigs.length > 0 && (
+        <Box flexDirection="column">
+          <SectionHeader>Online Eval Configs</SectionHeader>
+          {onlineEvalConfigs.map(config => {
+            const rsEntry = statusMap.get(`online-eval:${config.name}`);
+            const defaultDetail = `${config.agents.length} agents, ${config.evaluators.length} evaluators — ${config.samplingRate}% sampling`;
+            return (
+              <ResourceRow
+                key={config.name}
+                icon={ICONS['online-eval']}
+                color="magenta"
+                name={config.name}
+                detail={rsEntry?.detail ?? defaultDetail}
+                status={rsEntry?.error ? 'error' : undefined}
+                statusColor={rsEntry?.error ? 'red' : undefined}
+                deploymentState={rsEntry?.deploymentState}
+                identifier={rsEntry?.identifier}
+              />
+            );
+          })}
+        </Box>
+      )}
+
       {/* Removed locally — still deployed in AWS, will be torn down on next deploy */}
       {pendingRemovals.length > 0 && (
         <Box flexDirection="column">
@@ -303,6 +356,8 @@ export function ResourceGraph({ project, mcp, agentName, resourceStatuses }: Res
           <Text color="green">{ICONS.agent}</Text> agent{'  '}
           <Text color="blue">{ICONS.memory}</Text> memory{'  '}
           <Text color="yellow">{ICONS.credential}</Text> credential{'  '}
+          <Text color="cyan">{ICONS.evaluator}</Text> evaluator{'  '}
+          <Text color="magenta">{ICONS['online-eval']}</Text> online-eval{'  '}
           <Text color="magenta">{ICONS.gateway}</Text> gateway
         </Text>
         {resourceStatuses && resourceStatuses.length > 0 && (

From e76851a0d4694689693274e160c3bb74af144cbd Mon Sep 17 00:00:00 2001
From: notgitika <gitijh@gmail.com>
Date: Tue, 17 Mar 2026 18:34:32 -0400
Subject: [PATCH 9/9] feat: add eval TUI screens, online eval dashboard, and
 run eval wizard

---
 .../aws/__tests__/agentcore-control.test.ts   | 124 ++---
 src/cli/aws/agentcore-control.ts              |  46 +-
 src/cli/cli.ts                                |   3 +-
 src/cli/commands/eval/command.tsx             | 335 +------------
 src/cli/commands/pause/command.tsx            | 119 ++++-
 src/cli/commands/pause/index.ts               |   2 +-
 src/cli/commands/resume/command.tsx           |   2 +-
 src/cli/commands/run/command.tsx              |  14 +-
 .../commands/status/__tests__/action.test.ts  |  10 +-
 src/cli/commands/status/action.ts             |   2 +-
 .../eval/__tests__/get-eval-run.test.ts       |  13 +-
 .../eval/__tests__/list-eval-runs.test.ts     |  30 +-
 .../eval/__tests__/logs-eval.test.ts          |  18 +-
 .../eval/__tests__/run-eval.test.ts           |  17 +-
 .../operations/eval/__tests__/storage.test.ts |  70 ++-
 src/cli/operations/eval/get-eval-run.ts       |   2 +-
 src/cli/operations/eval/index.ts              |  14 +-
 src/cli/operations/eval/logs-eval.ts          |  10 +-
 src/cli/operations/eval/pause-resume.ts       |  72 ++-
 src/cli/operations/eval/run-eval.ts           |  10 +-
 src/cli/operations/eval/storage.ts            |  25 +-
 src/cli/operations/eval/types.ts              |   9 +-
 src/cli/primitives/EvaluatorPrimitive.ts      | 170 ++++---
 .../primitives/OnlineEvalConfigPrimitive.ts   |  29 +-
 .../OnlineEvalConfigPrimitive.test.ts         |  73 ++-
 src/cli/tui/App.tsx                           |  48 +-
 src/cli/tui/components/ResourceGraph.tsx      |   2 +-
 src/cli/tui/copy.ts                           |   3 +-
 src/cli/tui/hooks/useCreateOnlineEval.ts      |   6 +-
 src/cli/tui/screens/eval/EvalHubScreen.tsx    |  44 ++
 src/cli/tui/screens/eval/EvalScreen.tsx       | 455 ++++++++++++++++--
 src/cli/tui/screens/eval/index.ts             |   1 +
 .../screens/evaluator/AddEvaluatorScreen.tsx  | 104 +++-
 .../screens/evaluator/__tests__/types.test.ts | 100 +++-
 src/cli/tui/screens/evaluator/types.ts        | 128 ++++-
 .../evaluator/useAddEvaluatorWizard.ts        |  72 ++-
 .../screens/online-eval/AddOnlineEvalFlow.tsx |  77 ++-
 .../online-eval/AddOnlineEvalScreen.tsx       | 114 +++--
 .../online-eval/OnlineEvalDashboard.tsx       | 259 ++++++++++
 src/cli/tui/screens/online-eval/index.ts      |   1 +
 src/cli/tui/screens/online-eval/types.ts      |  34 +-
 .../online-eval/useAddOnlineEvalWizard.ts     |  66 ++-
 src/cli/tui/screens/run-eval/RunEvalFlow.tsx  | 294 +++++++++++
 .../tui/screens/run-eval/RunEvalScreen.tsx    | 142 ++++++
 src/cli/tui/screens/run-eval/RunScreen.tsx    |  32 ++
 src/cli/tui/screens/run-eval/index.ts         |   3 +
 src/cli/tui/screens/run-eval/types.ts         |  28 ++
 .../tui/screens/run-eval/useRunEvalWizard.ts  |  83 ++++
 src/cli/tui/utils/commands.ts                 |   2 +-
 src/schema/schemas/agentcore-project.ts       |  22 +-
 .../__tests__/online-eval-config.test.ts      |  24 +-
 src/schema/schemas/primitives/evaluator.ts    |  16 +-
 src/schema/schemas/primitives/index.ts        |   1 +
 .../schemas/primitives/online-eval-config.ts  |   6 +-
 54 files changed, 2505 insertions(+), 881 deletions(-)
 create mode 100644 src/cli/tui/screens/eval/EvalHubScreen.tsx
 create mode 100644 src/cli/tui/screens/online-eval/OnlineEvalDashboard.tsx
 create mode 100644 src/cli/tui/screens/run-eval/RunEvalFlow.tsx
 create mode 100644 src/cli/tui/screens/run-eval/RunEvalScreen.tsx
 create mode 100644 src/cli/tui/screens/run-eval/RunScreen.tsx
 create mode 100644 src/cli/tui/screens/run-eval/index.ts
 create mode 100644 src/cli/tui/screens/run-eval/types.ts
 create mode 100644 src/cli/tui/screens/run-eval/useRunEvalWizard.ts

diff --git a/src/cli/aws/__tests__/agentcore-control.test.ts b/src/cli/aws/__tests__/agentcore-control.test.ts
index 75a5b769..3683eb08 100644
--- a/src/cli/aws/__tests__/agentcore-control.test.ts
+++ b/src/cli/aws/__tests__/agentcore-control.test.ts
@@ -3,7 +3,6 @@ import {
   getEvaluator,
   getOnlineEvaluationConfig,
   listEvaluators,
-  listOnlineEvaluationConfigs,
   updateOnlineEvalExecutionStatus,
 } from '../agentcore-control.js';
 import { beforeEach, describe, expect, it, vi } from 'vitest';
@@ -28,9 +27,6 @@ vi.mock('@aws-sdk/client-bedrock-agentcore-control', () => ({
   ListEvaluatorsCommand: class {
     constructor(public input: unknown) {}
   },
-  ListOnlineEvaluationConfigsCommand: class {
-    constructor(public input: unknown) {}
-  },
   UpdateOnlineEvaluationConfigCommand: class {
     constructor(public input: unknown) {}
   },
@@ -142,67 +138,6 @@ describe('getEvaluator', () => {
   });
 });
 
-describe('listEvaluators', () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
-  });
-
-  it('returns evaluator list', async () => {
-    mockSend.mockResolvedValue({
-      evaluators: [
-        {
-          evaluatorId: 'eval-1',
-          evaluatorArn: 'arn:1',
-          evaluatorName: 'builtin-help',
-          evaluatorType: 'Builtin',
-          level: 'SESSION',
-          status: 'ACTIVE',
-        },
-        {
-          evaluatorId: 'eval-2',
-          evaluatorArn: 'arn:2',
-          evaluatorName: 'custom-tone',
-          evaluatorType: 'Custom',
-          level: 'TRACE',
-          status: 'ACTIVE',
-          description: 'Tone checker',
-        },
-      ],
-      nextToken: 'page2',
-    });
-
-    const result = await listEvaluators({ region: 'us-east-1', maxResults: 10 });
-    expect(result.evaluators).toHaveLength(2);
-    expect(result.evaluators[0]!.evaluatorType).toBe('Builtin');
-    expect(result.evaluators[1]!.description).toBe('Tone checker');
-    expect(result.nextToken).toBe('page2');
-  });
-
-  it('returns empty list when no evaluators', async () => {
-    mockSend.mockResolvedValue({ evaluators: undefined });
-
-    const result = await listEvaluators({ region: 'us-east-1' });
-    expect(result.evaluators).toEqual([]);
-    expect(result.nextToken).toBeUndefined();
-  });
-
-  it('passes maxResults and nextToken in command', async () => {
-    mockSend.mockResolvedValue({ evaluators: [] });
-
-    await listEvaluators({ region: 'us-east-1', maxResults: 5, nextToken: 'tok-1' });
-
-    const command = mockSend.mock.calls[0]![0];
-    expect(command.input.maxResults).toBe(5);
-    expect(command.input.nextToken).toBe('tok-1');
-  });
-
-  it('propagates SDK errors', async () => {
-    mockSend.mockRejectedValue(new Error('Throttling'));
-
-    await expect(listEvaluators({ region: 'us-east-1' })).rejects.toThrow('Throttling');
-  });
-});
-
 describe('updateOnlineEvalExecutionStatus', () => {
   beforeEach(() => {
     vi.clearAllMocks();
@@ -370,60 +305,61 @@ describe('getOnlineEvaluationConfig', () => {
   });
 });
 
-describe('listOnlineEvaluationConfigs', () => {
+describe('listEvaluators', () => {
   beforeEach(() => {
     vi.clearAllMocks();
   });
 
-  it('returns config list', async () => {
+  it('returns evaluator summaries', async () => {
     mockSend.mockResolvedValue({
-      onlineEvaluationConfigs: [
+      evaluators: [
         {
-          onlineEvaluationConfigId: 'oec-1',
-          onlineEvaluationConfigArn: 'arn:1',
-          onlineEvaluationConfigName: 'eval-prod',
+          evaluatorId: 'eval-1',
+          evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-1',
+          evaluatorName: 'Faithfulness',
+          evaluatorType: 'Builtin',
           status: 'ACTIVE',
-          executionStatus: 'ENABLED',
         },
         {
-          onlineEvaluationConfigId: 'oec-2',
-          onlineEvaluationConfigArn: 'arn:2',
-          onlineEvaluationConfigName: 'eval-staging',
+          evaluatorId: 'eval-2',
+          evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-2',
+          evaluatorName: 'my-custom',
+          evaluatorType: 'Custom',
           status: 'ACTIVE',
-          executionStatus: 'DISABLED',
+          description: 'A custom evaluator',
         },
       ],
-      nextToken: 'next-page',
     });
 
-    const result = await listOnlineEvaluationConfigs({ region: 'us-east-1', maxResults: 20 });
-    expect(result.configs).toHaveLength(2);
-    expect(result.configs[0]!.configName).toBe('eval-prod');
-    expect(result.configs[1]!.executionStatus).toBe('DISABLED');
-    expect(result.nextToken).toBe('next-page');
+    const result = await listEvaluators({ region: 'us-east-1' });
+    expect(result.evaluators).toHaveLength(2);
+    expect(result.evaluators[0]!.evaluatorName).toBe('Faithfulness');
+    expect(result.evaluators[0]!.evaluatorType).toBe('Builtin');
+    expect(result.evaluators[1]!.evaluatorName).toBe('my-custom');
+    expect(result.evaluators[1]!.description).toBe('A custom evaluator');
   });
 
-  it('returns empty list when no configs', async () => {
-    mockSend.mockResolvedValue({ onlineEvaluationConfigs: undefined });
+  it('returns empty array when no evaluators', async () => {
+    mockSend.mockResolvedValue({ evaluators: undefined });
 
-    const result = await listOnlineEvaluationConfigs({ region: 'us-east-1' });
-    expect(result.configs).toEqual([]);
-    expect(result.nextToken).toBeUndefined();
+    const result = await listEvaluators({ region: 'us-east-1' });
+    expect(result.evaluators).toEqual([]);
   });
 
-  it('passes maxResults and nextToken in command', async () => {
-    mockSend.mockResolvedValue({ onlineEvaluationConfigs: [] });
+  it('passes maxResults and nextToken', async () => {
+    mockSend.mockResolvedValue({ evaluators: [], nextToken: 'token-2' });
 
-    await listOnlineEvaluationConfigs({ region: 'us-east-1', maxResults: 10, nextToken: 'tok-abc' });
+    const result = await listEvaluators({ region: 'us-east-1', maxResults: 5, nextToken: 'token-1' });
 
     const command = mockSend.mock.calls[0]![0];
-    expect(command.input.maxResults).toBe(10);
-    expect(command.input.nextToken).toBe('tok-abc');
+    expect(command.input.maxResults).toBe(5);
+    expect(command.input.nextToken).toBe('token-1');
+    expect(result.nextToken).toBe('token-2');
   });
 
   it('propagates SDK errors', async () => {
-    mockSend.mockRejectedValue(new Error('InternalServerError'));
+    mockSend.mockRejectedValue(new Error('AccessDeniedException'));
 
-    await expect(listOnlineEvaluationConfigs({ region: 'us-east-1' })).rejects.toThrow('InternalServerError');
+    await expect(listEvaluators({ region: 'us-east-1' })).rejects.toThrow('AccessDeniedException');
   });
 });
diff --git a/src/cli/aws/agentcore-control.ts b/src/cli/aws/agentcore-control.ts
index 070f2b25..40db8e48 100644
--- a/src/cli/aws/agentcore-control.ts
+++ b/src/cli/aws/agentcore-control.ts
@@ -1,11 +1,11 @@
 import { getCredentialProvider } from './account';
 import {
   BedrockAgentCoreControlClient,
+  DeleteOnlineEvaluationConfigCommand,
   GetAgentRuntimeCommand,
   GetEvaluatorCommand,
   GetOnlineEvaluationConfigCommand,
   ListEvaluatorsCommand,
-  ListOnlineEvaluationConfigsCommand,
   UpdateOnlineEvaluationConfigCommand,
 } from '@aws-sdk/client-bedrock-agentcore-control';
 
@@ -242,52 +242,38 @@ export async function getOnlineEvaluationConfig(
   };
 }
 
-export interface ListOnlineEvalConfigsOptions {
+// ============================================================================
+// Delete Online Eval Config
+// ============================================================================
+
+export interface DeleteOnlineEvalConfigOptions {
   region: string;
-  maxResults?: number;
-  nextToken?: string;
+  onlineEvaluationConfigId: string;
 }
 
-export interface OnlineEvalConfigSummary {
+export interface DeleteOnlineEvalConfigResult {
   configId: string;
   configArn: string;
-  configName: string;
   status: string;
-  executionStatus: string;
-  description?: string;
-  failureReason?: string;
 }
 
-export interface ListOnlineEvalConfigsResult {
-  configs: OnlineEvalConfigSummary[];
-  nextToken?: string;
-}
-
-export async function listOnlineEvaluationConfigs(
-  options: ListOnlineEvalConfigsOptions
-): Promise<ListOnlineEvalConfigsResult> {
+export async function deleteOnlineEvalConfig(
+  options: DeleteOnlineEvalConfigOptions
+): Promise<DeleteOnlineEvalConfigResult> {
   const client = new BedrockAgentCoreControlClient({
     region: options.region,
     credentials: getCredentialProvider(),
   });
 
-  const command = new ListOnlineEvaluationConfigsCommand({
-    maxResults: options.maxResults,
-    nextToken: options.nextToken,
+  const command = new DeleteOnlineEvaluationConfigCommand({
+    onlineEvaluationConfigId: options.onlineEvaluationConfigId,
   });
 
   const response = await client.send(command);
 
   return {
-    configs: (response.onlineEvaluationConfigs ?? []).map(c => ({
-      configId: c.onlineEvaluationConfigId ?? '',
-      configArn: c.onlineEvaluationConfigArn ?? '',
-      configName: c.onlineEvaluationConfigName ?? '',
-      status: c.status ?? 'UNKNOWN',
-      executionStatus: c.executionStatus ?? 'UNKNOWN',
-      description: c.description,
-      failureReason: c.failureReason,
-    })),
-    nextToken: response.nextToken,
+    configId: response.onlineEvaluationConfigId ?? options.onlineEvaluationConfigId,
+    configArn: response.onlineEvaluationConfigArn ?? '',
+    status: response.status ?? 'DELETING',
   };
 }
diff --git a/src/cli/cli.ts b/src/cli/cli.ts
index 621e0ef4..dc3b22e3 100644
--- a/src/cli/cli.ts
+++ b/src/cli/cli.ts
@@ -7,7 +7,7 @@ import { registerHelp } from './commands/help';
 import { registerInvoke } from './commands/invoke';
 import { registerLogs } from './commands/logs';
 import { registerPackage } from './commands/package';
-import { registerPause } from './commands/pause';
+import { registerPause, registerStop } from './commands/pause';
 import { registerRemove } from './commands/remove';
 import { registerResume } from './commands/resume';
 import { registerRun } from './commands/run';
@@ -143,6 +143,7 @@ export function registerCommands(program: Command) {
   const removeCmd = registerRemove(program);
   registerResume(program);
   registerRun(program);
+  registerStop(program);
   registerStatus(program);
   registerTraces(program);
   registerUpdate(program);
diff --git a/src/cli/commands/eval/command.tsx b/src/cli/commands/eval/command.tsx
index 0e0cf0b0..44bff493 100644
--- a/src/cli/commands/eval/command.tsx
+++ b/src/cli/commands/eval/command.tsx
@@ -1,14 +1,6 @@
-import {
-  getEvaluator,
-  getOnlineEvaluationConfig,
-  listEvaluators,
-  listOnlineEvaluationConfigs,
-  updateOnlineEvalConfig,
-} from '../../aws/agentcore-control';
-import type { OnlineEvalExecutionStatus } from '../../aws/agentcore-control';
-import { detectRegion } from '../../aws/region';
 import { getErrorMessage } from '../../errors';
-import { handleGetEvalRun, handleListEvalRuns } from '../../operations/eval';
+import { handleListEvalRuns } from '../../operations/eval';
+import { getResultsPath } from '../../operations/eval/storage';
 import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
 import { requireProject } from '../../tui/guards';
 import type { Command } from '@commander-js/extra-typings';
@@ -19,8 +11,8 @@ export const registerEval = (program: Command) => {
   const evalCmd = program.command('eval').description(COMMAND_DESCRIPTIONS.eval);
 
   evalCmd
-    .command('list')
-    .description('List past eval runs')
+    .command('history')
+    .description('Show past eval run results')
     .option('-a, --agent <name>', 'Filter by agent name')
     .option('-n, --limit <count>', 'Maximum number of runs to show')
     .option('--json', 'Output as JSON')
@@ -51,255 +43,25 @@ export const registerEval = (program: Command) => {
           return;
         }
 
-        console.log(
-          `\n${'Run ID'.padEnd(42)} ${'Agent'.padEnd(20)} ${'Evaluators'.padEnd(30)} ${'Sessions'.padEnd(10)} Date`
-        );
-        console.log('─'.repeat(120));
+        console.log(`\n${'Date'.padEnd(22)} ${'Agent'.padEnd(20)} ${'Evaluators'.padEnd(30)} Sessions`);
+        console.log('─'.repeat(90));
 
         for (const run of runs) {
           const scores = run.results.map(r => `${r.evaluator}=${r.aggregateScore.toFixed(2)}`).join(', ');
-          const date = new Date(run.timestamp).toLocaleDateString();
-          console.log(
-            `${run.runId.padEnd(42)} ${run.agent.padEnd(20)} ${scores.padEnd(30)} ${String(run.sessionCount).padEnd(10)} ${date}`
-          );
-        }
-        console.log('');
-      } catch (error) {
-        if (cliOptions.json) {
-          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-        } else {
-          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-        }
-        process.exit(1);
-      }
-    });
-
-  evalCmd
-    .command('get')
-    .description('Get details of a specific eval run')
-    .argument('<runId>', 'Eval run ID')
-    .option('--sessions', 'Show per-session score breakdown')
-    .option('--json', 'Output as JSON')
-    .action(
-      (
-        runId: string,
-        cliOptions: {
-          sessions?: boolean;
-          json?: boolean;
+          const date = new Date(run.timestamp).toLocaleString([], {
+            year: 'numeric',
+            month: 'short',
+            day: 'numeric',
+            hour: '2-digit',
+            minute: '2-digit',
+          });
+          console.log(`${date.padEnd(22)} ${run.agent.padEnd(20)} ${scores.padEnd(30)} ${run.sessionCount}`);
         }
-      ) => {
-        requireProject();
 
         try {
-          const result = handleGetEvalRun({ runId, sessions: cliOptions.sessions, json: cliOptions.json });
-
-          if (cliOptions.json) {
-            console.log(JSON.stringify(result));
-            process.exit(result.success ? 0 : 1);
-            return;
-          }
-
-          if (!result.success) {
-            render(<Text color="red">{result.error}</Text>);
-            process.exit(1);
-          }
-
-          const run = result.run!;
-          console.log(`\nEval Run: ${run.runId}`);
-          console.log(`Agent: ${run.agent}`);
-          console.log(`Date: ${new Date(run.timestamp).toISOString()}`);
-          console.log(`Sessions: ${run.sessionCount} | Lookback: ${run.lookbackDays}d\n`);
-
-          for (const r of run.results) {
-            const errors = r.sessionScores.filter(s => s.errorMessage).length;
-            console.log(`  ${r.evaluator}: ${r.aggregateScore.toFixed(2)}${errors > 0 ? ` (${errors} errors)` : ''}`);
-
-            if (r.tokenUsage) {
-              console.log(
-                `    Tokens: ${r.tokenUsage.totalTokens} (in: ${r.tokenUsage.inputTokens}, out: ${r.tokenUsage.outputTokens})`
-              );
-            }
-
-            if (cliOptions.sessions) {
-              console.log('');
-              for (const s of r.sessionScores) {
-                const status = s.errorMessage
-                  ? `ERROR: ${s.errorMessage}`
-                  : `${s.value.toFixed(2)}${s.label ? ` (${s.label})` : ''}`;
-                console.log(`    session=${s.sessionId}  ${status}`);
-                if (s.explanation) {
-                  console.log(`      ${s.explanation}`);
-                }
-              }
-            }
-            console.log('');
-          }
-        } catch (error) {
-          if (cliOptions.json) {
-            console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-          } else {
-            render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-          }
-          process.exit(1);
-        }
-      }
-    );
-
-  // ── WI-7: Evaluator Discovery Commands ──────────────────────────────
-
-  evalCmd
-    .command('list-evaluators')
-    .description('List available evaluators (built-in and custom)')
-    .option('--region <region>', 'AWS region')
-    .option('--max-results <n>', 'Maximum number of results')
-    .option('--json', 'Output as JSON')
-    .action(async (cliOptions: { region?: string; maxResults?: string; json?: boolean }) => {
-      try {
-        const region = cliOptions.region ?? (await detectRegion()).region;
-        const result = await listEvaluators({
-          region,
-          maxResults: cliOptions.maxResults ? parseInt(cliOptions.maxResults, 10) : undefined,
-        });
-
-        if (cliOptions.json) {
-          console.log(JSON.stringify(result));
-          return;
-        }
-
-        if (result.evaluators.length === 0) {
-          console.log('No evaluators found.');
-          return;
-        }
-
-        console.log(`\n${'ID'.padEnd(45)} ${'Name'.padEnd(30)} ${'Type'.padEnd(10)} ${'Level'.padEnd(12)} Status`);
-        console.log('─'.repeat(110));
-
-        for (const e of result.evaluators) {
-          console.log(
-            `${e.evaluatorId.padEnd(45)} ${e.evaluatorName.padEnd(30)} ${e.evaluatorType.padEnd(10)} ${(e.level ?? '—').padEnd(12)} ${e.status}`
-          );
-        }
-        console.log('');
-      } catch (error) {
-        if (cliOptions.json) {
-          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-        } else {
-          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-        }
-        process.exit(1);
-      }
-    });
-
-  evalCmd
-    .command('get-evaluator')
-    .description('Get details of a specific evaluator')
-    .argument('<evaluatorId>', 'Evaluator ID')
-    .option('--region <region>', 'AWS region')
-    .option('--json', 'Output as JSON')
-    .action(async (evaluatorId: string, cliOptions: { region?: string; json?: boolean }) => {
-      try {
-        const region = cliOptions.region ?? (await detectRegion()).region;
-        const result = await getEvaluator({ region, evaluatorId });
-
-        if (cliOptions.json) {
-          console.log(JSON.stringify(result));
-          return;
-        }
-
-        console.log(`\nEvaluator: ${result.evaluatorName}`);
-        console.log(`ID: ${result.evaluatorId}`);
-        console.log(`ARN: ${result.evaluatorArn}`);
-        console.log(`Level: ${result.level}`);
-        console.log(`Status: ${result.status}`);
-        if (result.description) {
-          console.log(`Description: ${result.description}`);
-        }
-        console.log('');
-      } catch (error) {
-        if (cliOptions.json) {
-          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-        } else {
-          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-        }
-        process.exit(1);
-      }
-    });
-
-  // ── WI-8: Online Eval Config Inspection Commands ────────────────────
-
-  evalCmd
-    .command('list-online')
-    .description('List online evaluation configs')
-    .option('--region <region>', 'AWS region')
-    .option('--max-results <n>', 'Maximum number of results')
-    .option('--json', 'Output as JSON')
-    .action(async (cliOptions: { region?: string; maxResults?: string; json?: boolean }) => {
-      try {
-        const region = cliOptions.region ?? (await detectRegion()).region;
-        const result = await listOnlineEvaluationConfigs({
-          region,
-          maxResults: cliOptions.maxResults ? parseInt(cliOptions.maxResults, 10) : undefined,
-        });
-
-        if (cliOptions.json) {
-          console.log(JSON.stringify(result));
-          return;
-        }
-
-        if (result.configs.length === 0) {
-          console.log('No online eval configs found.');
-          return;
-        }
-
-        console.log(`\n${'ID'.padEnd(50)} ${'Name'.padEnd(30)} ${'Status'.padEnd(18)} Execution`);
-        console.log('─'.repeat(115));
-
-        for (const c of result.configs) {
-          const failSuffix = c.failureReason ? ` (${c.failureReason})` : '';
-          console.log(
-            `${c.configId.padEnd(50)} ${c.configName.padEnd(30)} ${c.status.padEnd(18)} ${c.executionStatus}${failSuffix}`
-          );
-        }
-        console.log('');
-      } catch (error) {
-        if (cliOptions.json) {
-          console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-        } else {
-          render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-        }
-        process.exit(1);
-      }
-    });
-
-  evalCmd
-    .command('get-online')
-    .description('Get details of a specific online evaluation config')
-    .argument('<configId>', 'Online evaluation config ID')
-    .option('--region <region>', 'AWS region')
-    .option('--json', 'Output as JSON')
-    .action(async (configId: string, cliOptions: { region?: string; json?: boolean }) => {
-      try {
-        const region = cliOptions.region ?? (await detectRegion()).region;
-        const result = await getOnlineEvaluationConfig({ region, configId });
-
-        if (cliOptions.json) {
-          console.log(JSON.stringify(result));
-          return;
-        }
-
-        console.log(`\nOnline Eval Config: ${result.configName}`);
-        console.log(`ID: ${result.configId}`);
-        console.log(`ARN: ${result.configArn}`);
-        console.log(`Status: ${result.status}`);
-        console.log(`Execution: ${result.executionStatus}`);
-        if (result.description) {
-          console.log(`Description: ${result.description}`);
-        }
-        if (result.failureReason) {
-          console.log(`Failure: ${result.failureReason}`);
-        }
-        if (result.outputLogGroupName) {
-          console.log(`Log Group: ${result.outputLogGroupName}`);
+          console.log(`\nResults saved in: ${getResultsPath()}`);
+        } catch {
+          // ignore — no project context
         }
         console.log('');
       } catch (error) {
@@ -311,67 +73,4 @@ export const registerEval = (program: Command) => {
         process.exit(1);
       }
     });
-
-  // ── WI-9: Online Eval Config Update ─────────────────────────────────
-
-  evalCmd
-    .command('update-online')
-    .description('Update a deployed online evaluation config')
-    .argument('<configId>', 'Online evaluation config ID')
-    .option('--status <status>', 'Set execution status (ENABLED or DISABLED)')
-    .option('--description <text>', 'Set config description')
-    .option('--region <region>', 'AWS region')
-    .option('--json', 'Output as JSON')
-    .action(
-      async (
-        configId: string,
-        cliOptions: { status?: string; description?: string; region?: string; json?: boolean }
-      ) => {
-        try {
-          if (!cliOptions.status && cliOptions.description === undefined) {
-            const error = 'At least one of --status or --description is required';
-            if (cliOptions.json) {
-              console.log(JSON.stringify({ success: false, error }));
-            } else {
-              render(<Text color="red">{error}</Text>);
-            }
-            process.exit(1);
-          }
-
-          if (cliOptions.status && !['ENABLED', 'DISABLED'].includes(cliOptions.status)) {
-            const error = `Invalid status "${cliOptions.status}". Must be ENABLED or DISABLED.`;
-            if (cliOptions.json) {
-              console.log(JSON.stringify({ success: false, error }));
-            } else {
-              render(<Text color="red">{error}</Text>);
-            }
-            process.exit(1);
-          }
-
-          const region = cliOptions.region ?? (await detectRegion()).region;
-          const result = await updateOnlineEvalConfig({
-            region,
-            onlineEvaluationConfigId: configId,
-            executionStatus: cliOptions.status as OnlineEvalExecutionStatus | undefined,
-            description: cliOptions.description,
-          });
-
-          if (cliOptions.json) {
-            console.log(JSON.stringify(result));
-            return;
-          }
-
-          console.log(`Updated online eval config "${configId}"`);
-          console.log(`  Status: ${result.status}`);
-          console.log(`  Execution: ${result.executionStatus}`);
-        } catch (error) {
-          if (cliOptions.json) {
-            console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-          } else {
-            render(<Text color="red">Error: {getErrorMessage(error)}</Text>);
-          }
-          process.exit(1);
-        }
-      }
-    );
 };
diff --git a/src/cli/commands/pause/command.tsx b/src/cli/commands/pause/command.tsx
index a70bb17e..6e5ca7d8 100644
--- a/src/cli/commands/pause/command.tsx
+++ b/src/cli/commands/pause/command.tsx
@@ -1,10 +1,12 @@
 import { getErrorMessage } from '../../errors';
-import { handlePauseResume } from '../../operations/eval';
+import { handleDeleteOnlineEval, handlePauseResume } from '../../operations/eval';
+import type { OnlineEvalActionOptions } from '../../operations/eval';
 import { COMMAND_DESCRIPTIONS } from '../../tui/copy';
 import { requireProject } from '../../tui/guards';
 import type { Command } from '@commander-js/extra-typings';
 import { Text, render } from 'ink';
 import React from 'react';
+import * as readline from 'readline';
 
 function registerOnlineEvalSubcommand(parent: Command, action: 'pause' | 'resume') {
   const description = action === 'pause' ? 'Pause a deployed online eval config' : 'Resume a paused online eval config';
@@ -13,18 +15,40 @@ function registerOnlineEvalSubcommand(parent: Command, action: 'pause' | 'resume
   parent
     .command('online-eval')
     .description(description)
-    .argument('<name>', 'Online eval config name')
+    .argument('[name]', 'Online eval config name (from project config)')
+    .option('--arn <arn>', 'Online eval config ARN (direct mode, bypasses project config)')
+    .option('--region <region>', 'AWS region (used with --arn)')
     .option('--json', 'Output as JSON')
-    .action(async (name: string, cliOptions: { json?: boolean }) => {
-      requireProject();
+    .action(async (name: string | undefined, cliOptions: { arn?: string; region?: string; json?: boolean }) => {
+      if (!cliOptions.arn && !name) {
+        const error = 'Either a config name or --arn is required';
+        if (cliOptions.json) {
+          console.log(JSON.stringify({ success: false, error }));
+        } else {
+          render(<Text color="red">{error}</Text>);
+        }
+        process.exit(1);
+      }
+
+      if (!cliOptions.arn) {
+        requireProject();
+      }
+
+      const options: OnlineEvalActionOptions = {
+        name: name ?? '',
+        arn: cliOptions.arn,
+        region: cliOptions.region,
+        json: cliOptions.json,
+      };
 
       try {
-        const result = await handlePauseResume({ name, json: cliOptions.json }, action);
+        const result = await handlePauseResume(options, action);
 
         if (cliOptions.json) {
           console.log(JSON.stringify(result));
         } else if (result.success) {
-          console.log(`${pastTense} online eval config "${name}" (status: ${result.executionStatus})`);
+          const displayName = cliOptions.arn ? result.configId : name;
+          console.log(`${pastTense} online eval config "${displayName}" (status: ${result.executionStatus})`);
         } else {
           render(<Text color="red">{result.error}</Text>);
         }
@@ -41,6 +65,16 @@ function registerOnlineEvalSubcommand(parent: Command, action: 'pause' | 'resume
     });
 }
 
+/**
+ * Asks `prompt` on stdin/stdout; resolves true only for "y"/"yes" (any case, surrounding whitespace ignored).
+ */
+function askConfirmation(prompt: string): Promise<boolean> {
+  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+  return new Promise<string>(resolve => rl.question(prompt, resolve))
+    .finally(() => rl.close()) // close the interface before the answer is evaluated
+    .then(answer => ['y', 'yes'].includes(answer.trim().toLowerCase()));
+}
+
 export const registerPause = (program: Command) => {
   const pauseCmd = program.command('pause').description(COMMAND_DESCRIPTIONS.pause);
   registerOnlineEvalSubcommand(pauseCmd, 'pause');
@@ -50,3 +84,76 @@ export const registerResume = (program: Command) => {
   const resumeCmd = program.command('resume').description(COMMAND_DESCRIPTIONS.resume);
   registerOnlineEvalSubcommand(resumeCmd, 'resume');
 };
+
+/** Registers `stop online-eval`: confirms unless --yes/--json, then delegates to handleDeleteOnlineEval. */
+export const registerStop = (program: Command) => {
+  program
+    .command('stop')
+    .description(COMMAND_DESCRIPTIONS.stop)
+    .command('online-eval')
+    .description('Delete a deployed online eval config')
+    .argument('[name]', 'Online eval config name (from project config)')
+    .option('--arn <arn>', 'Online eval config ARN (direct mode, bypasses project config)')
+    .option('--region <region>', 'AWS region (used with --arn)')
+    .option('--json', 'Output as JSON')
+    .option('-y, --yes', 'Skip confirmation prompt')
+    .action(
+      async (
+        name: string | undefined,
+        cliOptions: { arn?: string; region?: string; json?: boolean; yes?: boolean }
+      ) => {
+        const { arn, region, json, yes } = cliOptions;
+
+        // A target is mandatory: either the positional name or --arn.
+        if (!arn && !name) {
+          const error = 'Either a config name or --arn is required';
+          if (json) {
+            console.log(JSON.stringify({ success: false, error }));
+          } else {
+            render(<Text color="red">{error}</Text>);
+          }
+          process.exit(1);
+        }
+
+        // The project guard only applies to name-based lookups; --arn mode skips it.
+        if (!arn) {
+          requireProject();
+        }
+
+        const displayName = arn ?? name;
+
+        // Prompt only when neither --yes nor --json was passed.
+        if (!(yes || json)) {
+          const question = `Are you sure you want to delete online eval config "${displayName}"? This action cannot be undone. (y/N) `;
+          if (!(await askConfirmation(question))) {
+            console.log('Aborted.');
+            process.exit(0);
+          }
+        }
+
+        const request: OnlineEvalActionOptions = { name: name ?? '', arn, region, json };
+
+        try {
+          const result = await handleDeleteOnlineEval(request);
+
+          if (json) {
+            console.log(JSON.stringify(result));
+          } else if (result.success) {
+            console.log(`Deleted online eval config "${displayName}" (status: ${result.status})`);
+          } else {
+            render(<Text color="red">{result.error}</Text>);
+          }
+
+          process.exit(result.success ? 0 : 1);
+        } catch (error) {
+          const message = getErrorMessage(error);
+          if (json) {
+            console.log(JSON.stringify({ success: false, error: message }));
+          } else {
+            render(<Text color="red">Error: {message}</Text>);
+          }
+          process.exit(1);
+        }
+      }
+    );
+};
diff --git a/src/cli/commands/pause/index.ts b/src/cli/commands/pause/index.ts
index 858054fd..183fc104 100644
--- a/src/cli/commands/pause/index.ts
+++ b/src/cli/commands/pause/index.ts
@@ -1 +1 @@
-export { registerPause } from './command';
+export { registerPause, registerStop } from './command';
diff --git a/src/cli/commands/resume/command.tsx b/src/cli/commands/resume/command.tsx
index 15b214f0..24e24241 100644
--- a/src/cli/commands/resume/command.tsx
+++ b/src/cli/commands/resume/command.tsx
@@ -1 +1 @@
-export { registerResume } from '../pause/command';
+export { registerResume, registerStop } from '../pause/command';
diff --git a/src/cli/commands/run/command.tsx b/src/cli/commands/run/command.tsx
index bf0400e3..b3ea2790 100644
--- a/src/cli/commands/run/command.tsx
+++ b/src/cli/commands/run/command.tsx
@@ -11,8 +11,14 @@ function formatRunOutput(result: Awaited<ReturnType<typeof handleRunEval>>): voi
   if (!result.run) return;
 
   const { run } = result;
-  console.log(`\nEval Run: ${run.runId}`);
-  console.log(`Agent: ${run.agent} | Sessions: ${run.sessionCount} | Lookback: ${run.lookbackDays}d\n`);
+  const date = new Date(run.timestamp).toLocaleString([], {
+    year: 'numeric',
+    month: 'short',
+    day: 'numeric',
+    hour: '2-digit',
+    minute: '2-digit',
+  });
+  console.log(`\nAgent: ${run.agent} | ${date} | Sessions: ${run.sessionCount} | Lookback: ${run.lookbackDays}d\n`);
 
   for (const r of run.results) {
     const score = r.aggregateScore.toFixed(2);
@@ -55,9 +61,7 @@ export const registerRun = (program: Command) => {
         output?: string;
         json?: boolean;
       }) => {
-        if (!cliOptions.agentArn) {
-          requireProject();
-        }
+        requireProject();
 
         if (!cliOptions.evaluator && !cliOptions.evaluatorArn) {
           const error = 'At least one --evaluator or --evaluator-arn is required';
diff --git a/src/cli/commands/status/__tests__/action.test.ts b/src/cli/commands/status/__tests__/action.test.ts
index eeb6310a..731c8a82 100644
--- a/src/cli/commands/status/__tests__/action.test.ts
+++ b/src/cli/commands/status/__tests__/action.test.ts
@@ -343,7 +343,7 @@ describe('computeResourceStatuses', () => {
   it('marks online-eval config as deployed when in both local and deployed state', () => {
     const project = {
       ...baseProject,
-      onlineEvalConfigs: [{ name: 'TestConfig', agents: ['Agent1'], evaluators: ['Builtin.Helpfulness'] }],
+      onlineEvalConfigs: [{ name: 'TestConfig', evaluators: ['Builtin.Helpfulness'], samplingRate: 10 }],
     } as unknown as AgentCoreProjectSpec;
 
     const resources: DeployedResourceState = {
@@ -360,13 +360,13 @@ describe('computeResourceStatuses', () => {
 
     expect(configEntry).toBeDefined();
     expect(configEntry!.deploymentState).toBe('deployed');
-    expect(configEntry!.detail).toBe('1 agent, 1 evaluator');
+    expect(configEntry!.detail).toBe('1 evaluator, 10% sampling');
   });
 
   it('marks online-eval config as local-only when not deployed', () => {
     const project = {
       ...baseProject,
-      onlineEvalConfigs: [{ name: 'TestConfig', agents: ['A', 'B'], evaluators: ['Builtin.X', 'Builtin.Y', 'Custom'] }],
+      onlineEvalConfigs: [{ name: 'TestConfig', evaluators: ['Builtin.X', 'Builtin.Y', 'Custom'], samplingRate: 50 }],
     } as unknown as AgentCoreProjectSpec;
 
     const result = computeResourceStatuses(project, undefined);
@@ -374,7 +374,7 @@ describe('computeResourceStatuses', () => {
 
     expect(configEntry).toBeDefined();
     expect(configEntry!.deploymentState).toBe('local-only');
-    expect(configEntry!.detail).toBe('2 agents, 3 evaluators');
+    expect(configEntry!.detail).toBe('3 evaluators, 50% sampling');
   });
 
   it('marks online-eval config as pending-removal when deployed but removed from schema', () => {
@@ -452,7 +452,7 @@ describe('handleProjectStatus — live enrichment', () => {
       project: {
         ...baseProject,
         evaluators: [{ name: 'MyEval', level: 'SESSION', config: {} }],
-        onlineEvalConfigs: [{ name: 'MyConfig', agents: ['agent1'], evaluators: ['Builtin.Helpfulness'] }],
+        onlineEvalConfigs: [{ name: 'MyConfig', evaluators: ['Builtin.Helpfulness'], samplingRate: 10 }],
       } as unknown as AgentCoreProjectSpec,
       awsTargets: [{ name: 'dev', region: 'us-east-1', account: '123456789' }],
       deployedState: {
diff --git a/src/cli/commands/status/action.ts b/src/cli/commands/status/action.ts
index 49b3ebb1..fde41a5c 100644
--- a/src/cli/commands/status/action.ts
+++ b/src/cli/commands/status/action.ts
@@ -167,7 +167,7 @@ export function computeResourceStatuses(
     deployedRecord: resources?.onlineEvalConfigs ?? {},
     getIdentifier: deployed => deployed.onlineEvaluationConfigArn,
     getLocalDetail: item =>
-      `${item.agents.length} agent${item.agents.length !== 1 ? 's' : ''}, ${item.evaluators.length} evaluator${item.evaluators.length !== 1 ? 's' : ''}`,
+      `${item.evaluators.length} evaluator${item.evaluators.length !== 1 ? 's' : ''}, ${item.samplingRate}% sampling`,
   });
 
   return [...agents, ...credentials, ...memories, ...gateways, ...evaluators, ...onlineEvalConfigs];
diff --git a/src/cli/operations/eval/__tests__/get-eval-run.test.ts b/src/cli/operations/eval/__tests__/get-eval-run.test.ts
index d019835b..6007221d 100644
--- a/src/cli/operations/eval/__tests__/get-eval-run.test.ts
+++ b/src/cli/operations/eval/__tests__/get-eval-run.test.ts
@@ -9,7 +9,6 @@ vi.mock('../storage', () => ({
 }));
 
 const sampleRun: EvalRunResult = {
-  runId: 'run_abc',
   timestamp: '2025-01-15T10:00:00.000Z',
   agent: 'test-agent',
   evaluators: ['Builtin.GoalSuccessRate'],
@@ -30,22 +29,22 @@ describe('handleGetEvalRun', () => {
   it('returns the run on success', () => {
     mockLoadEvalRun.mockReturnValue(sampleRun);
 
-    const result = handleGetEvalRun({ runId: 'run_abc' });
+    const result = handleGetEvalRun({ filename: 'eval_2025-01-15_10-00-00' });
 
     expect(result.success).toBe(true);
     expect(result.run).toEqual(sampleRun);
-    expect(mockLoadEvalRun).toHaveBeenCalledWith('run_abc');
+    expect(mockLoadEvalRun).toHaveBeenCalledWith('eval_2025-01-15_10-00-00');
   });
 
   it('returns error when run is not found', () => {
     mockLoadEvalRun.mockImplementation(() => {
-      throw new Error('Eval run "run_missing" not found');
+      throw new Error('Eval run "eval_2025-01-01_00-00-00" not found');
     });
 
-    const result = handleGetEvalRun({ runId: 'run_missing' });
+    const result = handleGetEvalRun({ filename: 'eval_2025-01-01_00-00-00' });
 
     expect(result.success).toBe(false);
-    expect(result.error).toContain('run_missing');
+    expect(result.error).toContain('not found');
     expect(result.run).toBeUndefined();
   });
 
@@ -54,7 +53,7 @@ describe('handleGetEvalRun', () => {
       throw new Error('string error');
     });
 
-    const result = handleGetEvalRun({ runId: 'run_bad' });
+    const result = handleGetEvalRun({ filename: 'eval_bad' });
 
     expect(result.success).toBe(false);
     expect(result.error).toBe('string error');
diff --git a/src/cli/operations/eval/__tests__/list-eval-runs.test.ts b/src/cli/operations/eval/__tests__/list-eval-runs.test.ts
index 52c68ee7..c9a71a8c 100644
--- a/src/cli/operations/eval/__tests__/list-eval-runs.test.ts
+++ b/src/cli/operations/eval/__tests__/list-eval-runs.test.ts
@@ -8,10 +8,9 @@ vi.mock('../storage', () => ({
   listEvalRuns: () => mockListEvalRuns(),
 }));
 
-function makeRun(agent: string, runId: string): EvalRunResult {
+function makeRun(agent: string, timestamp: string): EvalRunResult {
   return {
-    runId,
-    timestamp: '2025-01-15T10:00:00.000Z',
+    timestamp,
     agent,
     evaluators: ['Builtin.GoalSuccessRate'],
     lookbackDays: 7,
@@ -24,7 +23,7 @@ describe('handleListEvalRuns', () => {
   afterEach(() => vi.clearAllMocks());
 
   it('returns all runs when no filters specified', () => {
-    const runs = [makeRun('agent-a', 'run_1'), makeRun('agent-b', 'run_2')];
+    const runs = [makeRun('agent-a', '2025-01-15T10:00:00.000Z'), makeRun('agent-b', '2025-01-15T11:00:00.000Z')];
     mockListEvalRuns.mockReturnValue(runs);
 
     const result = handleListEvalRuns({});
@@ -34,7 +33,11 @@ describe('handleListEvalRuns', () => {
   });
 
   it('filters by agent name', () => {
-    const runs = [makeRun('agent-a', 'run_1'), makeRun('agent-b', 'run_2'), makeRun('agent-a', 'run_3')];
+    const runs = [
+      makeRun('agent-a', '2025-01-15T10:00:00.000Z'),
+      makeRun('agent-b', '2025-01-15T11:00:00.000Z'),
+      makeRun('agent-a', '2025-01-15T12:00:00.000Z'),
+    ];
     mockListEvalRuns.mockReturnValue(runs);
 
     const result = handleListEvalRuns({ agent: 'agent-a' });
@@ -45,7 +48,11 @@ describe('handleListEvalRuns', () => {
   });
 
   it('limits the number of results', () => {
-    const runs = [makeRun('a', 'run_1'), makeRun('a', 'run_2'), makeRun('a', 'run_3')];
+    const runs = [
+      makeRun('a', '2025-01-15T10:00:00.000Z'),
+      makeRun('a', '2025-01-15T11:00:00.000Z'),
+      makeRun('a', '2025-01-15T12:00:00.000Z'),
+    ];
     mockListEvalRuns.mockReturnValue(runs);
 
     const result = handleListEvalRuns({ limit: 2 });
@@ -55,14 +62,19 @@ describe('handleListEvalRuns', () => {
   });
 
   it('applies agent filter before limit', () => {
-    const runs = [makeRun('a', 'run_1'), makeRun('b', 'run_2'), makeRun('a', 'run_3'), makeRun('a', 'run_4')];
+    const runs = [
+      makeRun('a', '2025-01-15T10:00:00.000Z'),
+      makeRun('b', '2025-01-15T11:00:00.000Z'),
+      makeRun('a', '2025-01-15T12:00:00.000Z'),
+      makeRun('a', '2025-01-15T13:00:00.000Z'),
+    ];
     mockListEvalRuns.mockReturnValue(runs);
 
     const result = handleListEvalRuns({ agent: 'a', limit: 2 });
 
     expect(result.runs).toHaveLength(2);
-    expect(result.runs![0]!.runId).toBe('run_1');
-    expect(result.runs![1]!.runId).toBe('run_3');
+    expect(result.runs![0]!.timestamp).toBe('2025-01-15T10:00:00.000Z');
+    expect(result.runs![1]!.timestamp).toBe('2025-01-15T12:00:00.000Z');
   });
 
   it('returns empty array when no runs exist', () => {
diff --git a/src/cli/operations/eval/__tests__/logs-eval.test.ts b/src/cli/operations/eval/__tests__/logs-eval.test.ts
index a512a1a0..2a0d5c14 100644
--- a/src/cli/operations/eval/__tests__/logs-eval.test.ts
+++ b/src/cli/operations/eval/__tests__/logs-eval.test.ts
@@ -27,7 +27,7 @@ vi.mock('../../../../lib/utils', () => ({
 
 function makeContext({
   agentName = 'my-agent',
-  onlineEvalConfigs = [{ name: 'eval-config', agents: ['my-agent'] }] as { name: string; agents: string[] }[],
+  onlineEvalConfigs = [{ name: 'eval-config' }] as { name: string }[],
   deployedConfigId = 'cfg-123',
 } = {}) {
   return {
@@ -209,22 +209,6 @@ describe('handleLogsEval', () => {
     );
   });
 
-  it('only resolves configs that match the target agent', async () => {
-    const ctx = makeContext({
-      agentName: 'my-agent',
-      onlineEvalConfigs: [
-        { name: 'eval-config', agents: ['other-agent'] }, // doesn't match
-      ],
-    });
-    mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
-    mockResolveAgent.mockReturnValue(makeResolvedAgent());
-
-    const result = await handleLogsEval({});
-
-    expect(result.success).toBe(false);
-    expect(result.error).toContain('No deployed online eval configs found');
-  });
-
   it('uses log group name from API when available', async () => {
     const ctx = makeContext();
     mockLoadDeployedProjectConfig.mockResolvedValue(ctx);
diff --git a/src/cli/operations/eval/__tests__/run-eval.test.ts b/src/cli/operations/eval/__tests__/run-eval.test.ts
index 8a99d9f1..4ce1a9ae 100644
--- a/src/cli/operations/eval/__tests__/run-eval.test.ts
+++ b/src/cli/operations/eval/__tests__/run-eval.test.ts
@@ -8,7 +8,7 @@ const mockLoadDeployedProjectConfig = vi.fn();
 const mockEvaluate = vi.fn();
 const mockGetEvaluator = vi.fn();
 const mockSaveEvalRun = vi.fn();
-const mockGenerateRunId = vi.fn();
+const mockGenerateFilename = vi.fn();
 const mockSend = vi.fn();
 const mockGetCredentialProvider = vi.fn().mockReturnValue({});
 const mockWriteFileSync = vi.fn();
@@ -31,7 +31,7 @@ vi.mock('../../../aws', () => ({
 }));
 
 vi.mock('../storage', () => ({
-  generateRunId: () => mockGenerateRunId(),
+  generateFilename: (...args: unknown[]) => mockGenerateFilename(...args),
   saveEvalRun: (...args: unknown[]) => mockSaveEvalRun(...args),
 }));
 
@@ -136,8 +136,8 @@ function setupCloudWatchToReturn(spanRows: unknown[][], runtimeLogRows: unknown[
 
 describe('handleRunEval', () => {
   beforeEach(() => {
-    mockGenerateRunId.mockReturnValue('run_test-123');
-    mockSaveEvalRun.mockReturnValue('/tmp/eval-results/run_test-123.json');
+    mockGenerateFilename.mockReturnValue('eval_2025-01-15_10-00-00');
+    mockSaveEvalRun.mockReturnValue('/tmp/eval-results/eval_2025-01-15_10-00-00.json');
   });
 
   afterEach(() => vi.clearAllMocks());
@@ -394,7 +394,7 @@ describe('handleRunEval', () => {
     expect(result.success).toBe(true);
     expect(mockSaveEvalRun).toHaveBeenCalled();
     expect(mockWriteFileSync).not.toHaveBeenCalled();
-    expect(result.filePath).toBe('/tmp/eval-results/run_test-123.json');
+    expect(result.filePath).toBe('/tmp/eval-results/eval_2025-01-15_10-00-00.json');
   });
 
   it('writes to custom output path when --output is specified', async () => {
@@ -562,8 +562,11 @@ describe('handleRunEval', () => {
     expect(result.success).toBe(true);
     // Should write to cwd, not call saveEvalRun (which requires a project)
     expect(mockSaveEvalRun).not.toHaveBeenCalled();
-    expect(mockWriteFileSync).toHaveBeenCalledWith(expect.stringContaining('run_test-123.json'), expect.any(String));
-    expect(result.filePath).toContain('run_test-123.json');
+    expect(mockWriteFileSync).toHaveBeenCalledWith(
+      expect.stringContaining('eval_2025-01-15_10-00-00.json'),
+      expect.any(String)
+    );
+    expect(result.filePath).toContain('eval_2025-01-15_10-00-00.json');
   });
 
   it('saves to --output path in ARN mode', async () => {
diff --git a/src/cli/operations/eval/__tests__/storage.test.ts b/src/cli/operations/eval/__tests__/storage.test.ts
index db56e34b..1ce0fbe6 100644
--- a/src/cli/operations/eval/__tests__/storage.test.ts
+++ b/src/cli/operations/eval/__tests__/storage.test.ts
@@ -1,4 +1,4 @@
-import { generateRunId, listEvalRuns, loadEvalRun, saveEvalRun } from '../storage.js';
+import { generateFilename, getResultsPath, listEvalRuns, loadEvalRun, saveEvalRun } from '../storage.js';
 import type { EvalRunResult } from '../types.js';
 // Use real fs via a temp directory
 import { existsSync, mkdirSync, rmSync } from 'fs';
@@ -20,7 +20,6 @@ function makeTmpDir(): string {
 
 function makeRunResult(overrides: Partial<EvalRunResult> = {}): EvalRunResult {
   return {
-    runId: overrides.runId ?? `run_${Date.now()}`,
     timestamp: '2025-01-15T10:00:00.000Z',
     agent: 'test-agent',
     evaluators: ['Builtin.GoalSuccessRate'],
@@ -52,48 +51,60 @@ describe('storage', () => {
     vi.clearAllMocks();
   });
 
-  describe('generateRunId', () => {
-    it('returns a string starting with run_', () => {
-      const id = generateRunId();
-      expect(id).toMatch(/^run_[0-9a-f-]+$/);
+  describe('generateFilename', () => {
+    it('returns a string starting with eval_', () => {
+      const name = generateFilename('2025-01-15T10:30:45.000Z');
+      expect(name).toMatch(/^eval_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}$/);
     });
 
-    it('generates unique IDs', () => {
-      const ids = new Set(Array.from({ length: 100 }, () => generateRunId()));
-      expect(ids.size).toBe(100);
+    it('formats timestamp correctly', () => {
+      const name = generateFilename('2025-03-05T08:05:09.000Z');
+      expect(name).toBe('eval_2025-03-05_08-05-09');
     });
   });
 
   describe('saveEvalRun', () => {
     it('creates eval-results directory and writes JSON file', () => {
-      const run = makeRunResult({ runId: 'run_save-test' });
+      const run = makeRunResult();
       const filePath = saveEvalRun(run);
 
       expect(filePath).toContain('eval-results');
-      expect(filePath).toContain('run_save-test.json');
+      expect(filePath).toContain('eval_2025-01-15');
+      expect(filePath.endsWith('.json')).toBe(true);
       expect(existsSync(filePath)).toBe(true);
     });
 
     it('writes valid JSON that can be read back', () => {
-      const run = makeRunResult({ runId: 'run_roundtrip' });
+      const run = makeRunResult();
       saveEvalRun(run);
-      const loaded = loadEvalRun('run_roundtrip');
+      const filename = generateFilename(run.timestamp);
+      const loaded = loadEvalRun(filename);
       expect(loaded).toEqual(run);
     });
   });
 
   describe('loadEvalRun', () => {
     it('loads a previously saved run', () => {
-      const run = makeRunResult({ runId: 'run_load-test', agent: 'my-agent' });
+      const run = makeRunResult({ agent: 'my-agent' });
       saveEvalRun(run);
 
-      const loaded = loadEvalRun('run_load-test');
+      const filename = generateFilename(run.timestamp);
+      const loaded = loadEvalRun(filename);
       expect(loaded.agent).toBe('my-agent');
       expect(loaded.results).toHaveLength(1);
     });
 
-    it('throws for a non-existent run ID', () => {
-      expect(() => loadEvalRun('run_does-not-exist')).toThrow('Eval run "run_does-not-exist" not found');
+    it('accepts filename with .json extension', () => {
+      const run = makeRunResult();
+      saveEvalRun(run);
+
+      const filename = generateFilename(run.timestamp);
+      const loaded = loadEvalRun(`${filename}.json`);
+      expect(loaded).toEqual(run);
+    });
+
+    it('throws for a non-existent filename', () => {
+      expect(() => loadEvalRun('eval_2099-01-01_00-00-00')).toThrow('not found');
     });
   });
 
@@ -109,34 +120,41 @@ describe('storage', () => {
     });
 
     it('returns saved runs', () => {
-      saveEvalRun(makeRunResult({ runId: 'run_aaa' }));
-      saveEvalRun(makeRunResult({ runId: 'run_bbb' }));
+      saveEvalRun(makeRunResult({ timestamp: '2025-01-15T10:00:00.000Z' }));
+      saveEvalRun(makeRunResult({ timestamp: '2025-01-15T11:00:00.000Z' }));
 
       const runs = listEvalRuns();
       expect(runs).toHaveLength(2);
     });
 
     it('returns runs in reverse sorted order (newest first)', () => {
-      saveEvalRun(makeRunResult({ runId: 'run_aaa' }));
-      saveEvalRun(makeRunResult({ runId: 'run_zzz' }));
-      saveEvalRun(makeRunResult({ runId: 'run_mmm' }));
+      saveEvalRun(makeRunResult({ timestamp: '2025-01-15T08:00:00.000Z' }));
+      saveEvalRun(makeRunResult({ timestamp: '2025-01-15T12:00:00.000Z' }));
+      saveEvalRun(makeRunResult({ timestamp: '2025-01-15T10:00:00.000Z' }));
 
       const runs = listEvalRuns();
-      expect(runs.map(r => r.runId)).toEqual(['run_zzz', 'run_mmm', 'run_aaa']);
+      const timestamps = runs.map(r => r.timestamp);
+      expect(timestamps).toEqual(['2025-01-15T12:00:00.000Z', '2025-01-15T10:00:00.000Z', '2025-01-15T08:00:00.000Z']);
     });
 
     it('ignores files that do not match the naming pattern', async () => {
-      saveEvalRun(makeRunResult({ runId: 'run_valid' }));
+      saveEvalRun(makeRunResult());
 
       // Write a file that doesn't match the pattern
-      const resultsDir = join(tmpDir, 'eval-results');
+      const resultsDir = join(tmpDir, '.cli', 'eval-results');
       const { writeFileSync } = await import('fs');
       writeFileSync(join(resultsDir, 'notes.txt'), 'not a run');
       writeFileSync(join(resultsDir, 'other.json'), '{}');
 
       const runs = listEvalRuns();
       expect(runs).toHaveLength(1);
-      expect(runs[0]!.runId).toBe('run_valid');
+    });
+  });
+
+  describe('getResultsPath', () => {
+    it('returns the eval-results directory path', () => {
+      const path = getResultsPath();
+      expect(path).toBe(join(tmpDir, '.cli', 'eval-results'));
     });
   });
 
diff --git a/src/cli/operations/eval/get-eval-run.ts b/src/cli/operations/eval/get-eval-run.ts
index 6f592887..ed2aa6b6 100644
--- a/src/cli/operations/eval/get-eval-run.ts
+++ b/src/cli/operations/eval/get-eval-run.ts
@@ -10,7 +10,7 @@ export interface GetEvalRunResult {
 
 export function handleGetEvalRun(options: GetEvalRunOptions): GetEvalRunResult {
   try {
-    const run = loadEvalRun(options.runId);
+    const run = loadEvalRun(options.filename);
     return { success: true, run };
   } catch (err) {
     return { success: false, error: getErrorMessage(err) };
diff --git a/src/cli/operations/eval/index.ts b/src/cli/operations/eval/index.ts
index f991a4d4..f3937236 100644
--- a/src/cli/operations/eval/index.ts
+++ b/src/cli/operations/eval/index.ts
@@ -2,17 +2,9 @@ export { handleRunEval } from './run-eval';
 export type { RunEvalResult } from './run-eval';
 export { handleListEvalRuns } from './list-eval-runs';
 export type { ListEvalRunsResult } from './list-eval-runs';
-export { handleGetEvalRun } from './get-eval-run';
-export type { GetEvalRunResult } from './get-eval-run';
-export { handlePauseResume } from './pause-resume';
-export type { PauseResumeResult } from './pause-resume';
+export { handlePauseResume, handleDeleteOnlineEval } from './pause-resume';
+export type { PauseResumeResult, DeleteResult } from './pause-resume';
 export { handleLogsEval } from './logs-eval';
 export type { LogsEvalResult } from './logs-eval';
-export type {
-  EvalRunResult,
-  RunEvalOptions,
-  ListEvalRunsOptions,
-  GetEvalRunOptions,
-  OnlineEvalActionOptions,
-} from './types';
+export type { EvalRunResult, RunEvalOptions, ListEvalRunsOptions, OnlineEvalActionOptions } from './types';
 export type { LogsEvalOptions } from './logs-eval';
diff --git a/src/cli/operations/eval/logs-eval.ts b/src/cli/operations/eval/logs-eval.ts
index a4187674..0e3af702 100644
--- a/src/cli/operations/eval/logs-eval.ts
+++ b/src/cli/operations/eval/logs-eval.ts
@@ -33,20 +33,18 @@ interface ResolvedLogGroup {
 }
 
 /**
- * Resolve the online eval config log group names for a given agent.
+ * Resolve the online eval config log group names.
  * Fetches the actual log group from the API when possible, falls back to convention.
  */
 async function resolveEvalLogGroups(
   context: DeployedProjectConfig,
-  agentName: string,
   targetName: string,
   region: string
 ): Promise<ResolvedLogGroup[]> {
   const { project, deployedState } = context;
   const targetResources = deployedState.targets[targetName]?.resources;
 
-  // Find online eval configs that monitor this agent
-  const matchingConfigs = (project.onlineEvalConfigs ?? []).filter(c => c.agents.includes(agentName));
+  const matchingConfigs = project.onlineEvalConfigs ?? [];
 
   const results: ResolvedLogGroup[] = [];
   for (const config of matchingConfigs) {
@@ -82,12 +80,12 @@ export async function handleLogsEval(options: LogsEvalOptions): Promise<LogsEval
 
   const { agent } = agentResult;
 
-  const resolvedLogGroups = await resolveEvalLogGroups(context, agent.agentName, agent.targetName, agent.region);
+  const resolvedLogGroups = await resolveEvalLogGroups(context, agent.targetName, agent.region);
 
   if (resolvedLogGroups.length === 0) {
     return {
       success: false,
-      error: `No deployed online eval configs found for agent '${agent.agentName}'. Add one with 'agentcore add online-eval' and deploy.`,
+      error: `No deployed online eval configs found. Add one with 'agentcore add online-eval' and deploy.`,
     };
   }
 
diff --git a/src/cli/operations/eval/pause-resume.ts b/src/cli/operations/eval/pause-resume.ts
index 7e3b280f..c1b11a44 100644
--- a/src/cli/operations/eval/pause-resume.ts
+++ b/src/cli/operations/eval/pause-resume.ts
@@ -1,5 +1,5 @@
 import type { OnlineEvalExecutionStatus } from '../../aws/agentcore-control';
-import { updateOnlineEvalExecutionStatus } from '../../aws/agentcore-control';
+import { deleteOnlineEvalConfig, updateOnlineEvalExecutionStatus } from '../../aws/agentcore-control';
 import { loadDeployedProjectConfig } from '../resolve-agent';
 import type { OnlineEvalActionOptions } from './types';
 
@@ -10,6 +10,13 @@ export interface PauseResumeResult {
   executionStatus?: string;
 }
 
+export interface DeleteResult {
+  success: boolean;
+  error?: string;
+  configId?: string;
+  status?: string;
+}
+
 async function resolveOnlineEvalConfig(
   configName: string
 ): Promise<{ success: true; configId: string; region: string } | { success: false; error: string }> {
@@ -43,11 +50,50 @@ async function resolveOnlineEvalConfig(
   };
 }
 
+/**
+ * Parse an online eval config ARN into its config ID and region (regionOverride, if given, wins).
+ * ARN format: arn:aws:bedrock-agentcore:<region>:<account>:online-evaluation-config/<configId>
+ */
+function parseOnlineEvalConfigArn(
+  arn: string,
+  regionOverride?: string
+): { success: true; configId: string; region: string } | { success: false; error: string } {
+  const parts = arn.split(':');
+  if (parts.length < 6 || !arn.startsWith('arn:')) {
+    return { success: false, error: `Invalid online eval config ARN: ${arn}` };
+  }
+
+  const region = regionOverride ?? parts[3];
+  if (!region) {
+    return { success: false, error: 'Could not determine region from ARN. Use --region to specify.' };
+  }
+
+  const resource = parts.slice(5).join(':');
+  const match = /online-evaluation-config\/(.+)$/.exec(resource);
+  if (!match) {
+    return { success: false, error: `Could not extract config ID from ARN: ${arn}` };
+  }
+
+  return { success: true, configId: match[1]!, region };
+}
+
+/**
+ * Resolve config ID and region from either a project config name or an ARN.
+ */
+async function resolveConfig(
+  options: OnlineEvalActionOptions
+): Promise<{ success: true; configId: string; region: string } | { success: false; error: string }> {
+  if (options.arn) {
+    return parseOnlineEvalConfigArn(options.arn, options.region);
+  }
+  return resolveOnlineEvalConfig(options.name);
+}
+
 export async function handlePauseResume(
   options: OnlineEvalActionOptions,
   action: 'pause' | 'resume'
 ): Promise<PauseResumeResult> {
-  const resolution = await resolveOnlineEvalConfig(options.name);
+  const resolution = await resolveConfig(options);
   if (!resolution.success) {
     return resolution;
   }
@@ -70,3 +116,25 @@ export async function handlePauseResume(
     return { success: false, error: (err as Error).message };
   }
 }
+
+export async function handleDeleteOnlineEval(options: OnlineEvalActionOptions): Promise<DeleteResult> {
+  const resolution = await resolveConfig(options);
+  if (!resolution.success) {
+    return resolution;
+  }
+
+  try {
+    const result = await deleteOnlineEvalConfig({
+      region: resolution.region,
+      onlineEvaluationConfigId: resolution.configId,
+    });
+
+    return {
+      success: true,
+      configId: result.configId,
+      status: result.status,
+    };
+  } catch (err) {
+    return { success: false, error: (err as Error).message };
+  }
+}
diff --git a/src/cli/operations/eval/run-eval.ts b/src/cli/operations/eval/run-eval.ts
index fc36efb7..bee6d9b5 100644
--- a/src/cli/operations/eval/run-eval.ts
+++ b/src/cli/operations/eval/run-eval.ts
@@ -4,7 +4,7 @@ import { getEvaluator } from '../../aws/agentcore-control';
 import { DEFAULT_ENDPOINT_NAME } from '../../constants';
 import type { DeployedProjectConfig } from '../resolve-agent';
 import { loadDeployedProjectConfig, resolveAgent } from '../resolve-agent';
-import { generateRunId, saveEvalRun } from './storage';
+import { generateFilename, saveEvalRun } from './storage';
 import type { EvalEvaluatorResult, EvalRunResult, EvalSessionScore, RunEvalOptions } from './types';
 import { CloudWatchLogsClient, GetQueryResultsCommand, StartQueryCommand } from '@aws-sdk/client-cloudwatch-logs';
 import type { ResultField } from '@aws-sdk/client-cloudwatch-logs';
@@ -491,7 +491,7 @@ async function fetchSessionSpans(opts: FetchSpansOptions): Promise<SessionSpans[
   // 3. Build session list — aws/spans docs are already scoped by runtimeId (step 1),
   //    and runtime log docs were filtered through isRelevantForEval (step 2).
   //    We keep all docs so the Evaluate API has full trace context for resolving
-  //    template variables like {actual_trajectory}.
+  //    template variables like {context} and {assistant_turn}.
   const sessions: SessionSpans[] = [];
   for (const [sessionId, docs] of sessionMap) {
     if (docs.length > 0) {
@@ -615,9 +615,9 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
   }
 
   // Build run result
+  const timestamp = new Date().toISOString();
   const run: EvalRunResult = {
-    runId: generateRunId(),
-    timestamp: new Date().toISOString(),
+    timestamp,
     agent: ctx.agentLabel,
     evaluators: ctx.evaluatorLabels,
     lookbackDays: options.days,
@@ -632,7 +632,7 @@ export async function handleRunEval(options: RunEvalOptions): Promise<RunEvalRes
     filePath = options.output;
   } else if (options.agentArn) {
     // ARN mode may not have a project directory — save to cwd
-    const fallbackPath = join(process.cwd(), `${run.runId}.json`);
+    const fallbackPath = join(process.cwd(), `${generateFilename(timestamp)}.json`);
     writeFileSync(fallbackPath, JSON.stringify(run, null, 2));
     filePath = fallbackPath;
   } else {
diff --git a/src/cli/operations/eval/storage.ts b/src/cli/operations/eval/storage.ts
index 5329e654..7c65e868 100644
--- a/src/cli/operations/eval/storage.ts
+++ b/src/cli/operations/eval/storage.ts
@@ -1,6 +1,5 @@
 import { findConfigRoot } from '../../../lib';
 import type { EvalRunResult } from './types';
-import { randomUUID } from 'crypto';
 import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from 'fs';
 import { join } from 'path';
 
@@ -11,28 +10,32 @@ function getResultsDir(): string {
   if (!configRoot) {
     throw new Error('No agentcore project found. Run `agentcore create` first.');
   }
-  return join(configRoot, EVAL_RESULTS_DIR);
+  return join(configRoot, '.cli', EVAL_RESULTS_DIR);
 }
 
-export function generateRunId(): string {
-  return `run_${randomUUID()}`;
+export function generateFilename(timestamp: string): string {
+  const d = new Date(timestamp);
+  const pad = (n: number) => String(n).padStart(2, '0');
+  return `eval_${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())}_${pad(d.getUTCHours())}-${pad(d.getUTCMinutes())}-${pad(d.getUTCSeconds())}`;
 }
 
 export function saveEvalRun(result: EvalRunResult): string {
   const dir = getResultsDir();
   mkdirSync(dir, { recursive: true });
 
-  const filePath = join(dir, `${result.runId}.json`);
+  const filename = generateFilename(result.timestamp);
+  const filePath = join(dir, `${filename}.json`);
   writeFileSync(filePath, JSON.stringify(result, null, 2));
   return filePath;
 }
 
-export function loadEvalRun(runId: string): EvalRunResult {
+export function loadEvalRun(filename: string): EvalRunResult {
   const dir = getResultsDir();
-  const filePath = join(dir, `${runId}.json`);
+  const jsonName = filename.endsWith('.json') ? filename : `${filename}.json`;
+  const filePath = join(dir, jsonName);
 
   if (!existsSync(filePath)) {
-    throw new Error(`Eval run "${runId}" not found at ${filePath}`);
+    throw new Error(`Eval run "${filename}" not found at ${filePath}`);
   }
 
   return JSON.parse(readFileSync(filePath, 'utf-8')) as EvalRunResult;
@@ -46,9 +49,13 @@ export function listEvalRuns(): EvalRunResult[] {
   }
 
   const files = readdirSync(dir)
-    .filter(f => f.startsWith('run_') && f.endsWith('.json'))
+    .filter(f => f.startsWith('eval_') && f.endsWith('.json'))
     .sort()
     .reverse();
 
   return files.map(f => JSON.parse(readFileSync(join(dir, f), 'utf-8')) as EvalRunResult);
 }
+
+export function getResultsPath(): string {
+  return getResultsDir();
+}
diff --git a/src/cli/operations/eval/types.ts b/src/cli/operations/eval/types.ts
index 1f4c3438..522f8e36 100644
--- a/src/cli/operations/eval/types.ts
+++ b/src/cli/operations/eval/types.ts
@@ -23,7 +23,6 @@ export interface EvalSessionScore {
 
 /** Full eval run result stored to disk */
 export interface EvalRunResult {
-  runId: string;
   timestamp: string;
   agent: string;
   evaluators: string[];
@@ -62,13 +61,17 @@ export interface ListEvalRunsOptions {
 
 /** Options for getting a single eval run */
 export interface GetEvalRunOptions {
-  runId: string;
+  filename: string;
   sessions?: boolean;
   json?: boolean;
 }
 
-/** Options for pause/resume online eval */
+/** Options for pause/resume/delete online eval */
 export interface OnlineEvalActionOptions {
   name: string;
+  /** Online eval config ARN (direct mode — bypasses project config) */
+  arn?: string;
+  /** AWS region; only used with --arn, where it overrides the ARN's region (required if the ARN has none) */
+  region?: string;
   json?: boolean;
 }
diff --git a/src/cli/primitives/EvaluatorPrimitive.ts b/src/cli/primitives/EvaluatorPrimitive.ts
index bf0cb7d8..a8aced79 100644
--- a/src/cli/primitives/EvaluatorPrimitive.ts
+++ b/src/cli/primitives/EvaluatorPrimitive.ts
@@ -114,85 +114,125 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
 
   registerCommands(addCmd: Command, removeCmd: Command): void {
     addCmd
-      .command('eval')
+      .command(this.kind)
       .description('Add a custom evaluator to the project')
-      .option('--name <name>', 'Evaluator name [non-interactive]')
-      .option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL [non-interactive]')
-      .option('--config <path>', 'Path to evaluator config JSON file [non-interactive]')
-      .option('--json', 'Output as JSON [non-interactive]')
-      .action(async (cliOptions: { name?: string; level?: string; config?: string; json?: boolean }) => {
-        try {
-          if (!findConfigRoot()) {
-            console.error('No agentcore project found. Run `agentcore create` first.');
-            process.exit(1);
-          }
+      .option('--name <name>', 'Evaluator name')
+      .option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL')
+      .option('--model <model>', 'Bedrock model ID for LLM-as-a-Judge')
+      .option('--instructions <text>', 'Evaluation prompt instructions')
+      .option('--config <path>', 'Path to evaluator config JSON file (overrides --model, --instructions)')
+      .option('--json', 'Output as JSON')
+      .action(
+        async (cliOptions: {
+          name?: string;
+          level?: string;
+          model?: string;
+          instructions?: string;
+          config?: string;
+          json?: boolean;
+        }) => {
+          try {
+            if (!findConfigRoot()) {
+              console.error('No agentcore project found. Run `agentcore create` first.');
+              process.exit(1);
+            }
 
-          if (cliOptions.name || cliOptions.json) {
-            if (!cliOptions.name || !cliOptions.level || !cliOptions.config) {
-              const error = '--name, --level, and --config are all required in non-interactive mode';
-              if (cliOptions.json) {
-                console.log(JSON.stringify({ success: false, error }));
+            if (cliOptions.name || cliOptions.json) {
+              if (!cliOptions.name || !cliOptions.level) {
+                const error = '--name and --level are required in non-interactive mode';
+                if (cliOptions.json) {
+                  console.log(JSON.stringify({ success: false, error }));
+                } else {
+                  console.error(error);
+                }
+                process.exit(1);
+              }
+
+              if (!cliOptions.config && !cliOptions.model) {
+                const error = 'Either --config or --model is required';
+                if (cliOptions.json) {
+                  console.log(JSON.stringify({ success: false, error }));
+                } else {
+                  console.error(error);
+                }
+                process.exit(1);
+              }
+
+              const levelResult = EvaluationLevelSchema.safeParse(cliOptions.level);
+              if (!levelResult.success) {
+                const error = `Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`;
+                if (cliOptions.json) {
+                  console.log(JSON.stringify({ success: false, error }));
+                } else {
+                  console.error(error);
+                }
+                process.exit(1);
+              }
+
+              let configJson: EvaluatorConfig;
+              if (cliOptions.config) {
+                const { readFileSync } = await import('fs');
+                configJson = JSON.parse(readFileSync(cliOptions.config, 'utf-8')) as EvaluatorConfig;
               } else {
-                console.error(error);
+                configJson = {
+                  llmAsAJudge: {
+                    model: cliOptions.model!,
+                    instructions: cliOptions.instructions ?? `Evaluate the quality. Context: {context}`,
+                    ratingScale: {
+                      numerical: [
+                        { value: 1, label: 'Poor', definition: 'Fails to meet expectations' },
+                        { value: 2, label: 'Fair', definition: 'Partially meets expectations' },
+                        { value: 3, label: 'Good', definition: 'Meets expectations' },
+                        { value: 4, label: 'Very Good', definition: 'Exceeds expectations' },
+                        { value: 5, label: 'Excellent', definition: 'Far exceeds expectations' },
+                      ],
+                    },
+                  },
+                };
               }
-              process.exit(1);
-            }
 
-            const levelResult = EvaluationLevelSchema.safeParse(cliOptions.level);
-            if (!levelResult.success) {
-              const error = `Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`;
+              const result = await this.add({
+                name: cliOptions.name,
+                level: levelResult.data,
+                config: configJson,
+              });
+
               if (cliOptions.json) {
-                console.log(JSON.stringify({ success: false, error }));
+                console.log(JSON.stringify(result));
+              } else if (result.success) {
+                console.log(`Added evaluator '${result.evaluatorName}'`);
               } else {
-                console.error(error);
+                console.error(result.error);
               }
-              process.exit(1);
+              process.exit(result.success ? 0 : 1);
+            } else {
+              // TUI fallback
+              const [{ render }, { default: React }, { AddFlow }] = await Promise.all([
+                import('ink'),
+                import('react'),
+                import('../tui/screens/add/AddFlow'),
+              ]);
+              const { clear, unmount } = render(
+                React.createElement(AddFlow, {
+                  isInteractive: false,
+                  onExit: () => {
+                    clear();
+                    unmount();
+                    process.exit(0);
+                  },
+                })
+              );
             }
-
-            const { readFileSync } = await import('fs');
-            const configJson = JSON.parse(readFileSync(cliOptions.config, 'utf-8')) as EvaluatorConfig;
-
-            const result = await this.add({
-              name: cliOptions.name,
-              level: levelResult.data,
-              config: configJson,
-            });
-
+          } catch (error) {
             if (cliOptions.json) {
-              console.log(JSON.stringify(result));
-            } else if (result.success) {
-              console.log(`Added evaluator '${result.evaluatorName}'`);
+              console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
             } else {
-              console.error(result.error);
+              console.error(getErrorMessage(error));
             }
-            process.exit(result.success ? 0 : 1);
-          } else {
-            // TUI fallback
-            const [{ render }, { default: React }, { AddFlow }] = await Promise.all([
-              import('ink'),
-              import('react'),
-              import('../tui/screens/add/AddFlow'),
-            ]);
-            const { clear, unmount } = render(
-              React.createElement(AddFlow, {
-                isInteractive: false,
-                onExit: () => {
-                  clear();
-                  unmount();
-                  process.exit(0);
-                },
-              })
-            );
-          }
-        } catch (error) {
-          if (cliOptions.json) {
-            console.log(JSON.stringify({ success: false, error: getErrorMessage(error) }));
-          } else {
-            console.error(getErrorMessage(error));
+            process.exit(1);
           }
-          process.exit(1);
         }
-      });
+      );
 
     this.registerRemoveSubcommand(removeCmd);
   }
diff --git a/src/cli/primitives/OnlineEvalConfigPrimitive.ts b/src/cli/primitives/OnlineEvalConfigPrimitive.ts
index c4378d03..e66c0c85 100644
--- a/src/cli/primitives/OnlineEvalConfigPrimitive.ts
+++ b/src/cli/primitives/OnlineEvalConfigPrimitive.ts
@@ -9,9 +9,10 @@ import type { Command } from '@commander-js/extra-typings';
 
 export interface AddOnlineEvalConfigOptions {
   name: string;
-  agents: string[];
+  agent: string;
   evaluators: string[];
   samplingRate: number;
+  enableOnCreate?: boolean;
 }
 
 export type RemovableOnlineEvalConfig = RemovableResource;
@@ -62,7 +63,6 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
 
     const summary: string[] = [
       `Removing online eval config: ${configName}`,
-      `Monitors agents: ${config.agents.join(', ')}`,
       `Uses evaluators: ${config.evaluators.join(', ')}`,
     ];
     const schemaChanges: SchemaChange[] = [];
@@ -104,16 +104,20 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
       .command('online-eval')
       .description('Add an online eval config to the project')
       .option('--name <name>', 'Config name [non-interactive]')
-      .option('-a, --agent <agents...>', 'Agent name(s) to monitor [non-interactive]')
-      .option('-e, --evaluator <evaluators...>', 'Evaluator name(s) or Builtin.* IDs [non-interactive]')
+      .option('-a, --agent <name>', 'Agent to monitor [non-interactive]')
+      .option('-e, --evaluator <evaluators...>', 'Evaluator name(s), Builtin.* IDs, or ARNs [non-interactive]')
+      .option('--evaluator-arn <arns...>', 'Evaluator ARN(s) [non-interactive]')
       .option('--sampling-rate <rate>', 'Sampling percentage (0.01-100) [non-interactive]')
+      .option('--enable-on-create', 'Enable evaluation immediately after deploy [non-interactive]')
       .option('--json', 'Output as JSON [non-interactive]')
       .action(
         async (cliOptions: {
           name?: string;
-          agent?: string[];
+          agent?: string;
           evaluator?: string[];
+          evaluatorArn?: string[];
           samplingRate?: string;
+          enableOnCreate?: boolean;
           json?: boolean;
         }) => {
           try {
@@ -123,9 +127,12 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
             }
 
             if (cliOptions.name || cliOptions.json) {
-              if (!cliOptions.name || !cliOptions.agent || !cliOptions.evaluator || !cliOptions.samplingRate) {
+              // Merge --evaluator and --evaluator-arn into a single list
+              const allEvaluators = [...(cliOptions.evaluator ?? []), ...(cliOptions.evaluatorArn ?? [])];
+
+              if (!cliOptions.name || !cliOptions.agent || allEvaluators.length === 0 || !cliOptions.samplingRate) {
                 const error =
-                  '--name, --agent, --evaluator, and --sampling-rate are all required in non-interactive mode';
+                  '--name, --agent, --evaluator (and/or --evaluator-arn), and --sampling-rate are all required in non-interactive mode';
                 if (cliOptions.json) {
                   console.log(JSON.stringify({ success: false, error }));
                 } else {
@@ -147,9 +154,10 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
 
               const result = await this.add({
                 name: cliOptions.name,
-                agents: cliOptions.agent,
-                evaluators: cliOptions.evaluator,
+                agent: cliOptions.agent,
+                evaluators: allEvaluators,
                 samplingRate,
+                enableOnCreate: cliOptions.enableOnCreate,
               });
 
               if (cliOptions.json) {
@@ -204,9 +212,10 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
     const config: OnlineEvalConfig = {
       type: 'OnlineEvaluationConfig',
       name: options.name,
-      agents: options.agents,
+      agent: options.agent,
       evaluators: options.evaluators,
       samplingRate: options.samplingRate,
+      ...(options.enableOnCreate !== undefined && { enableOnCreate: options.enableOnCreate }),
     };
 
     project.onlineEvalConfigs.push(config);
diff --git a/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts b/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts
index ca6c6bc6..badcce2a 100644
--- a/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts
+++ b/src/cli/primitives/__tests__/OnlineEvalConfigPrimitive.test.ts
@@ -13,7 +13,7 @@ vi.mock('../../../lib/index.js', () => ({
 }));
 
 function makeProject(
-  onlineEvalConfigs: { name: string; agents: string[]; evaluators: string[] }[] = [],
+  onlineEvalConfigs: { name: string; evaluators: string[] }[] = [],
   evaluators: { name: string }[] = []
 ) {
   return {
@@ -46,7 +46,7 @@ describe('OnlineEvalConfigPrimitive', () => {
 
       const result = await primitive.add({
         name: 'MyConfig',
-        agents: ['agent1'],
+        agent: 'MyAgent',
         evaluators: ['Builtin.GoalSuccessRate'],
         samplingRate: 10,
       });
@@ -59,34 +59,68 @@ describe('OnlineEvalConfigPrimitive', () => {
       const config = writtenSpec.onlineEvalConfigs[0];
       expect(config.type).toBe('OnlineEvaluationConfig');
       expect(config.name).toBe('MyConfig');
-      expect(config.agents).toEqual(['agent1']);
       expect(config.evaluators).toEqual(['Builtin.GoalSuccessRate']);
       expect(config.samplingRate).toBe(10);
     });
 
-    it('supports multiple agents and evaluators', async () => {
+    it('stores enableOnCreate when provided', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      const result = await primitive.add({
+        name: 'EnabledConfig',
+        agent: 'MyAgent',
+        evaluators: ['Builtin.GoalSuccessRate'],
+        samplingRate: 10,
+        enableOnCreate: true,
+      });
+
+      expect(result.success).toBe(true);
+      const config = mockWriteProjectSpec.mock.calls[0]![0].onlineEvalConfigs[0];
+      expect(config.enableOnCreate).toBe(true);
+    });
+
+    it('omits enableOnCreate when not provided', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject());
+      mockWriteProjectSpec.mockResolvedValue(undefined);
+
+      await primitive.add({
+        name: 'NoEnableConfig',
+        agent: 'MyAgent',
+        evaluators: ['Builtin.GoalSuccessRate'],
+        samplingRate: 10,
+      });
+
+      const config = mockWriteProjectSpec.mock.calls[0]![0].onlineEvalConfigs[0];
+      expect(config.enableOnCreate).toBeUndefined();
+    });
+
+    it('supports multiple evaluators including ARNs', async () => {
       mockReadProjectSpec.mockResolvedValue(makeProject());
       mockWriteProjectSpec.mockResolvedValue(undefined);
 
       const result = await primitive.add({
         name: 'MultiConfig',
-        agents: ['agent1', 'agent2'],
-        evaluators: ['Builtin.GoalSuccessRate', 'CustomEval'],
+        agent: 'MyAgent',
+        evaluators: ['Builtin.GoalSuccessRate', 'CustomEval', 'arn:aws:bedrock:us-east-1:123:evaluator/ext'],
         samplingRate: 50,
       });
 
       expect(result.success).toBe(true);
       const config = mockWriteProjectSpec.mock.calls[0]![0].onlineEvalConfigs[0];
-      expect(config.agents).toEqual(['agent1', 'agent2']);
-      expect(config.evaluators).toEqual(['Builtin.GoalSuccessRate', 'CustomEval']);
+      expect(config.evaluators).toEqual([
+        'Builtin.GoalSuccessRate',
+        'CustomEval',
+        'arn:aws:bedrock:us-east-1:123:evaluator/ext',
+      ]);
     });
 
     it('returns error when config name already exists', async () => {
-      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'Existing', agents: ['a'], evaluators: ['e'] }]));
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'Existing', evaluators: ['e'] }]));
 
       const result = await primitive.add({
         name: 'Existing',
-        agents: ['a'],
+        agent: 'MyAgent',
         evaluators: ['e'],
         samplingRate: 10,
       });
@@ -101,7 +135,7 @@ describe('OnlineEvalConfigPrimitive', () => {
 
       const result = await primitive.add({
         name: 'New',
-        agents: ['a'],
+        agent: 'MyAgent',
         evaluators: ['e'],
         samplingRate: 10,
       });
@@ -114,8 +148,8 @@ describe('OnlineEvalConfigPrimitive', () => {
     it('removes config from project spec', async () => {
       mockReadProjectSpec.mockResolvedValue(
         makeProject([
-          { name: 'ConfigA', agents: ['a'], evaluators: ['e'] },
-          { name: 'ConfigB', agents: ['b'], evaluators: ['f'] },
+          { name: 'ConfigA', evaluators: ['e'] },
+          { name: 'ConfigB', evaluators: ['f'] },
         ])
       );
       mockWriteProjectSpec.mockResolvedValue(undefined);
@@ -153,15 +187,12 @@ describe('OnlineEvalConfigPrimitive', () => {
   });
 
   describe('previewRemove', () => {
-    it('returns preview with summary including agents and evaluators', async () => {
-      mockReadProjectSpec.mockResolvedValue(
-        makeProject([{ name: 'Config1', agents: ['agentA', 'agentB'], evaluators: ['Builtin.X', 'CustomY'] }])
-      );
+    it('returns preview with summary including evaluators', async () => {
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'Config1', evaluators: ['Builtin.X', 'CustomY'] }]));
 
       const preview = await primitive.previewRemove('Config1');
 
       expect(preview.summary).toContain('Removing online eval config: Config1');
-      expect(preview.summary).toContain('Monitors agents: agentA, agentB');
       expect(preview.summary).toContain('Uses evaluators: Builtin.X, CustomY');
       expect(preview.schemaChanges).toHaveLength(1);
       expect((preview.schemaChanges[0]!.after as { onlineEvalConfigs: unknown[] }).onlineEvalConfigs).toHaveLength(0);
@@ -178,8 +209,8 @@ describe('OnlineEvalConfigPrimitive', () => {
     it('returns config names', async () => {
       mockReadProjectSpec.mockResolvedValue(
         makeProject([
-          { name: 'C1', agents: ['a'], evaluators: ['e'] },
-          { name: 'C2', agents: ['b'], evaluators: ['f'] },
+          { name: 'C1', evaluators: ['e'] },
+          { name: 'C2', evaluators: ['f'] },
         ])
       );
 
@@ -197,7 +228,7 @@ describe('OnlineEvalConfigPrimitive', () => {
 
   describe('getAllNames', () => {
     it('returns config names as strings', async () => {
-      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'X', agents: ['a'], evaluators: ['e'] }]));
+      mockReadProjectSpec.mockResolvedValue(makeProject([{ name: 'X', evaluators: ['e'] }]));
 
       expect(await primitive.getAllNames()).toEqual(['X']);
     });
diff --git a/src/cli/tui/App.tsx b/src/cli/tui/App.tsx
index 5d2873c5..e447db72 100644
--- a/src/cli/tui/App.tsx
+++ b/src/cli/tui/App.tsx
@@ -7,11 +7,13 @@ import { AddFlow } from './screens/add/AddFlow';
 import { CreateScreen } from './screens/create';
 import { DeployScreen } from './screens/deploy/DeployScreen';
 import { DevScreen } from './screens/dev/DevScreen';
-import { EvalScreen } from './screens/eval';
+import { EvalHubScreen, EvalScreen } from './screens/eval';
 import { HelpScreen, HomeScreen } from './screens/home';
 import { InvokeScreen } from './screens/invoke';
+import { OnlineEvalDashboard } from './screens/online-eval';
 import { PackageScreen } from './screens/package';
 import { RemoveFlow } from './screens/remove';
+import { RunEvalFlow, RunScreen } from './screens/run-eval';
 import { StatusScreen } from './screens/status/StatusScreen';
 import { UpdateScreen } from './screens/update';
 import { ValidateScreen } from './screens/validate';
@@ -33,7 +35,11 @@ type Route =
   | { name: 'add' }
   | { name: 'status' }
   | { name: 'remove' }
+  | { name: 'run' }
+  | { name: 'run-eval'; from?: 'run' | 'eval' }
   | { name: 'eval' }
+  | { name: 'eval-runs' }
+  | { name: 'online-evals' }
   | { name: 'validate' }
   | { name: 'package' }
   | { name: 'update' };
@@ -86,6 +92,8 @@ function AppContent() {
       setRoute({ name: 'add' });
     } else if (id === 'remove') {
       setRoute({ name: 'remove' });
+    } else if (id === 'run') {
+      setRoute({ name: 'run' });
     } else if (id === 'eval') {
       setRoute({ name: 'eval' });
     } else if (id === 'validate') {
@@ -183,8 +191,44 @@ function AppContent() {
     );
   }
 
+  if (route.name === 'run') {
+    return (
+      <RunScreen
+        onRunEval={() => setRoute({ name: 'run-eval', from: 'run' })}
+        onExit={() => setRoute({ name: 'help' })}
+      />
+    );
+  }
+
   if (route.name === 'eval') {
-    return <EvalScreen isInteractive={true} onExit={() => setRoute({ name: 'help' })} />;
+    return (
+      <EvalHubScreen
+        onSelect={view => {
+          if (view === 'run-eval') setRoute({ name: 'run-eval', from: 'eval' });
+          if (view === 'runs') setRoute({ name: 'eval-runs' });
+          if (view === 'online-dashboard') setRoute({ name: 'online-evals' });
+        }}
+        onExit={() => setRoute({ name: 'help' })}
+      />
+    );
+  }
+
+  if (route.name === 'run-eval') {
+    const backRoute = route.from ?? 'eval';
+    return (
+      <RunEvalFlow
+        onExit={() => setRoute({ name: backRoute } as Route)}
+        onViewRuns={() => setRoute({ name: 'eval-runs' })}
+      />
+    );
+  }
+
+  if (route.name === 'eval-runs') {
+    return <EvalScreen isInteractive={true} onExit={() => setRoute({ name: 'eval' })} />;
+  }
+
+  if (route.name === 'online-evals') {
+    return <OnlineEvalDashboard isInteractive={true} onExit={() => setRoute({ name: 'eval' })} />;
   }
 
   if (route.name === 'validate') {
diff --git a/src/cli/tui/components/ResourceGraph.tsx b/src/cli/tui/components/ResourceGraph.tsx
index d3445350..11a5bce4 100644
--- a/src/cli/tui/components/ResourceGraph.tsx
+++ b/src/cli/tui/components/ResourceGraph.tsx
@@ -246,7 +246,7 @@ export function ResourceGraph({ project, mcp, agentName, resourceStatuses }: Res
           <SectionHeader>Online Eval Configs</SectionHeader>
           {onlineEvalConfigs.map(config => {
             const rsEntry = statusMap.get(`online-eval:${config.name}`);
-            const defaultDetail = `${config.agents.length} agents, ${config.evaluators.length} evaluators — ${config.samplingRate}% sampling`;
+            const defaultDetail = `${config.evaluators.length} evaluator${config.evaluators.length !== 1 ? 's' : ''} — ${config.samplingRate}% sampling`;
             return (
               <ResourceRow
                 key={config.name}
diff --git a/src/cli/tui/copy.ts b/src/cli/tui/copy.ts
index 8bfd6c1a..8e86a0b6 100644
--- a/src/cli/tui/copy.ts
+++ b/src/cli/tui/copy.ts
@@ -40,9 +40,10 @@ export const COMMAND_DESCRIPTIONS = {
   remove: 'Remove AgentCore resources and project',
   status: 'Retrieve details of deployed AgentCore resources.',
   traces: 'View and download agent traces.',
-  eval: 'View eval run results.',
+  eval: 'Manage evaluations and view history.',
   pause: 'Pause a running resource.',
   resume: 'Resume a paused resource.',
+  stop: 'Stop and delete a running resource.',
   run: 'Run operations (eval, etc.).',
   update: 'Check for and install CLI updates',
   validate: 'Validate agentcore/ config files.',
diff --git a/src/cli/tui/hooks/useCreateOnlineEval.ts b/src/cli/tui/hooks/useCreateOnlineEval.ts
index e8ee3e9a..2d019055 100644
--- a/src/cli/tui/hooks/useCreateOnlineEval.ts
+++ b/src/cli/tui/hooks/useCreateOnlineEval.ts
@@ -3,9 +3,10 @@ import { useCallback, useEffect, useState } from 'react';
 
 interface CreateOnlineEvalConfig {
   name: string;
-  agents: string[];
+  agent: string;
   evaluators: string[];
   samplingRate: number;
+  enableOnCreate: boolean;
 }
 
 export function useCreateOnlineEval() {
@@ -18,9 +19,10 @@ export function useCreateOnlineEval() {
     try {
       const addResult = await onlineEvalConfigPrimitive.add({
         name: config.name,
-        agents: config.agents,
+        agent: config.agent,
         evaluators: config.evaluators,
         samplingRate: config.samplingRate,
+        enableOnCreate: config.enableOnCreate,
       });
       if (!addResult.success) {
         throw new Error(addResult.error ?? 'Failed to create online eval config');
diff --git a/src/cli/tui/screens/eval/EvalHubScreen.tsx b/src/cli/tui/screens/eval/EvalHubScreen.tsx
new file mode 100644
index 00000000..67056413
--- /dev/null
+++ b/src/cli/tui/screens/eval/EvalHubScreen.tsx
@@ -0,0 +1,44 @@
+import { Screen, WizardSelect } from '../../components';
+import type { SelectableItem } from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation } from '../../hooks';
+import React, { useMemo } from 'react';
+
+type EvalHubView = 'run-eval' | 'runs' | 'online-dashboard';
+
+interface EvalHubScreenProps {
+  onSelect: (view: EvalHubView) => void;
+  onExit: () => void;
+}
+
+export function EvalHubScreen({ onSelect, onExit }: EvalHubScreenProps) {
+  const items: SelectableItem[] = useMemo(
+    () => [
+      {
+        id: 'run-eval',
+        title: 'Run On-demand Evaluation',
+        description: 'Evaluate agent traces with selected evaluators',
+      },
+      { id: 'runs', title: 'Eval Runs', description: 'View past eval run results and scores' },
+      {
+        id: 'online-dashboard',
+        title: 'Online Eval Dashboard',
+        description: 'View and manage deployed online eval configs',
+      },
+    ],
+    []
+  );
+  // Selecting an entry reports its id to onSelect; the ids above are exactly the EvalHubView members.
+  const nav = useListNavigation({
+    items,
+    onSelect: item => onSelect(item.id as EvalHubView),
+    onExit,
+    isActive: true,
+  });
+  // exitEnabled is false on Screen; onExit is additionally handed to useListNavigation above.
+  return (
+    <Screen title="Evaluations" onExit={onExit} helpText={HELP_TEXT.NAVIGATE_SELECT} exitEnabled={false}>
+      <WizardSelect title="Choose a view" items={items} selectedIndex={nav.selectedIndex} />
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/eval/EvalScreen.tsx b/src/cli/tui/screens/eval/EvalScreen.tsx
index f178d537..f5707738 100644
--- a/src/cli/tui/screens/eval/EvalScreen.tsx
+++ b/src/cli/tui/screens/eval/EvalScreen.tsx
@@ -1,88 +1,449 @@
 import { handleListEvalRuns } from '../../../operations/eval';
-import type { EvalRunResult } from '../../../operations/eval/types';
-import { Screen } from '../../components';
+import { getResultsPath } from '../../../operations/eval/storage';
+import type { EvalEvaluatorResult, EvalRunResult } from '../../../operations/eval/types';
+import { Panel, Screen } from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation } from '../../hooks';
 import { STATUS_COLORS } from '../../theme';
-import { Box, Text } from 'ink';
-import React, { useEffect, useState } from 'react';
+import { Box, Text, useInput, useStdout } from 'ink';
+import React, { useEffect, useMemo, useState } from 'react';
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Helpers
+// ─────────────────────────────────────────────────────────────────────────────
+
+const MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
+
+/** Formats a timestamp as e.g. "Mar 8 1:05 PM" in local time; an unparseable timestamp is returned unchanged. */
+function formatShortDate(timestamp: string): string {
+  const d = new Date(timestamp);
+  if (Number.isNaN(d.getTime())) return timestamp; // avoid rendering "undefined NaN NaN:NaN AM"
+  const h = d.getHours();
+  const minutes = d.getMinutes().toString().padStart(2, '0');
+  const meridiem = h >= 12 ? 'PM' : 'AM';
+  const h12 = h % 12 || 12; // hours 0 and 12 both display as 12
+  return `${MONTHS[d.getMonth()]} ${d.getDate()} ${h12}:${minutes} ${meridiem}`;
+}
+
+function formatFullDate(timestamp: string): string {
+  const when = new Date(timestamp); // rendered with the runtime's default locale: date, space, hour:minute time
+  return `${when.toLocaleDateString()} ${when.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}`;
+}
+
+function formatScore(score: number): string {
+  return score.toFixed(2); // fixed two decimal places, e.g. 0.5 -> "0.50"
+}
+
+/** Color for a score: 'green' at 0.8 and above, 'yellow' at 0.5 and above, otherwise 'red'. */
+function scoreColor(score: number): string {
+  const tier = score >= 0.8 ? 'green' : score >= 0.5 ? 'yellow' : 'red';
+  return tier;
+}
+
+/** Returns the evaluator name without a leading "Builtin." prefix, for display. */
+function shortEvalName(name: string): string {
+  return name.startsWith('Builtin.') ? name.slice('Builtin.'.length) : name;
+}
+
+// Chrome: title(1) + padding(2) + panel border(2) + help text(2) + padding(2)
+const CHROME_LINES = 9;
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Windowing hook — shared by agent list and runs list
+// ─────────────────────────────────────────────────────────────────────────────
+
+/** Picks the slice of `items` to render so the selected row stays visible within `availableHeight` lines. */
+function useWindowedList<T>(items: T[], selectedIndex: number, availableHeight: number, linesPerItem: number) {
+  return useMemo(() => {
+    const total = items.length;
+    // Rows that fit once `reserved` indicator lines are set aside; at least one row is always shown.
+    const capacity = (reserved: number) => Math.max(1, Math.floor((availableHeight - reserved) / linesPerItem));
+    // Window start that makes the selected row the last visible row once it no longer fits from index 0.
+    const startFor = (size: number) => Math.max(0, selectedIndex - size + 1);
+
+    // Start with no lines reserved for the "more above" / "more below" indicators, then reserve one
+    // more line for as long as the current layout shows more indicators than it has reserved.
+    // Sizing the window only from the unshifted layout could reveal a second indicator after the
+    // shift and overflow `availableHeight` by one line.
+    let maxItems = capacity(0);
+    let start = startFor(maxItems);
+    for (let reserved = 1; reserved <= 2; reserved++) {
+      const indicators = (start > 0 ? 1 : 0) + (start + maxItems < total ? 1 : 0);
+      if (indicators < reserved) break;
+      maxItems = capacity(reserved);
+      start = startFor(maxItems);
+    }
+
+    return {
+      visible: items.slice(start, start + maxItems),
+      startIdx: start,
+      showUp: start > 0,
+      showDown: start + maxItems < total,
+      countAbove: start,
+      countBelow: Math.max(0, total - start - maxItems),
+    };
+  }, [items, selectedIndex, availableHeight, linesPerItem]);
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Agent picker view
+// ─────────────────────────────────────────────────────────────────────────────
+
+interface AgentGroup {
+  agent: string;
+  runCount: number;
+  lastRun: string;
+}
+
+function AgentPickerView({
+  groups,
+  onSelect,
+  onExit,
+  availableHeight,
+}: {
+  groups: AgentGroup[];
+  onSelect: (agent: string) => void;
+  onExit: () => void;
+  availableHeight: number;
+}) {
+  const nav = useListNavigation({
+    items: groups,
+    onSelect: item => onSelect(item.agent),
+    onExit,
+    isActive: true,
+  });
+  // Window the groups around the current selection; each group renders as a single line (linesPerItem = 1).
+  const { visible, showUp, showDown, countAbove, countBelow } = useWindowedList(
+    groups,
+    nav.selectedIndex,
+    availableHeight,
+    1
+  );
+  // countAbove is the window's start index, so adding vIdx below recovers each row's index within groups.
+  return (
+    <Panel fullWidth>
+      <Box flexDirection="column">
+        <Text bold>Select an agent</Text>
+        <Text dimColor>
+          {groups.length} agent{groups.length !== 1 ? 's' : ''} with eval runs
+        </Text>
+        <Box marginTop={1} flexDirection="column">
+          {showUp && <Text dimColor> ↑ {countAbove} more</Text>}
+          {visible.map((g, vIdx) => {
+            const idx = (showUp ? countAbove : 0) + vIdx;
+            const selected = idx === nav.selectedIndex;
+            return (
+              <Text key={g.agent}>
+                <Text color={selected ? 'cyan' : undefined}>{selected ? '❯' : ' '} </Text>
+                <Text color={selected ? 'cyan' : undefined} bold={selected}>
+                  {g.agent}
+                </Text>
+                <Text dimColor>
+                  {'  '}
+                  {g.runCount} run{g.runCount !== 1 ? 's' : ''}
+                  {'  '}last: {formatShortDate(g.lastRun)}
+                </Text>
+              </Text>
+            );
+          })}
+          {showDown && <Text dimColor> ↓ {countBelow} more</Text>}
+        </Box>
+      </Box>
+    </Panel>
+  );
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Runs list view (compact single-line per run)
+// ─────────────────────────────────────────────────────────────────────────────
+
+function RunsListView({
+  agentName,
+  runs,
+  onSelect,
+  onBack,
+  availableHeight,
+}: {
+  agentName: string;
+  runs: EvalRunResult[];
+  onSelect: (run: EvalRunResult) => void;
+  onBack: () => void;
+  availableHeight: number;
+}) {
+  const nav = useListNavigation({
+    items: runs,
+    onSelect: item => onSelect(item),
+    onExit: onBack,
+    isActive: true,
+  });
+
+  // Reserve 2 lines for the header (agent name + separator); the list height is floored at 4.
+  const listHeight = Math.max(4, availableHeight - 2);
+  const { visible, showUp, showDown, countAbove, countBelow } = useWindowedList(runs, nav.selectedIndex, listHeight, 1);
+  // Layout: header with agent name and run count, a 60-character rule, then the windowed rows.
+  return (
+    <Panel fullWidth>
+      <Box flexDirection="column">
+        <Text>
+          Eval Runs —{' '}
+          <Text bold color="cyan">
+            {agentName}
+          </Text>
+          <Text dimColor>
+            {' '}
+            {runs.length} run{runs.length !== 1 ? 's' : ''}
+          </Text>
+        </Text>
+        <Text dimColor>{'─'.repeat(60)}</Text>
+        {showUp && <Text dimColor> ↑ {countAbove} more</Text>}
+        {visible.map((run, vIdx) => {
+          const idx = (showUp ? countAbove : 0) + vIdx;
+          const selected = idx === nav.selectedIndex;
+          const scores = run.results.map(r => ({ name: shortEvalName(r.evaluator), score: r.aggregateScore }));
+          // One end-truncated line per run: short date, session count, then "evaluator score" pairs.
+          return (
+            <Text key={run.timestamp} wrap="truncate-end">
+              <Text color={selected ? 'cyan' : undefined}>{selected ? '❯' : ' '} </Text>
+              <Text dimColor>{formatShortDate(run.timestamp).padEnd(16)}</Text>
+              <Text dimColor>
+                {String(run.sessionCount).padStart(3)} session{run.sessionCount !== 1 ? 's' : ' '}{' '}
+              </Text>
+              {scores.map((s, i) => (
+                <Text key={i}>
+                  {i > 0 && <Text dimColor>, </Text>}
+                  <Text>{s.name} </Text>
+                  <Text color={scoreColor(s.score)}>{formatScore(s.score)}</Text>
+                </Text>
+              ))}
+            </Text>
+          );
+        })}
+        {showDown && <Text dimColor> ↓ {countBelow} more</Text>}
+      </Box>
+    </Panel>
+  );
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Run detail view
+// ─────────────────────────────────────────────────────────────────────────────
+
+function EvaluatorDetail({ result }: { result: EvalEvaluatorResult }) {
+  const errCount = result.sessionScores.filter(s => s.errorMessage).length; // sessions that reported an error
+  return (
+    <Box flexDirection="column" marginLeft={2}>
+      <Text>
+        <Text bold>{shortEvalName(result.evaluator)}</Text>
+        {'  '}
+        <Text color={scoreColor(result.aggregateScore)}>Score: {formatScore(result.aggregateScore)}</Text>
+        {'  '}
+        <Text dimColor>
+          ({result.sessionScores.length} session{result.sessionScores.length !== 1 ? 's' : ''}
+          {errCount > 0 ? `, ${errCount} error${errCount !== 1 ? 's' : ''}` : ''})
+        </Text>
+      </Text>
+      {result.tokenUsage && (
+        <Text dimColor>
+          {'  '}Tokens: {result.tokenUsage.inputTokens.toLocaleString()} in /{' '}
+          {result.tokenUsage.outputTokens.toLocaleString()} out
+        </Text>
+      )}
+      {result.sessionScores.map((ss, i) => (
+        <Text key={i} dimColor>
+          {'  '}
+          {ss.sessionId.slice(0, 16)}…{' '}
+          {ss.errorMessage ? (
+            <Text color="red">ERROR: {ss.errorMessage.slice(0, 60)}</Text>
+          ) : (
+            <>
+              <Text color={scoreColor(ss.value)}>{formatScore(ss.value)}</Text>
+              {ss.label && <Text> ({ss.label})</Text>}
+            </>
+          )}
+        </Text>
+      ))}
+    </Box>
+  );
+}
+
+function RunDetailView({ run, onBack, maxHeight }: { run: EvalRunResult; onBack: () => void; maxHeight: number }) {
+  useInput((input, key) => {
+    if (key.escape || input === 'b') { // Esc or a lowercase "b" triggers onBack
+      onBack();
+    }
+  });
+  // Fixed-height column with hidden vertical overflow: run summary, a 60-character rule, one section per result.
+  return (
+    <Box flexDirection="column" height={maxHeight} overflowY="hidden">
+      <Box flexDirection="column" marginBottom={1}>
+        <Text>
+          <Text bold>Agent:</Text> {run.agent}
+          {'  '}
+          <Text bold>Date:</Text> {formatFullDate(run.timestamp)}
+          {'  '}
+          <Text bold>Lookback:</Text> {run.lookbackDays}d
+        </Text>
+        <Text>
+          <Text bold>Sessions:</Text> {run.sessionCount}
+          {'  '}
+          <Text bold>Evaluators:</Text> {run.evaluators.map(shortEvalName).join(', ')}
+        </Text>
+      </Box>
+      <Text color="gray">{'─'.repeat(60)}</Text>
+      {run.results.map((result, i) => (
+        <EvaluatorDetail key={i} result={result} />
+      ))}
+    </Box>
+  );
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Main screen
+// ─────────────────────────────────────────────────────────────────────────────
 
 interface EvalScreenProps {
   isInteractive: boolean;
   onExit: () => void;
 }
 
-type Phase = 'loading' | 'loaded' | 'error';
+type View = 'agents' | 'runs' | 'detail';
 
 interface EvalState {
-  phase: Phase;
+  phase: 'loading' | 'loaded' | 'error';
   runs: EvalRunResult[];
   error: string | null;
 }
 
-export function EvalScreen({ isInteractive, onExit }: EvalScreenProps) {
+export function EvalScreen({ onExit }: EvalScreenProps) {
+  const { stdout } = useStdout();
+  const terminalHeight = stdout?.rows ?? 24;
+  const availableHeight = Math.max(6, terminalHeight - CHROME_LINES);
+
   const [state, setState] = useState<EvalState>({
     phase: 'loading',
     runs: [],
     error: null,
   });
+  const [view, setView] = useState<View>('agents');
+  const [selectedAgent, setSelectedAgent] = useState<string | null>(null);
+  const [selectedRun, setSelectedRun] = useState<EvalRunResult | null>(null);
+  const [resultsDir, setResultsDir] = useState<string | null>(null);
 
   useEffect(() => {
     const load = async () => {
-      // Yield to allow React to paint the loading state
       await new Promise(resolve => setTimeout(resolve, 0));
-
+      try {
+        setResultsDir(getResultsPath());
+      } catch {
+        // ignore — no project context
+      }
       const result = handleListEvalRuns({});
-
       if (!result.success) {
         setState({ phase: 'error', runs: [], error: result.error ?? 'Unknown error' });
         return;
       }
-
       setState({ phase: 'loaded', runs: result.runs ?? [], error: null });
     };
-
     void load();
   }, []);
 
+  // Group runs by agent
+  const agentGroups: AgentGroup[] = useMemo(() => {
+    const map = new Map<string, { runs: EvalRunResult[] }>();
+    for (const run of state.runs) {
+      const entry = map.get(run.agent);
+      if (entry) {
+        entry.runs.push(run);
+      } else {
+        map.set(run.agent, { runs: [run] });
+      }
+    }
+
+    return Array.from(map.entries())
+      .map(([agent, { runs }]) => ({
+        agent,
+        runCount: runs.length,
+        lastRun: runs[0]!.timestamp,
+      }))
+      .sort((a, b) => new Date(b.lastRun).getTime() - new Date(a.lastRun).getTime());
+  }, [state.runs]);
+
+  // Runs for selected agent
+  const agentRuns = useMemo(
+    () => (selectedAgent ? state.runs.filter(r => r.agent === selectedAgent) : []),
+    [state.runs, selectedAgent]
+  );
+
+  // If only one agent, skip the picker (state sync pattern — no effect needed)
+  if (state.phase === 'loaded' && agentGroups.length === 1 && view === 'agents') {
+    setSelectedAgent(agentGroups[0]!.agent);
+    setView('runs');
+  }
+
+  const helpText =
+    view === 'detail'
+      ? 'Esc/B back to runs'
+      : view === 'runs' && agentGroups.length > 1
+        ? 'Esc back to agents'
+        : state.runs.length > 0
+          ? HELP_TEXT.NAVIGATE_SELECT
+          : HELP_TEXT.EXIT;
+
+  const screenTitle = view === 'runs' || view === 'detail' ? 'Eval Runs' : 'Eval Runs';
+
+  const noRuns = state.phase === 'loaded' && state.runs.length === 0;
+  const exitEnabled = noRuns || (view === 'agents' && agentGroups.length > 1);
+
   return (
-    <Screen title="Eval Runs" onExit={onExit}>
-      <Box flexDirection="column" marginTop={1}>
-        {state.phase === 'loading' && <Text dimColor>Loading eval runs...</Text>}
+    <Screen title={screenTitle} onExit={onExit} helpText={helpText} exitEnabled={exitEnabled}>
+      {state.phase === 'loading' && <Text dimColor>Loading eval runs...</Text>}
 
-        {state.phase === 'error' && <Text color={STATUS_COLORS.error}>{state.error}</Text>}
+      {state.phase === 'error' && <Text color={STATUS_COLORS.error}>{state.error}</Text>}
 
-        {state.phase === 'loaded' && state.runs.length === 0 && (
+      {noRuns && (
+        <Box flexDirection="column">
           <Text dimColor>No eval runs found. Run `agentcore run eval` to create one.</Text>
-        )}
+          {resultsDir && <Text dimColor>Results saved to: {resultsDir}</Text>}
+        </Box>
+      )}
 
-        {state.phase === 'loaded' && state.runs.length > 0 && (
-          <Box flexDirection="column">
-            <Box>
-              <Text bold>
-                {'Run ID'.padEnd(42)} {'Agent'.padEnd(20)} {'Score'.padEnd(30)} {'Sessions'.padEnd(10)} {'Date'}
-              </Text>
-            </Box>
-            <Text dimColor>{'─'.repeat(110)}</Text>
-            {state.runs.map(run => {
-              const scores = run.results.map(r => `${r.evaluator}=${r.aggregateScore.toFixed(2)}`).join(', ');
-              const date = new Date(run.timestamp).toLocaleDateString();
-              return (
-                <Box key={run.runId}>
-                  <Text>
-                    {run.runId.padEnd(42)} {run.agent.padEnd(20)} {scores.padEnd(30)}{' '}
-                    {String(run.sessionCount).padEnd(10)} {date}
-                  </Text>
-                </Box>
-              );
-            })}
-          </Box>
-        )}
-
-        {state.phase !== 'loading' && (
-          <Box marginTop={1}>
-            <Text dimColor>{isInteractive ? 'Esc/B back' : ''}</Text>
-          </Box>
-        )}
-      </Box>
+      {state.phase === 'loaded' && view === 'agents' && agentGroups.length > 1 && (
+        <AgentPickerView
+          groups={agentGroups}
+          onSelect={agent => {
+            setSelectedAgent(agent);
+            setView('runs');
+          }}
+          onExit={onExit}
+          availableHeight={availableHeight}
+        />
+      )}
+
+      {state.phase === 'loaded' && view === 'runs' && selectedAgent && (
+        <RunsListView
+          agentName={selectedAgent}
+          runs={agentRuns}
+          onSelect={run => {
+            setSelectedRun(run);
+            setView('detail');
+          }}
+          onBack={() => {
+            if (agentGroups.length > 1) {
+              setView('agents');
+              setSelectedAgent(null);
+            } else {
+              onExit();
+            }
+          }}
+          availableHeight={availableHeight}
+        />
+      )}
+
+      {state.phase === 'loaded' && view === 'detail' && selectedRun && (
+        <Panel fullWidth>
+          <RunDetailView run={selectedRun} onBack={() => setView('runs')} maxHeight={availableHeight} />
+        </Panel>
+      )}
     </Screen>
   );
 }
diff --git a/src/cli/tui/screens/eval/index.ts b/src/cli/tui/screens/eval/index.ts
index 861edb00..67dc1582 100644
--- a/src/cli/tui/screens/eval/index.ts
+++ b/src/cli/tui/screens/eval/index.ts
@@ -1 +1,2 @@
 export { EvalScreen } from './EvalScreen';
+export { EvalHubScreen } from './EvalHubScreen';
diff --git a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
index 21ecb6fb..d86d1818 100644
--- a/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
+++ b/src/cli/tui/screens/evaluator/AddEvaluatorScreen.tsx
@@ -1,18 +1,21 @@
 import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
-import { BedrockModelIdSchema, EvaluatorNameSchema } from '../../../../schema';
+import { EvaluatorNameSchema, isValidBedrockModelId } from '../../../../schema';
 import type { SelectableItem } from '../../components';
 import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardSelect } from '../../components';
 import { HELP_TEXT } from '../../constants';
 import { useListNavigation } from '../../hooks';
 import { generateUniqueName } from '../../utils';
-import type { AddEvaluatorConfig } from './types';
+import type { AddEvaluatorConfig, CustomRatingScaleType } from './types';
 import {
+  CUSTOM_RATING_SCALE_ID,
   DEFAULT_INSTRUCTIONS,
-  DEFAULT_MODEL,
   EVALUATION_LEVEL_OPTIONS,
+  EVALUATOR_MODEL_OPTIONS,
   EVALUATOR_STEP_LABELS,
   LEVEL_PLACEHOLDERS,
   RATING_SCALE_PRESETS,
+  RATING_SCALE_TYPE_OPTIONS,
+  parseCustomRatingScale,
   validateInstructionPlaceholders,
 } from './types';
 import { useAddEvaluatorWizard } from './useAddEvaluatorWizard';
@@ -44,15 +47,31 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
   );
 
   const ratingScaleItems: SelectableItem[] = useMemo(
-    () => RATING_SCALE_PRESETS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })),
+    () => [
+      ...RATING_SCALE_PRESETS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })),
+      { id: CUSTOM_RATING_SCALE_ID, title: 'Custom', description: 'Define your own rating scale' },
+    ],
+    []
+  );
+
+  const ratingScaleTypeItems: SelectableItem[] = useMemo(
+    () => RATING_SCALE_TYPE_OPTIONS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })),
+    []
+  );
+
+  const modelItems: SelectableItem[] = useMemo(
+    () => EVALUATOR_MODEL_OPTIONS.map(opt => ({ id: opt.id, title: opt.title, description: opt.description })),
     []
   );
 
   const isNameStep = wizard.step === 'name';
   const isLevelStep = wizard.step === 'level';
   const isModelStep = wizard.step === 'model';
+  const isModelCustomStep = wizard.step === 'model-custom';
   const isInstructionsStep = wizard.step === 'instructions';
   const isRatingScaleStep = wizard.step === 'ratingScale';
+  const isRatingScaleTypeStep = wizard.step === 'ratingScale-type';
+  const isRatingScaleCustomStep = wizard.step === 'ratingScale-custom';
   const isConfirmStep = wizard.step === 'confirm';
 
   const levelNav = useListNavigation({
@@ -62,16 +81,30 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
     isActive: isLevelStep,
   });
 
+  const modelNav = useListNavigation({
+    items: modelItems,
+    onSelect: item => wizard.selectModel(item.id),
+    onExit: () => wizard.goBack(),
+    isActive: isModelStep,
+  });
+
   const ratingScaleNav = useListNavigation({
     items: ratingScaleItems,
     onSelect: item => {
       const preset = RATING_SCALE_PRESETS.find(p => p.id === item.id);
-      if (preset) wizard.setRatingScale(preset.ratingScale);
+      wizard.selectRatingScale(item.id, preset?.ratingScale);
     },
     onExit: () => wizard.goBack(),
     isActive: isRatingScaleStep,
   });
 
+  const ratingScaleTypeNav = useListNavigation({
+    items: ratingScaleTypeItems,
+    onSelect: item => wizard.selectCustomRatingScaleType(item.id as CustomRatingScaleType),
+    onExit: () => wizard.goBack(),
+    isActive: isRatingScaleTypeStep,
+  });
+
   useListNavigation({
     items: [{ id: 'confirm', title: 'Confirm' }],
     onSelect: () => onComplete(wizard.config),
@@ -80,7 +113,7 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
   });
 
   const helpText =
-    isLevelStep || isRatingScaleStep
+    isLevelStep || isRatingScaleStep || isModelStep || isRatingScaleTypeStep
       ? HELP_TEXT.NAVIGATE_SELECT
       : isConfirmStep
         ? HELP_TEXT.CONFIRM_CANCEL
@@ -113,13 +146,25 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
         )}
 
         {isModelStep && (
+          <WizardSelect
+            title="Select model"
+            description="Choose the LLM for evaluation judging"
+            items={modelItems}
+            selectedIndex={modelNav.selectedIndex}
+          />
+        )}
+
+        {isModelCustomStep && (
           <TextInput
-            key="model"
+            key="model-custom"
             prompt="Bedrock model ID"
-            initialValue={DEFAULT_MODEL}
-            onSubmit={wizard.setModel}
+            initialValue=""
+            onSubmit={wizard.setCustomModel}
             onCancel={() => wizard.goBack()}
-            schema={BedrockModelIdSchema}
+            customValidation={value =>
+              isValidBedrockModelId(value) ||
+              'Must be a valid Bedrock model ID (e.g. us.anthropic.claude-sonnet-4-5-20250929-v1:0) or model ARN'
+            }
           />
         )}
 
@@ -144,12 +189,49 @@ export function AddEvaluatorScreen({ onComplete, onExit, existingEvaluatorNames
         {isRatingScaleStep && (
           <WizardSelect
             title="Rating scale"
-            description="Choose a rating scale preset"
+            description="Choose a preset or define your own"
             items={ratingScaleItems}
             selectedIndex={ratingScaleNav.selectedIndex}
           />
         )}
 
+        {isRatingScaleTypeStep && (
+          <WizardSelect
+            title="Scale type"
+            description="Choose the type of custom rating scale"
+            items={ratingScaleTypeItems}
+            selectedIndex={ratingScaleTypeNav.selectedIndex}
+          />
+        )}
+
+        {isRatingScaleCustomStep && (
+          <Box flexDirection="column">
+            <Text>Define rating scale entries</Text>
+            <Text dimColor>
+              {wizard.customRatingScaleType === 'numerical'
+                ? 'Format: value:label:definition, ... (e.g. 1:Poor:Fails, 3:Good:Meets, 5:Excellent:Exceeds)'
+                : 'Format: label:definition, ... (e.g. Pass:Meets criteria, Fail:Does not meet)'}
+            </Text>
+            <TextInput
+              key="ratingScale-custom"
+              prompt=""
+              hideArrow={false}
+              initialValue=""
+              onSubmit={value => {
+                const result = parseCustomRatingScale(value, wizard.customRatingScaleType);
+                if (result.success) {
+                  wizard.setCustomRatingScale(result.ratingScale);
+                }
+              }}
+              onCancel={() => wizard.goBack()}
+              customValidation={value => {
+                const result = parseCustomRatingScale(value, wizard.customRatingScaleType);
+                return result.success || result.error;
+              }}
+            />
+          </Box>
+        )}
+
         {isConfirmStep && (
           <ConfirmReview
             fields={[
diff --git a/src/cli/tui/screens/evaluator/__tests__/types.test.ts b/src/cli/tui/screens/evaluator/__tests__/types.test.ts
index 3d807c01..aac6ab0c 100644
--- a/src/cli/tui/screens/evaluator/__tests__/types.test.ts
+++ b/src/cli/tui/screens/evaluator/__tests__/types.test.ts
@@ -1,4 +1,10 @@
-import { DEFAULT_INSTRUCTIONS, DEFAULT_MODEL, LEVEL_PLACEHOLDERS, validateInstructionPlaceholders } from '../types.js';
+import {
+  DEFAULT_INSTRUCTIONS,
+  DEFAULT_MODEL,
+  LEVEL_PLACEHOLDERS,
+  parseCustomRatingScale,
+  validateInstructionPlaceholders,
+} from '../types.js';
 import { describe, expect, it } from 'vitest';
 
 describe('LEVEL_PLACEHOLDERS', () => {
@@ -8,17 +14,22 @@ describe('LEVEL_PLACEHOLDERS', () => {
     expect(LEVEL_PLACEHOLDERS).toHaveProperty('TOOL_CALL');
   });
 
-  it('SESSION and TRACE share context and trajectory placeholders', () => {
+  it('SESSION has correct public placeholders', () => {
     expect(LEVEL_PLACEHOLDERS.SESSION).toContain('context');
+    expect(LEVEL_PLACEHOLDERS.SESSION).toContain('available_tools');
+    expect(LEVEL_PLACEHOLDERS.SESSION).toHaveLength(2);
+  });
+
+  it('TRACE has correct public placeholders', () => {
     expect(LEVEL_PLACEHOLDERS.TRACE).toContain('context');
-    expect(LEVEL_PLACEHOLDERS.SESSION).toContain('actual_trajectory');
-    expect(LEVEL_PLACEHOLDERS.TRACE).toContain('actual_trajectory');
+    expect(LEVEL_PLACEHOLDERS.TRACE).toContain('assistant_turn');
+    expect(LEVEL_PLACEHOLDERS.TRACE).toHaveLength(2);
   });
 
   it('TOOL_CALL has tool-specific placeholders', () => {
-    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_name');
-    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_input');
-    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_output');
+    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('available_tools');
+    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('context');
+    expect(LEVEL_PLACEHOLDERS.TOOL_CALL).toContain('tool_turn');
   });
 });
 
@@ -34,10 +45,8 @@ describe('DEFAULT_INSTRUCTIONS', () => {
     expect(DEFAULT_INSTRUCTIONS.SESSION).toContain('{context}');
   });
 
-  it('TOOL_CALL default uses {tool_name}, {tool_input}, {tool_output}', () => {
-    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_name}');
-    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_input}');
-    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_output}');
+  it('TOOL_CALL default uses {tool_turn}', () => {
+    expect(DEFAULT_INSTRUCTIONS.TOOL_CALL).toContain('{tool_turn}');
   });
 });
 
@@ -52,12 +61,12 @@ describe('validateInstructionPlaceholders', () => {
   it('returns true when at least one valid placeholder is present for SESSION', () => {
     expect(validateInstructionPlaceholders('Check {context} now', 'SESSION')).toBe(true);
     expect(validateInstructionPlaceholders('See {available_tools}', 'SESSION')).toBe(true);
-    expect(validateInstructionPlaceholders('Trajectory: {actual_trajectory}', 'SESSION')).toBe(true);
+    expect(validateInstructionPlaceholders('Tools: {available_tools}', 'SESSION')).toBe(true);
   });
 
   it('returns true when at least one valid placeholder is present for TOOL_CALL', () => {
-    expect(validateInstructionPlaceholders('Tool: {tool_name}', 'TOOL_CALL')).toBe(true);
-    expect(validateInstructionPlaceholders('Output: {tool_output}', 'TOOL_CALL')).toBe(true);
+    expect(validateInstructionPlaceholders('Turn: {tool_turn}', 'TOOL_CALL')).toBe(true);
+    expect(validateInstructionPlaceholders('Tools: {available_tools}', 'TOOL_CALL')).toBe(true);
   });
 
   it('returns error string when no valid placeholders are present', () => {
@@ -72,8 +81,8 @@ describe('validateInstructionPlaceholders', () => {
     expect(result).toBe(true);
   });
 
-  it('rejects TOOL_CALL-level placeholders for SESSION level', () => {
-    const result = validateInstructionPlaceholders('Tool: {tool_name}', 'SESSION');
+  it('rejects TOOL_CALL-only placeholders for SESSION level', () => {
+    const result = validateInstructionPlaceholders('Turn: {tool_turn}', 'SESSION');
     expect(typeof result).toBe('string');
     expect(result).toContain('must contain at least one placeholder');
   });
@@ -92,8 +101,61 @@ describe('validateInstructionPlaceholders', () => {
   it('returns descriptive error listing allowed placeholders', () => {
     const result = validateInstructionPlaceholders('nothing', 'TOOL_CALL');
     expect(typeof result).toBe('string');
-    expect(result as string).toContain('{tool_name}');
-    expect(result as string).toContain('{tool_input}');
-    expect(result as string).toContain('{tool_output}');
+    expect(result as string).toContain('{available_tools}');
+    expect(result as string).toContain('{context}');
+    expect(result as string).toContain('{tool_turn}');
+  });
+});
+
+describe('parseCustomRatingScale', () => {
+  it('parses numerical entries', () => {
+    const result = parseCustomRatingScale('1:Poor:Fails, 3:Good:Meets, 5:Excellent:Far exceeds', 'numerical');
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.ratingScale.numerical).toHaveLength(3);
+      expect(result.ratingScale.numerical![0]).toEqual({ value: 1, label: 'Poor', definition: 'Fails' });
+      expect(result.ratingScale.numerical![2]).toEqual({ value: 5, label: 'Excellent', definition: 'Far exceeds' });
+    }
+  });
+
+  it('parses categorical entries', () => {
+    const result = parseCustomRatingScale('Pass:Meets criteria, Fail:Does not meet', 'categorical');
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.ratingScale.categorical).toHaveLength(2);
+      expect(result.ratingScale.categorical![0]).toEqual({ label: 'Pass', definition: 'Meets criteria' });
+    }
+  });
+
+  it('rejects fewer than 2 entries', () => {
+    const result = parseCustomRatingScale('1:Poor:Fails', 'numerical');
+    expect(result.success).toBe(false);
+    if (!result.success) expect(result.error).toContain('At least 2');
+  });
+
+  it('rejects numerical entry with non-number value', () => {
+    const result = parseCustomRatingScale('abc:Poor:Fails, 2:Good:Nice', 'numerical');
+    expect(result.success).toBe(false);
+    if (!result.success) expect(result.error).toContain('not a valid number');
+  });
+
+  it('rejects numerical entry with too few parts', () => {
+    const result = parseCustomRatingScale('1:Poor, 2:Good:Nice', 'numerical');
+    expect(result.success).toBe(false);
+    if (!result.success) expect(result.error).toContain('Format');
+  });
+
+  it('rejects categorical entry with too few parts', () => {
+    const result = parseCustomRatingScale('Pass, Fail:Bad', 'categorical');
+    expect(result.success).toBe(false);
+    if (!result.success) expect(result.error).toContain('Format');
+  });
+
+  it('handles definitions containing colons', () => {
+    const result = parseCustomRatingScale('Pass:Good: meets all criteria, Fail:Bad: fails all', 'categorical');
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.ratingScale.categorical![0]!.definition).toBe('Good: meets all criteria');
+    }
   });
 });
diff --git a/src/cli/tui/screens/evaluator/types.ts b/src/cli/tui/screens/evaluator/types.ts
index af2d2ce5..70ba09c0 100644
--- a/src/cli/tui/screens/evaluator/types.ts
+++ b/src/cli/tui/screens/evaluator/types.ts
@@ -4,7 +4,16 @@ import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
 // Evaluator Flow Types
 // ─────────────────────────────────────────────────────────────────────────────
 
-export type AddEvaluatorStep = 'name' | 'level' | 'model' | 'instructions' | 'ratingScale' | 'confirm';
+export type AddEvaluatorStep =
+  | 'name'
+  | 'level'
+  | 'model'
+  | 'model-custom'
+  | 'instructions'
+  | 'ratingScale'
+  | 'ratingScale-type'
+  | 'ratingScale-custom'
+  | 'confirm';
 
 export interface AddEvaluatorConfig {
   name: string;
@@ -16,8 +25,11 @@ export const EVALUATOR_STEP_LABELS: Record<AddEvaluatorStep, string> = {
   name: 'Name',
   level: 'Level',
   model: 'Model',
+  'model-custom': 'Model',
   instructions: 'Prompt',
   ratingScale: 'Scale',
+  'ratingScale-type': 'Scale',
+  'ratingScale-custom': 'Scale',
   confirm: 'Confirm',
 };
 
@@ -33,6 +45,47 @@ export const EVALUATION_LEVEL_OPTIONS = [
 
 export const DEFAULT_MODEL = 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
 
+export const CUSTOM_MODEL_ID = '__custom__';
+
+export interface EvaluatorModelOption {
+  id: string;
+  title: string;
+  description: string;
+}
+
+export const EVALUATOR_MODEL_OPTIONS: EvaluatorModelOption[] = [
+  {
+    id: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+    title: 'Claude Sonnet 4.5',
+    description: 'Recommended — balanced speed and accuracy',
+  },
+  {
+    id: 'global.anthropic.claude-opus-4-5-20251101-v1:0',
+    title: 'Claude Opus 4.5',
+    description: 'Most capable — best for complex evaluations',
+  },
+  {
+    id: 'us.anthropic.claude-haiku-4-5-20251001-v1:0',
+    title: 'Claude Haiku 4.5',
+    description: 'Fastest — good for high-volume evaluations',
+  },
+  {
+    id: 'us.amazon.nova-pro-v1:0',
+    title: 'Amazon Nova Pro',
+    description: 'Amazon foundation model — strong reasoning',
+  },
+  {
+    id: 'us.amazon.nova-lite-v1:0',
+    title: 'Amazon Nova Lite',
+    description: 'Amazon foundation model — fast and cost-effective',
+  },
+  {
+    id: CUSTOM_MODEL_ID,
+    title: 'Other',
+    description: 'Enter a custom Bedrock model ID or ARN',
+  },
+];
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Placeholder Constants
 // ─────────────────────────────────────────────────────────────────────────────
@@ -42,9 +95,9 @@ export const DEFAULT_MODEL = 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
  * to contain at least one placeholder from the evaluator's level.
  */
 export const LEVEL_PLACEHOLDERS: Record<EvaluationLevel, string[]> = {
-  SESSION: ['available_tools', 'context', 'actual_trajectory', 'expected_trajectory', 'assertions'],
-  TRACE: ['available_tools', 'context', 'actual_trajectory', 'expected_trajectory', 'assertions'],
-  TOOL_CALL: ['tool_name', 'tool_input', 'tool_output', 'context'],
+  SESSION: ['context', 'available_tools'],
+  TRACE: ['context', 'assistant_turn'],
+  TOOL_CALL: ['available_tools', 'context', 'tool_turn'],
 };
 
 /**
@@ -52,11 +105,10 @@ export const LEVEL_PLACEHOLDERS: Record<EvaluationLevel, string[]> = {
  */
 export const DEFAULT_INSTRUCTIONS: Record<EvaluationLevel, string> = {
   SESSION:
-    'Evaluate the agent session based on the following conversation. Context: {context}. Rate the overall quality of the response.',
+    'Evaluate the agent session. Context: {context}. Available tools: {available_tools}. Rate the overall quality of the session.',
   TRACE:
-    'Evaluate the agent trace based on the following conversation. Context: {context}. Rate the quality of this trace.',
-  TOOL_CALL:
-    'Evaluate the tool call. Tool: {tool_name}. Input: {tool_input}. Output: {tool_output}. Rate the quality of this tool usage.',
+    'Evaluate the agent trace. Context: {context}. Assistant turn: {assistant_turn}. Rate the quality of this trace.',
+  TOOL_CALL: 'Evaluate the tool call. Context: {context}. Tool turn: {tool_turn}. Rate the quality of this tool usage.',
 };
 
 /**
@@ -78,6 +130,66 @@ export interface RatingScalePreset {
   ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale'];
 }
 
+export const CUSTOM_RATING_SCALE_ID = '__custom__';
+
+export type CustomRatingScaleType = 'numerical' | 'categorical';
+
+export const RATING_SCALE_TYPE_OPTIONS = [
+  { id: 'numerical', title: 'Numerical', description: 'Scored values (e.g. 1–5)' },
+  { id: 'categorical', title: 'Categorical', description: 'Named labels (e.g. Pass/Fail)' },
+] as const;
+
+/**
+ * Parse a custom rating scale from compact, comma-separated text (definitions may contain ':' but not ',').
+ * Numerical: "1:Poor:Fails to meet, 5:Excellent:Far exceeds" — Categorical: "Pass:Meets criteria, Fail:Does not meet"
+ * Rejects fewer than 2 entries, blank labels/definitions, and blank or non-finite values (Number('') would be 0).
+ */
+export function parseCustomRatingScale(
+  input: string,
+  type: CustomRatingScaleType
+): { success: true; ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale'] } | { success: false; error: string } {
+  const entries = input
+    .split(',')
+    .map(e => e.trim())
+    .filter(Boolean);
+
+  if (entries.length < 2) {
+    return { success: false, error: 'At least 2 entries required (comma-separated)' };
+  }
+
+  if (type === 'numerical') {
+    const numerical: { value: number; label: string; definition: string }[] = [];
+    for (const entry of entries) {
+      const firstColon = entry.indexOf(':');
+      const secondColon = firstColon >= 0 ? entry.indexOf(':', firstColon + 1) : -1;
+      const label = secondColon < 0 ? '' : entry.slice(firstColon + 1, secondColon).trim();
+      const definition = secondColon < 0 ? '' : entry.slice(secondColon + 1).trim();
+      if (!label || !definition) {
+        return { success: false, error: `Invalid entry "${entry}". Format: value:label:definition` };
+      }
+      const rawValue = entry.slice(0, firstColon).trim();
+      const value = Number(rawValue);
+      if (rawValue === '' || !Number.isFinite(value)) {
+        return { success: false, error: `"${rawValue}" is not a valid number in "${entry}"` };
+      }
+      numerical.push({ value, label, definition });
+    }
+    return { success: true, ratingScale: { numerical } };
+  }
+
+  const categorical: { label: string; definition: string }[] = [];
+  for (const entry of entries) {
+    const firstColon = entry.indexOf(':');
+    const label = entry.slice(0, Math.max(firstColon, 0)).trim();
+    const definition = entry.slice(firstColon + 1).trim();
+    if (!label || !definition) {
+      return { success: false, error: `Invalid entry "${entry}". Format: label:definition` };
+    }
+    categorical.push({ label, definition });
+  }
+  return { success: true, ratingScale: { categorical } };
+}
+
 export const RATING_SCALE_PRESETS: RatingScalePreset[] = [
   {
     id: '1-5-quality',
diff --git a/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts b/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts
index 6288eab9..f0bcc33d 100644
--- a/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts
+++ b/src/cli/tui/screens/evaluator/useAddEvaluatorWizard.ts
@@ -1,6 +1,6 @@
 import type { EvaluationLevel, EvaluatorConfig } from '../../../../schema';
-import type { AddEvaluatorConfig, AddEvaluatorStep } from './types';
-import { DEFAULT_MODEL } from './types';
+import type { AddEvaluatorConfig, AddEvaluatorStep, CustomRatingScaleType } from './types';
+import { CUSTOM_MODEL_ID, CUSTOM_RATING_SCALE_ID, DEFAULT_MODEL } from './types';
 import { useCallback, useState } from 'react';
 
 const ALL_STEPS: AddEvaluatorStep[] = ['name', 'level', 'model', 'instructions', 'ratingScale', 'confirm'];
@@ -27,13 +27,23 @@ function getDefaultConfig(): AddEvaluatorConfig {
 export function useAddEvaluatorWizard() {
   const [config, setConfig] = useState<AddEvaluatorConfig>(getDefaultConfig);
   const [step, setStep] = useState<AddEvaluatorStep>('name');
+  const [customRatingScaleType, setCustomRatingScaleType] = useState<CustomRatingScaleType>('numerical');
 
   const currentIndex = ALL_STEPS.indexOf(step);
 
   const goBack = useCallback(() => {
+    // Sub-steps not in ALL_STEPS — go back to their parent select
+    if (step === 'model-custom') {
+      setStep('model');
+      return;
+    }
+    if (step === 'ratingScale-type' || step === 'ratingScale-custom') {
+      setStep(step === 'ratingScale-custom' ? 'ratingScale-type' : 'ratingScale');
+      return;
+    }
     const prevStep = ALL_STEPS[currentIndex - 1];
     if (prevStep) setStep(prevStep);
-  }, [currentIndex]);
+  }, [currentIndex, step]);
 
   const nextStep = useCallback((currentStep: AddEvaluatorStep): AddEvaluatorStep | undefined => {
     const idx = ALL_STEPS.indexOf(currentStep);
@@ -58,7 +68,25 @@ export function useAddEvaluatorWizard() {
     [nextStep]
   );
 
-  const setModel = useCallback(
+  const selectModel = useCallback(
+    (modelId: string) => {
+      if (modelId === CUSTOM_MODEL_ID) {
+        setStep('model-custom');
+        return;
+      }
+      setConfig(c => ({
+        ...c,
+        config: {
+          llmAsAJudge: { ...c.config.llmAsAJudge, model: modelId },
+        },
+      }));
+      const next = nextStep('model');
+      if (next) setStep(next);
+    },
+    [nextStep]
+  );
+
+  const setCustomModel = useCallback(
     (model: string) => {
       setConfig(c => ({
         ...c,
@@ -66,6 +94,7 @@ export function useAddEvaluatorWizard() {
           llmAsAJudge: { ...c.config.llmAsAJudge, model },
         },
       }));
+      // After custom model input, go to instructions (same as after model select)
       const next = nextStep('model');
       if (next) setStep(next);
     },
@@ -86,7 +115,32 @@ export function useAddEvaluatorWizard() {
     [nextStep]
   );
 
-  const setRatingScale = useCallback(
+  const selectRatingScale = useCallback(
+    (presetIdOrCustom: string, ratingScale?: EvaluatorConfig['llmAsAJudge']['ratingScale']) => {
+      if (presetIdOrCustom === CUSTOM_RATING_SCALE_ID) {
+        setStep('ratingScale-type');
+        return;
+      }
+      if (ratingScale) {
+        setConfig(c => ({
+          ...c,
+          config: {
+            llmAsAJudge: { ...c.config.llmAsAJudge, ratingScale },
+          },
+        }));
+      }
+      const next = nextStep('ratingScale');
+      if (next) setStep(next);
+    },
+    [nextStep]
+  );
+
+  const selectCustomRatingScaleType = useCallback((type: CustomRatingScaleType) => {
+    setCustomRatingScaleType(type);
+    setStep('ratingScale-custom');
+  }, []);
+
+  const setCustomRatingScale = useCallback(
     (ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale']) => {
       setConfig(c => ({
         ...c,
@@ -110,12 +164,16 @@ export function useAddEvaluatorWizard() {
     step,
     steps: ALL_STEPS,
     currentIndex,
+    customRatingScaleType,
     goBack,
     setName,
     setLevel,
-    setModel,
+    selectModel,
+    setCustomModel,
     setInstructions,
-    setRatingScale,
+    selectRatingScale,
+    selectCustomRatingScaleType,
+    setCustomRatingScale,
     reset,
   };
 }
diff --git a/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx b/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx
index d838f429..b79b9eb3 100644
--- a/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx
+++ b/src/cli/tui/screens/online-eval/AddOnlineEvalFlow.tsx
@@ -1,15 +1,20 @@
+import { ConfigIO } from '../../../../lib';
+import { validateAwsCredentials } from '../../../aws/account';
+import { listEvaluators } from '../../../aws/agentcore-control';
+import { detectRegion } from '../../../aws/region';
+import { getErrorMessage } from '../../../errors';
 import { ErrorPrompt } from '../../components';
-import { useExistingEvaluatorNames } from '../../hooks/useCreateEvaluator';
-import { useAvailableAgents } from '../../hooks/useCreateMcp';
 import { useCreateOnlineEval, useExistingOnlineEvalNames } from '../../hooks/useCreateOnlineEval';
 import { AddSuccessScreen } from '../add/AddSuccessScreen';
 import { AddOnlineEvalScreen } from './AddOnlineEvalScreen';
-import type { AddOnlineEvalConfig } from './types';
+import type { AddOnlineEvalConfig, EvaluatorItem } from './types';
 import React, { useCallback, useEffect, useState } from 'react';
 
 type FlowState =
-  | { name: 'create-wizard' }
+  | { name: 'loading' }
+  | { name: 'create-wizard'; evaluators: EvaluatorItem[]; agentNames: string[] }
   | { name: 'create-success'; configName: string }
+  | { name: 'creds-error'; message: string }
   | { name: 'error'; message: string };
 
 interface AddOnlineEvalFlowProps {
@@ -23,9 +28,53 @@ interface AddOnlineEvalFlowProps {
 export function AddOnlineEvalFlow({ isInteractive = true, onExit, onBack, onDev, onDeploy }: AddOnlineEvalFlowProps) {
   const { createOnlineEval, reset: resetCreate } = useCreateOnlineEval();
   const { names: existingConfigNames } = useExistingOnlineEvalNames();
-  const { agents: availableAgents } = useAvailableAgents();
-  const { names: availableEvaluators } = useExistingEvaluatorNames();
-  const [flow, setFlow] = useState<FlowState>({ name: 'create-wizard' });
+  const [flow, setFlow] = useState<FlowState>({ name: 'loading' });
+
+  // Pre-check AWS credentials then fetch evaluators from the account
+  useEffect(() => {
+    if (flow.name !== 'loading') return;
+    let cancelled = false;
+
+    void (async () => {
+      try {
+        await validateAwsCredentials();
+      } catch (err) {
+        if (!cancelled) setFlow({ name: 'creds-error', message: getErrorMessage(err) });
+        return;
+      }
+
+      try {
+        const [{ region }, projectSpec] = await Promise.all([detectRegion(), new ConfigIO().readProjectSpec()]);
+        const result = await listEvaluators({ region });
+        if (cancelled) return;
+
+        const items: EvaluatorItem[] = result.evaluators.map(e => ({
+          arn: e.evaluatorArn,
+          name: e.evaluatorName,
+          type: e.evaluatorType,
+          description: e.description,
+        }));
+
+        const agentNames = projectSpec.agents.map(a => a.name);
+
+        if (agentNames.length === 0) {
+          setFlow({
+            name: 'error',
+            message: 'No agents found in project. Add an agent first with `agentcore add agent`.',
+          });
+          return;
+        }
+
+        setFlow({ name: 'create-wizard', evaluators: items, agentNames });
+      } catch (err) {
+        if (!cancelled) setFlow({ name: 'error', message: getErrorMessage(err) });
+      }
+    })();
+
+    return () => {
+      cancelled = true;
+    };
+  }, [flow.name]);
 
   useEffect(() => {
     if (!isInteractive && flow.name === 'create-success') {
@@ -46,12 +95,20 @@ export function AddOnlineEvalFlow({ isInteractive = true, onExit, onBack, onDev,
     [createOnlineEval]
   );
 
+  if (flow.name === 'loading') {
+    return null;
+  }
+
+  if (flow.name === 'creds-error') {
+    return <ErrorPrompt message="AWS credentials required" detail={flow.message} onBack={onBack} onExit={onExit} />;
+  }
+
   if (flow.name === 'create-wizard') {
     return (
       <AddOnlineEvalScreen
         existingConfigNames={existingConfigNames}
-        availableAgents={availableAgents}
-        availableEvaluators={availableEvaluators}
+        evaluatorItems={flow.evaluators}
+        agentNames={flow.agentNames}
         onComplete={handleCreateComplete}
         onExit={onBack}
       />
@@ -78,7 +135,7 @@ export function AddOnlineEvalFlow({ isInteractive = true, onExit, onBack, onDev,
       detail={flow.message}
       onBack={() => {
         resetCreate();
-        setFlow({ name: 'create-wizard' });
+        setFlow({ name: 'loading' });
       }}
       onExit={onExit}
     />
diff --git a/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx b/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx
index 7390a874..19eb35a4 100644
--- a/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx
+++ b/src/cli/tui/screens/online-eval/AddOnlineEvalScreen.tsx
@@ -1,11 +1,19 @@
 import { OnlineEvalConfigNameSchema } from '../../../../schema';
 import type { SelectableItem } from '../../components';
-import { ConfirmReview, Panel, Screen, StepIndicator, TextInput, WizardMultiSelect } from '../../components';
+import {
+  ConfirmReview,
+  Panel,
+  Screen,
+  StepIndicator,
+  TextInput,
+  WizardMultiSelect,
+  WizardSelect,
+} from '../../components';
 import { HELP_TEXT } from '../../constants';
 import { useListNavigation, useMultiSelectNavigation } from '../../hooks';
 import { generateUniqueName } from '../../utils';
-import type { AddOnlineEvalConfig } from './types';
-import { BUILTIN_EVALUATORS, DEFAULT_SAMPLING_RATE, ONLINE_EVAL_STEP_LABELS } from './types';
+import type { AddOnlineEvalConfig, EvaluatorItem } from './types';
+import { DEFAULT_SAMPLING_RATE, ONLINE_EVAL_STEP_LABELS } from './types';
 import { useAddOnlineEvalWizard } from './useAddOnlineEvalWizard';
 import React, { useMemo } from 'react';
 
@@ -13,43 +21,59 @@ interface AddOnlineEvalScreenProps {
   onComplete: (config: AddOnlineEvalConfig) => void;
   onExit: () => void;
   existingConfigNames: string[];
-  availableAgents: string[];
-  availableEvaluators: string[];
+  evaluatorItems: EvaluatorItem[];
+  agentNames: string[];
 }
 
 export function AddOnlineEvalScreen({
   onComplete,
   onExit,
   existingConfigNames,
-  availableAgents,
-  availableEvaluators,
+  evaluatorItems: rawEvaluatorItems,
+  agentNames,
 }: AddOnlineEvalScreenProps) {
-  const wizard = useAddOnlineEvalWizard();
+  const wizard = useAddOnlineEvalWizard(agentNames.length);
 
-  const agentItems: SelectableItem[] = useMemo(
-    () => availableAgents.map(name => ({ id: name, title: name, description: 'Agent' })),
-    [availableAgents]
-  );
+  // Auto-set agent when there's only one
+  const effectiveConfig = useMemo(() => {
+    if (agentNames.length === 1 && !wizard.config.agent) {
+      return { ...wizard.config, agent: agentNames[0]! };
+    }
+    return wizard.config;
+  }, [wizard.config, agentNames]);
 
   const evaluatorItems: SelectableItem[] = useMemo(() => {
-    const custom = availableEvaluators.map(name => ({ id: name, title: name, description: 'Custom evaluator' }));
-    const builtin = BUILTIN_EVALUATORS.map(b => ({ id: b.id, title: b.title, description: b.description }));
-    return [...custom, ...builtin];
-  }, [availableEvaluators]);
+    return rawEvaluatorItems.map(e => ({
+      id: e.arn,
+      title: e.name,
+      description: e.type === 'Builtin' ? 'Built-in evaluator' : (e.description ?? 'Custom evaluator'),
+    }));
+  }, [rawEvaluatorItems]);
+
+  const agentItems: SelectableItem[] = useMemo(() => {
+    return agentNames.map(name => ({ id: name, title: name }));
+  }, [agentNames]);
 
   const isNameStep = wizard.step === 'name';
-  const isAgentsStep = wizard.step === 'agents';
+  const isAgentStep = wizard.step === 'agent';
   const isEvaluatorsStep = wizard.step === 'evaluators';
   const isSamplingRateStep = wizard.step === 'samplingRate';
+  const isEnableOnCreateStep = wizard.step === 'enableOnCreate';
   const isConfirmStep = wizard.step === 'confirm';
 
-  const agentsNav = useMultiSelectNavigation({
+  const enableOnCreateItems: SelectableItem[] = useMemo(
+    () => [
+      { id: 'yes', title: 'Yes', description: 'Enable evaluation immediately after deploy' },
+      { id: 'no', title: 'No', description: 'Deploy paused — enable later with `agentcore resume online-eval`' },
+    ],
+    []
+  );
+
+  const agentNav = useListNavigation({
     items: agentItems,
-    getId: item => item.id,
-    onConfirm: ids => wizard.setAgents(ids),
+    onSelect: item => wizard.setAgent(item.id),
     onExit: () => wizard.goBack(),
-    isActive: isAgentsStep,
-    requireSelection: true,
+    isActive: isAgentStep,
   });
 
   const evaluatorsNav = useMultiSelectNavigation({
@@ -61,16 +85,24 @@ export function AddOnlineEvalScreen({
     requireSelection: true,
   });
 
+  const enableOnCreateNav = useListNavigation({
+    items: enableOnCreateItems,
+    onSelect: item => wizard.setEnableOnCreate(item.id === 'yes'),
+    onExit: () => wizard.goBack(),
+    isActive: isEnableOnCreateStep,
+  });
+
   useListNavigation({
     items: [{ id: 'confirm', title: 'Confirm' }],
-    onSelect: () => onComplete(wizard.config),
+    onSelect: () => onComplete(effectiveConfig),
     onExit: () => wizard.goBack(),
     isActive: isConfirmStep,
   });
 
-  const helpText =
-    isAgentsStep || isEvaluatorsStep
-      ? 'Space toggle · Enter confirm · Esc back'
+  const helpText = isEvaluatorsStep
+    ? 'Space toggle · Enter confirm · Esc back'
+    : isAgentStep || isEnableOnCreateStep
+      ? HELP_TEXT.NAVIGATE_SELECT
       : isConfirmStep
         ? HELP_TEXT.CONFIRM_CANCEL
         : HELP_TEXT.TEXT_INPUT;
@@ -94,13 +126,12 @@ export function AddOnlineEvalScreen({
           />
         )}
 
-        {isAgentsStep && (
-          <WizardMultiSelect
-            title="Select agents to monitor"
-            description="Choose which agents this config evaluates"
+        {isAgentStep && (
+          <WizardSelect
+            title="Select agent to monitor"
+            description="Each online eval config monitors a single agent"
             items={agentItems}
-            cursorIndex={agentsNav.cursorIndex}
-            selectedIds={agentsNav.selectedIds}
+            selectedIndex={agentNav.selectedIndex}
           />
         )}
 
@@ -134,14 +165,23 @@ export function AddOnlineEvalScreen({
           />
         )}
 
+        {isEnableOnCreateStep && (
+          <WizardSelect
+            title="Enable on deploy?"
+            description="If enabled, evaluation starts automatically after `agentcore deploy`"
+            items={enableOnCreateItems}
+            selectedIndex={enableOnCreateNav.selectedIndex}
+          />
+        )}
+
         {isConfirmStep && (
           <ConfirmReview
             fields={[
-              { label: 'Name', value: wizard.config.name },
-              { label: 'Agents', value: wizard.config.agents.join(', ') },
-              { label: 'Evaluators', value: wizard.config.evaluators.join(', ') },
-              { label: 'Sampling Rate', value: `${wizard.config.samplingRate}%` },
-              { label: 'Enable on Create', value: 'Yes' },
+              { label: 'Name', value: effectiveConfig.name },
+              { label: 'Agent', value: effectiveConfig.agent },
+              { label: 'Evaluators', value: effectiveConfig.evaluators.join(', ') },
+              { label: 'Sampling Rate', value: `${effectiveConfig.samplingRate}%` },
+              { label: 'Enable on Deploy', value: effectiveConfig.enableOnCreate ? 'Yes' : 'No' },
             ]}
           />
         )}
diff --git a/src/cli/tui/screens/online-eval/OnlineEvalDashboard.tsx b/src/cli/tui/screens/online-eval/OnlineEvalDashboard.tsx
new file mode 100644
index 00000000..1c7ceb49
--- /dev/null
+++ b/src/cli/tui/screens/online-eval/OnlineEvalDashboard.tsx
@@ -0,0 +1,259 @@
+import type { GetOnlineEvalConfigResult } from '../../../aws/agentcore-control';
+import { getOnlineEvaluationConfig } from '../../../aws/agentcore-control';
+import { getErrorMessage } from '../../../errors';
+import { handlePauseResume } from '../../../operations/eval';
+import { loadDeployedProjectConfig } from '../../../operations/resolve-agent';
+import { Panel, Screen } from '../../components';
+import { useListNavigation } from '../../hooks';
+import { STATUS_COLORS } from '../../theme';
+import { Box, Text, useInput } from 'ink';
+import React, { useCallback, useEffect, useRef, useState } from 'react';
+
+interface OnlineEvalDashboardProps {
+  isInteractive: boolean; // declared for callers; OnlineEvalDashboard itself only destructures onExit
+  onExit: () => void; // forwarded to Screen and to the list navigation as their exit handler
+}
+
+interface DashboardConfig {
+  name: string; // name of the local online eval config
+  configId: string; // deployed onlineEvaluationConfigId, or '' when the config is not deployed
+  region: string; // region passed to getOnlineEvaluationConfig
+  evaluators: string[];
+  samplingRate: number; // rendered with a trailing '%'
+  liveStatus?: string; // `status` from the live API lookup
+  executionStatus?: string; // from deployed state, replaced by the live value when the lookup succeeds
+  failureReason?: string; // `failureReason` from the live API lookup
+  error?: string; // message recorded when the live lookup throws
+}
+
+type Phase = 'loading' | 'loaded' | 'error' | 'toggling'; // 'toggling': a pause/resume request is in flight
+
+interface DashboardState {
+  phase: Phase;
+  configs: DashboardConfig[]; // rows currently shown; reset to [] when a load fails
+  error: string | null; // message from a failed load or a failed toggle, otherwise null
+}
+
+/** Color for a config's execution status: ENABLED is green, DISABLED is yellow, anything else gray. */
+function executionStatusColor(status?: string): string {
+  if (status === 'ENABLED') {
+    return 'green';
+  }
+  if (status === 'DISABLED') {
+    return 'yellow';
+  }
+  return 'gray';
+}
+
+/**
+ * Color for a config's lifecycle status, matched case-insensitively.
+ * ACTIVE is green, CREATING and UPDATING are yellow, FAILED is red;
+ * any other value (including undefined) is gray.
+ */
+function configStatusColor(status?: string): string {
+  const normalized = status?.toUpperCase();
+  if (normalized === 'ACTIVE') return 'green';
+  if (normalized === 'CREATING' || normalized === 'UPDATING') return 'yellow';
+  if (normalized === 'FAILED') return 'red';
+  // Unknown or missing status.
+  return 'gray';
+}
+
+/**
+ * Builds the dashboard rows from the local project's online eval configs.
+ *
+ * Only the first target in the deployed state is consulted; its region falls back to 'us-east-1'
+ * when awsTargets has no entry with that name. Rows that have a deployed config id are enriched
+ * with live status from the API; a failed lookup is recorded on that row's `error` field.
+ *
+ * @returns One row per local config, in project order; [] when the deployed state has no targets.
+ */
+async function fetchDashboardConfigs(): Promise<DashboardConfig[]> {
+  const { project, deployedState, awsTargets } = await loadDeployedProjectConfig();
+
+  const [firstTarget] = Object.keys(deployedState.targets);
+  if (firstTarget === undefined) return [];
+
+  const region = awsTargets.find(t => t.name === firstTarget)?.region ?? 'us-east-1';
+  const deployedByName = deployedState.targets[firstTarget]?.resources?.onlineEvalConfigs ?? {};
+
+  const rows: DashboardConfig[] = (project.onlineEvalConfigs ?? []).map(local => {
+    const deployed = deployedByName[local.name];
+    return {
+      name: local.name,
+      configId: deployed?.onlineEvaluationConfigId ?? '',
+      region,
+      evaluators: local.evaluators,
+      samplingRate: local.samplingRate,
+      executionStatus: deployed?.executionStatus,
+    };
+  });
+
+  // Fetch live status concurrently; rows without a config id pass through unchanged.
+  return Promise.all(
+    rows.map(async (row): Promise<DashboardConfig> => {
+      if (!row.configId) return row;
+      try {
+        const live: GetOnlineEvalConfigResult = await getOnlineEvaluationConfig({
+          region: row.region,
+          configId: row.configId,
+        });
+        return {
+          ...row,
+          liveStatus: live.status,
+          executionStatus: live.executionStatus,
+          failureReason: live.failureReason,
+        };
+      } catch (err) {
+        return { ...row, error: getErrorMessage(err) };
+      }
+    })
+  );
+}
+
+/**
+ * Dashboard of online eval configs with live status; Enter pauses/resumes the selected one, Ctrl+R reloads.
+ */
+export function OnlineEvalDashboard({ onExit }: OnlineEvalDashboardProps) {
+  const [state, setState] = useState<DashboardState>({ phase: 'loading', configs: [], error: null });
+  const [refreshKey, setRefreshKey] = useState(0);
+  const mountedRef = useRef(true);
+
+  const refresh = useCallback(() => {
+    setState(prev => ({ ...prev, phase: 'loading', error: null }));
+    setRefreshKey(k => k + 1);
+  }, []);
+
+  useInput(
+    (input, key) => {
+      if (input === 'r' && key.ctrl && state.phase === 'loaded') refresh();
+    },
+    { isActive: state.phase === 'loaded' }
+  );
+
+  useEffect(() => {
+    mountedRef.current = true;
+    fetchDashboardConfigs()
+      .then(configs => {
+        if (mountedRef.current) setState({ phase: 'loaded', configs, error: null });
+      })
+      .catch(err => {
+        if (mountedRef.current) setState({ phase: 'error', configs: [], error: getErrorMessage(err) });
+      });
+    return () => {
+      mountedRef.current = false;
+    };
+  }, [refreshKey]);
+
+  const nav = useListNavigation({
+    items: state.configs,
+    onSelect: item => {
+      if (!item.configId) return;
+      const action = item.executionStatus === 'ENABLED' ? 'pause' : 'resume';
+      setState(prev => ({ ...prev, phase: 'toggling' }));
+      void (async () => {
+        try {
+          const result = await handlePauseResume({ name: item.name }, action);
+          const configs = result.success ? await fetchDashboardConfigs() : undefined;
+          if (!mountedRef.current) return;
+          if (configs) setState({ phase: 'loaded', configs, error: null });
+          else setState(prev => ({ ...prev, phase: 'loaded', error: result.error ?? 'Toggle failed' }));
+        } catch (err) {
+          // A rejected toggle or reload must still leave 'toggling'; navigation and refresh are off in that phase.
+          if (mountedRef.current) setState(prev => ({ ...prev, phase: 'loaded', error: getErrorMessage(err) }));
+        }
+      })();
+    },
+    onExit: () => onExit(),
+    isActive: state.phase === 'loaded' && state.configs.length > 0,
+  });
+
+  const helpText =
+    state.configs.length > 0
+      ? '↑↓ navigate · Enter toggle pause/resume · Ctrl+R refresh · Esc back'
+      : 'Esc back · Ctrl+C quit';
+
+  return (
+    <Screen title="Online Eval Dashboard" onExit={onExit} helpText={helpText} exitEnabled={state.configs.length === 0}>
+      {(state.phase === 'loading' || state.phase === 'toggling') && (
+        <Text dimColor>{state.phase === 'toggling' ? 'Updating...' : 'Loading online eval configs...'}</Text>
+      )}
+
+      {state.phase === 'error' && <Text color={STATUS_COLORS.error}>{state.error}</Text>}
+
+      {state.error && state.phase === 'loaded' && (
+        <Box marginBottom={1}>
+          <Text color={STATUS_COLORS.error}>{state.error}</Text>
+        </Box>
+      )}
+
+      {state.phase === 'loaded' && state.configs.length === 0 && (
+        <Box flexDirection="column">
+          <Text dimColor>No online eval configs found.</Text>
+          <Text dimColor>Run `agentcore add online-eval` then `agentcore deploy` to get started.</Text>
+        </Box>
+      )}
+
+      {state.phase === 'loaded' && state.configs.length > 0 && (
+        <Panel fullWidth>
+          <Box flexDirection="column">
+            {state.configs.map((config, idx) => {
+              const selected = idx === nav.selectedIndex;
+              const isDeployed = Boolean(config.configId);
+              const toggleLabel = config.executionStatus === 'ENABLED' ? 'Enter to pause' : 'Enter to resume';
+              return (
+                <Box key={config.name} flexDirection="column" marginBottom={idx < state.configs.length - 1 ? 1 : 0}>
+                  <Text wrap="wrap">
+                    <Text color={selected ? 'cyan' : undefined}>{selected ? '❯' : ' '} </Text>
+                    <Text color={selected ? 'cyan' : undefined} bold={selected}>
+                      {config.name}
+                    </Text>
+                    {config.liveStatus && (
+                      <Text color={configStatusColor(config.liveStatus)}> [{config.liveStatus}]</Text>
+                    )}
+                    {config.executionStatus && (
+                      <Text color={executionStatusColor(config.executionStatus)}> {config.executionStatus}</Text>
+                    )}
+                    {!isDeployed && <Text color="yellow"> [Not deployed]</Text>}
+                  </Text>
+                  <Text wrap="wrap">
+                    <Text>{'  '}</Text>
+                    <Text dimColor>
+                      Evaluators: {config.evaluators.join(', ')}
+                      {'  '}
+                      Sampling: {config.samplingRate}%
+                    </Text>
+                  </Text>
+                  {config.failureReason && (
+                    <Text>
+                      <Text>{'  '}</Text>
+                      <Text color="red">Failure: {config.failureReason}</Text>
+                    </Text>
+                  )}
+                  {config.error && (
+                    <Text>
+                      <Text>{'  '}</Text>
+                      <Text color="red">Error: {config.error}</Text>
+                    </Text>
+                  )}
+                  {selected && isDeployed && (
+                    <Text>
+                      <Text>{'  '}</Text>
+                      <Text dimColor>{toggleLabel}</Text>
+                    </Text>
+                  )}
+                  {selected && !isDeployed && (
+                    <Text>
+                      <Text>{'  '}</Text>
+                      <Text dimColor>Run `agentcore deploy` to start this online eval config</Text>
+                    </Text>
+                  )}
+                </Box>
+              );
+            })}
+          </Box>
+        </Panel>
+      )}
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/online-eval/index.ts b/src/cli/tui/screens/online-eval/index.ts
index fcd0d5f4..a20c949a 100644
--- a/src/cli/tui/screens/online-eval/index.ts
+++ b/src/cli/tui/screens/online-eval/index.ts
@@ -1,2 +1,3 @@
 export { AddOnlineEvalFlow } from './AddOnlineEvalFlow';
 export { AddOnlineEvalScreen } from './AddOnlineEvalScreen';
+export { OnlineEvalDashboard } from './OnlineEvalDashboard';
diff --git a/src/cli/tui/screens/online-eval/types.ts b/src/cli/tui/screens/online-eval/types.ts
index 62432cac..943eaee7 100644
--- a/src/cli/tui/screens/online-eval/types.ts
+++ b/src/cli/tui/screens/online-eval/types.ts
@@ -2,41 +2,39 @@
 // Online Eval Config Flow Types
 // ─────────────────────────────────────────────────────────────────────────────
 
-export type AddOnlineEvalStep = 'name' | 'agents' | 'evaluators' | 'samplingRate' | 'confirm';
+export type AddOnlineEvalStep = 'name' | 'agent' | 'evaluators' | 'samplingRate' | 'enableOnCreate' | 'confirm';
 
 export interface AddOnlineEvalConfig {
   name: string;
-  agents: string[];
+  agent: string;
   evaluators: string[];
   samplingRate: number;
+  enableOnCreate: boolean;
   description?: string;
-  enableOnCreate?: boolean;
 }
 
 export const ONLINE_EVAL_STEP_LABELS: Record<AddOnlineEvalStep, string> = {
   name: 'Name',
-  agents: 'Agents',
+  agent: 'Agent',
   evaluators: 'Evaluators',
   samplingRate: 'Rate',
+  enableOnCreate: 'Enable',
   confirm: 'Confirm',
 };
 
 // ─────────────────────────────────────────────────────────────────────────────
-// Built-in Evaluators
+// Evaluator Items (fetched from API)
 // ─────────────────────────────────────────────────────────────────────────────
 
-export const BUILTIN_EVALUATORS = [
-  { id: 'Builtin.Helpfulness', title: 'Builtin.Helpfulness', description: 'Measures how helpful agent responses are' },
-  {
-    id: 'Builtin.GoalSuccessRate',
-    title: 'Builtin.GoalSuccessRate',
-    description: 'Measures whether the agent achieved the user goal',
-  },
-  {
-    id: 'Builtin.Faithfulness',
-    title: 'Builtin.Faithfulness',
-    description: 'Measures factual consistency with source material',
-  },
-] as const;
+export interface EvaluatorItem {
+  /** ARN used as the stored identifier in the config */
+  arn: string;
+  /** Display name */
+  name: string;
+  /** Evaluator type; the screens treat 'Builtin' specially and label anything else as custom */
+  type: string;
+  /** Optional description, shown for non-builtin evaluators */
+  description?: string;
+}
 
 export const DEFAULT_SAMPLING_RATE = 10;
diff --git a/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
index 64141ef3..0032469f 100644
--- a/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
+++ b/src/cli/tui/screens/online-eval/useAddOnlineEvalWizard.ts
@@ -2,32 +2,42 @@ import type { AddOnlineEvalConfig, AddOnlineEvalStep } from './types';
 import { DEFAULT_SAMPLING_RATE } from './types';
 import { useCallback, useState } from 'react';
 
-const ALL_STEPS: AddOnlineEvalStep[] = ['name', 'agents', 'evaluators', 'samplingRate', 'confirm'];
+const SINGLE_AGENT_STEPS: AddOnlineEvalStep[] = ['name', 'evaluators', 'samplingRate', 'enableOnCreate', 'confirm'];
+const MULTI_AGENT_STEPS: AddOnlineEvalStep[] = ['name', 'agent', ...SINGLE_AGENT_STEPS.slice(1)];
+/** Returns a shared array so callbacks depending on it stay memoized; the 'agent' step needs more than one agent. */
+function getAllSteps(agentCount: number): AddOnlineEvalStep[] {
+  return agentCount <= 1 ? SINGLE_AGENT_STEPS : MULTI_AGENT_STEPS;
+}
 
 function getDefaultConfig(): AddOnlineEvalConfig {
   return {
     name: '',
-    agents: [],
+    agent: '',
     evaluators: [],
     samplingRate: DEFAULT_SAMPLING_RATE,
+    enableOnCreate: true,
   };
 }
 
-export function useAddOnlineEvalWizard() {
+export function useAddOnlineEvalWizard(agentCount: number) {
+  const allSteps = getAllSteps(agentCount);
   const [config, setConfig] = useState<AddOnlineEvalConfig>(getDefaultConfig);
-  const [step, setStep] = useState<AddOnlineEvalStep>('name');
+  const [step, setStep] = useState<AddOnlineEvalStep>(allSteps[0]!);
 
-  const currentIndex = ALL_STEPS.indexOf(step);
+  const currentIndex = allSteps.indexOf(step);
 
   const goBack = useCallback(() => {
-    const prevStep = ALL_STEPS[currentIndex - 1];
+    const prevStep = allSteps[currentIndex - 1];
     if (prevStep) setStep(prevStep);
-  }, [currentIndex]);
+  }, [allSteps, currentIndex, setStep]);
 
-  const nextStep = useCallback((currentStep: AddOnlineEvalStep): AddOnlineEvalStep | undefined => {
-    const idx = ALL_STEPS.indexOf(currentStep);
-    return ALL_STEPS[idx + 1];
-  }, []);
+  const nextStep = useCallback(
+    (currentStep: AddOnlineEvalStep): AddOnlineEvalStep | undefined => {
+      const idx = allSteps.indexOf(currentStep);
+      return allSteps[idx + 1];
+    },
+    [allSteps]
+  );
 
   const setName = useCallback(
     (name: string) => {
@@ -35,16 +45,16 @@ export function useAddOnlineEvalWizard() {
       const next = nextStep('name');
       if (next) setStep(next);
     },
-    [nextStep]
+    [nextStep, setConfig, setStep]
   );
 
-  const setAgents = useCallback(
-    (agents: string[]) => {
-      setConfig(c => ({ ...c, agents }));
-      const next = nextStep('agents');
+  const setAgent = useCallback(
+    (agent: string) => {
+      setConfig(c => ({ ...c, agent }));
+      const next = nextStep('agent');
       if (next) setStep(next);
     },
-    [nextStep]
+    [nextStep, setConfig, setStep]
   );
 
   const setEvaluators = useCallback(
@@ -53,7 +63,7 @@ export function useAddOnlineEvalWizard() {
       const next = nextStep('evaluators');
       if (next) setStep(next);
     },
-    [nextStep]
+    [nextStep, setConfig, setStep]
   );
 
   const setSamplingRate = useCallback(
@@ -62,24 +72,34 @@ export function useAddOnlineEvalWizard() {
       const next = nextStep('samplingRate');
       if (next) setStep(next);
     },
-    [nextStep]
+    [nextStep, setConfig, setStep]
+  );
+
+  const setEnableOnCreate = useCallback(
+    (enableOnCreate: boolean) => {
+      setConfig(c => ({ ...c, enableOnCreate }));
+      const next = nextStep('enableOnCreate');
+      if (next) setStep(next);
+    },
+    [nextStep, setConfig, setStep]
   );
 
   const reset = useCallback(() => {
     setConfig(getDefaultConfig());
-    setStep('name');
-  }, []);
+    setStep(allSteps[0]!);
+  }, [allSteps, setConfig, setStep]);
 
   return {
     config,
     step,
-    steps: ALL_STEPS,
+    steps: allSteps,
     currentIndex,
     goBack,
     setName,
-    setAgents,
+    setAgent,
     setEvaluators,
     setSamplingRate,
+    setEnableOnCreate,
     reset,
   };
 }
diff --git a/src/cli/tui/screens/run-eval/RunEvalFlow.tsx b/src/cli/tui/screens/run-eval/RunEvalFlow.tsx
new file mode 100644
index 00000000..6d4c4d57
--- /dev/null
+++ b/src/cli/tui/screens/run-eval/RunEvalFlow.tsx
@@ -0,0 +1,294 @@
+import { validateAwsCredentials } from '../../../aws/account';
+import { listEvaluators } from '../../../aws/agentcore-control';
+import { detectRegion } from '../../../aws/region';
+import { getErrorMessage } from '../../../errors';
+import { handleRunEval } from '../../../operations/eval';
+import type { RunEvalResult } from '../../../operations/eval/run-eval';
+import type { EvalRunResult } from '../../../operations/eval/types';
+import { loadDeployedProjectConfig } from '../../../operations/resolve-agent';
+import { ErrorPrompt, GradientText, Panel, Screen } from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation } from '../../hooks';
+import { STATUS_COLORS } from '../../theme';
+import type { EvaluatorItem } from '../online-eval/types';
+import { RunEvalScreen } from './RunEvalScreen';
+import type { AgentItem, RunEvalConfig, RunEvalFlowData } from './types';
+import { Box, Text } from 'ink';
+import React, { useCallback, useEffect, useState } from 'react';
+
+type FlowState =
+  | { name: 'loading' } // validating credentials and fetching agents/evaluators
+  | { name: 'wizard'; data: RunEvalFlowData }
+  | { name: 'running'; config: RunEvalConfig } // handleRunEval is in flight
+  | { name: 'results'; result: RunEvalResult; run: EvalRunResult }
+  | { name: 'creds-error'; message: string } // validateAwsCredentials threw
+  | { name: 'error'; message: string };
+
+interface RunEvalFlowProps {
+  onExit: () => void;
+  onViewRuns?: () => void; // forwarded to the results view
+}
+
+/** Color for an aggregate score: green from 0.8, yellow from 0.5, red otherwise. */
+function scoreColor(score: number): string {
+  const belowGreen = score >= 0.5 ? 'yellow' : 'red';
+  return score >= 0.8 ? 'green' : belowGreen;
+}
+
+function shortEvalName(name: string): string {
+  return name.startsWith('Builtin.') ? name.slice('Builtin.'.length) : name; // display name without the prefix
+}
+/** On-demand evaluation flow: load agents/evaluators, run the wizard, execute the eval, show the results. */
+export function RunEvalFlow({ onExit, onViewRuns }: RunEvalFlowProps) {
+  const [flow, setFlow] = useState<FlowState>({ name: 'loading' });
+
+  useEffect(() => {
+    if (flow.name !== 'loading') return; // only (re)load when the flow enters 'loading'
+    let cancelled = false; // set by the cleanup so a stale load does not call setFlow
+
+    void (async () => {
+      try {
+        await validateAwsCredentials();
+      } catch (err) {
+        if (!cancelled) setFlow({ name: 'creds-error', message: getErrorMessage(err) });
+        return;
+      }
+
+      try {
+        const { region } = await detectRegion();
+        const [evalResult, context] = await Promise.all([listEvaluators({ region }), loadDeployedProjectConfig()]);
+
+        if (cancelled) return;
+
+        const evaluators: EvaluatorItem[] = evalResult.evaluators.map(e => ({
+          arn: e.evaluatorArn,
+          name: e.evaluatorName,
+          type: e.evaluatorType,
+          description: e.description,
+        }));
+
+        // Cross-reference project agents with deployed state to only show deployed agents
+        const deployedAgentNames = new Set<string>();
+        for (const target of Object.values(context.deployedState.targets)) {
+          const agentStates = target.resources?.agents;
+          if (agentStates) {
+            for (const name of Object.keys(agentStates)) {
+              deployedAgentNames.add(name);
+            }
+          }
+        }
+
+        const agents: AgentItem[] = context.project.agents
+          .filter(a => deployedAgentNames.has(a.name))
+          .map(a => ({
+            name: a.name,
+            build: a.build,
+          }));
+
+        if (agents.length === 0) {
+          if (!cancelled) {
+            setFlow({
+              name: 'error',
+              message:
+                context.project.agents.length === 0
+                  ? 'No agents found in project. Run `agentcore add agent` first.'
+                  : 'No deployed agents found. Run `agentcore deploy` first.',
+            });
+          }
+          return;
+        }
+
+        if (evaluators.length === 0) {
+          if (!cancelled) {
+            setFlow({
+              name: 'error',
+              message: 'No evaluators found in your account. Create an evaluator first.',
+            });
+          }
+          return;
+        }
+
+        setFlow({ name: 'wizard', data: { agents, evaluators } });
+      } catch (err) {
+        if (!cancelled) setFlow({ name: 'error', message: getErrorMessage(err) });
+      }
+    })();
+
+    return () => {
+      cancelled = true;
+    };
+  }, [flow.name]);
+
+  const handleRunComplete = useCallback((config: RunEvalConfig) => {
+    setFlow({ name: 'running', config });
+  }, []);
+
+  // Execute the eval when we enter 'running' state
+  useEffect(() => {
+    if (flow.name !== 'running') return;
+    let cancelled = false;
+
+    const { config } = flow;
+
+    void (async () => {
+      try {
+        const result = await handleRunEval({
+          agent: config.agent,
+          evaluator: [], // the wizard's selections are passed by ARN via evaluatorArn
+          evaluatorArn: config.evaluators,
+          days: config.days,
+        });
+
+        if (cancelled) return;
+
+        if (!result.success || !result.run) {
+          setFlow({ name: 'error', message: result.error ?? 'Evaluation failed' });
+          return;
+        }
+
+        setFlow({ name: 'results', result, run: result.run });
+      } catch (err) {
+        if (!cancelled) setFlow({ name: 'error', message: getErrorMessage(err) });
+      }
+    })();
+
+    return () => {
+      cancelled = true;
+    };
+  }, [flow.name]); // eslint-disable-line react-hooks/exhaustive-deps
+
+  if (flow.name === 'loading') {
+    return (
+      <Screen title="Run On-demand Evaluation" onExit={onExit}>
+        <GradientText text="Loading agents and evaluators..." />
+      </Screen>
+    );
+  }
+
+  if (flow.name === 'creds-error') {
+    return <ErrorPrompt message="AWS credentials required" detail={flow.message} onBack={onExit} onExit={onExit} />;
+  }
+
+  if (flow.name === 'wizard') {
+    return (
+      <RunEvalScreen
+        agents={flow.data.agents}
+        evaluatorItems={flow.data.evaluators}
+        onComplete={handleRunComplete}
+        onExit={onExit}
+      />
+    );
+  }
+
+  if (flow.name === 'running') {
+    return (
+      <Screen title="Run On-demand Evaluation" onExit={onExit}>
+        <GradientText text="Running evaluation... this may take a few minutes" />
+      </Screen>
+    );
+  }
+
+  if (flow.name === 'results') {
+    return (
+      <ResultsView
+        run={flow.run}
+        filePath={flow.result.filePath}
+        onRunAnother={() => setFlow({ name: 'loading' })}
+        onViewRuns={onViewRuns}
+        onExit={onExit}
+      />
+    );
+  }
+
+  return (
+    <ErrorPrompt
+      message="Evaluation failed"
+      detail={flow.message}
+      onBack={() => setFlow({ name: 'loading' })}
+      onExit={onExit}
+    />
+  );
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Results view
+// ─────────────────────────────────────────────────────────────────────────────
+
+interface ResultsViewProps {
+  run: EvalRunResult;
+  filePath?: string; // shown as "Results saved to: ..." when present
+  onRunAnother: () => void;
+  onViewRuns?: () => void; // when omitted, the "View eval runs" action is not offered
+  onExit: () => void;
+}
+/** Results screen: per-evaluator aggregate scores plus a short menu (run another, optionally view runs, back). */
+function ResultsView({ run, filePath, onRunAnother, onViewRuns, onExit }: ResultsViewProps) {
+  const actions = [
+    { id: 'another', title: 'Run another evaluation' },
+    ...(onViewRuns ? [{ id: 'view-runs', title: 'View eval runs' }] : []), // only offered when a handler was supplied
+    { id: 'back', title: 'Back' },
+  ];
+
+  const nav = useListNavigation({
+    items: actions,
+    onSelect: item => {
+      if (item.id === 'another') onRunAnother();
+      else if (item.id === 'view-runs') onViewRuns?.();
+      else onExit();
+    },
+    onExit,
+    isActive: true,
+  });
+
+  return (
+    <Screen title="Evaluation Complete" onExit={onExit} helpText={HELP_TEXT.NAVIGATE_SELECT} exitEnabled={false}>
+      <Panel fullWidth>
+        <Box flexDirection="column">
+          <Text color="green">✓ Evaluation complete</Text>
+          <Text>
+            <Text bold>Agent:</Text> {run.agent}
+            {'  '}
+            <Text bold>Sessions:</Text> {run.sessionCount}
+            {'  '}
+            <Text bold>Lookback:</Text> {run.lookbackDays}d
+          </Text>
+
+          <Box marginTop={1} flexDirection="column">
+            {run.results.map((r, i) => {
+              const errCount = r.sessionScores.filter(s => s.errorMessage).length; // sessions carrying an errorMessage
+              return (
+                <Text key={i}>
+                  {'  '}
+                  <Text bold>{shortEvalName(r.evaluator)}</Text>
+                  {'  '}
+                  <Text color={scoreColor(r.aggregateScore)}>{r.aggregateScore.toFixed(2)}</Text>
+                  {errCount > 0 && <Text color={STATUS_COLORS.error}> ({errCount} errors)</Text>}
+                </Text>
+              );
+            })}
+          </Box>
+
+          {filePath && (
+            <Box marginTop={1}>
+              <Text dimColor>Results saved to: {filePath}</Text>
+            </Box>
+          )}
+
+          <Box marginTop={1} flexDirection="column">
+            {actions.map((action, idx) => {
+              const selected = idx === nav.selectedIndex;
+              return (
+                <Text key={action.id}>
+                  <Text color={selected ? 'cyan' : undefined}>{selected ? '❯' : ' '} </Text>
+                  <Text color={selected ? 'cyan' : undefined} bold={selected}>
+                    {action.title}
+                  </Text>
+                </Text>
+              );
+            })}
+          </Box>
+        </Box>
+      </Panel>
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/run-eval/RunEvalScreen.tsx b/src/cli/tui/screens/run-eval/RunEvalScreen.tsx
new file mode 100644
index 00000000..fde8cb55
--- /dev/null
+++ b/src/cli/tui/screens/run-eval/RunEvalScreen.tsx
@@ -0,0 +1,179 @@
+import type { SelectableItem } from '../../components';
+import {
+  ConfirmReview,
+  Panel,
+  Screen,
+  StepIndicator,
+  TextInput,
+  WizardMultiSelect,
+  WizardSelect,
+} from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation, useMultiSelectNavigation } from '../../hooks';
+import type { EvaluatorItem } from '../online-eval/types';
+import type { AgentItem, RunEvalConfig } from './types';
+import { DEFAULT_LOOKBACK_DAYS, RUN_EVAL_STEP_LABELS } from './types';
+import { useRunEvalWizard } from './useRunEvalWizard';
+import React, { useEffect, useMemo } from 'react';
+
+/** Smallest lookback window, in days, accepted by the wizard. */
+const MIN_LOOKBACK_DAYS = 1;
+/** Largest lookback window, in days, accepted by the wizard. */
+const MAX_LOOKBACK_DAYS = 90;
+
+/**
+ * Strictly parses the lookback value typed by the user.
+ *
+ * `parseInt` stops at the first non-digit character, so it reads "7abc" as 7 and
+ * "3.5" as 3. Only an optionally signed run of digits is accepted here.
+ *
+ * @param value - Raw text from the input field.
+ * @returns The parsed integer, or `null` when the text is not a whole number.
+ */
+function parseLookbackDays(value: string): number | null {
+  const trimmed = value.trim();
+  return /^[+-]?\d+$/.test(trimmed) ? Number(trimmed) : null;
+}
+
+/** Props for {@link RunEvalScreen}. */
+interface RunEvalScreenProps {
+  /** Agents offered in the agent selection step. */
+  agents: AgentItem[];
+  /** Evaluators offered in the evaluator selection step. */
+  evaluatorItems: EvaluatorItem[];
+  /** Called with the collected configuration when the confirm step is accepted. */
+  onComplete: (config: RunEvalConfig) => void;
+  /** Exit handler passed to the screen and to the first step's navigation. */
+  onExit: () => void;
+}
+
+/**
+ * Wizard screen that collects an agent, a set of evaluators and a lookback window,
+ * then hands the resulting configuration to `onComplete`.
+ */
+export function RunEvalScreen({ agents, evaluatorItems: rawEvaluatorItems, onComplete, onExit }: RunEvalScreenProps) {
+  const wizard = useRunEvalWizard(agents.length);
+
+  // Auto-select the agent when there is exactly one. This runs in an effect so that
+  // wizard state is not updated while the component is rendering.
+  const singleAgent = agents.length === 1 ? agents[0]!.name : null;
+  const selectedAgent = wizard.config.agent;
+  const { setAgent } = wizard;
+  useEffect(() => {
+    if (singleAgent && !selectedAgent) setAgent(singleAgent);
+  }, [singleAgent, selectedAgent, setAgent]);
+
+  const agentItems: SelectableItem[] = useMemo(
+    () => agents.map(a => ({ id: a.name, title: a.name, description: a.build })),
+    [agents]
+  );
+
+  // Each evaluator's ARN serves as its selectable id.
+  const evaluatorItems: SelectableItem[] = useMemo(
+    () =>
+      rawEvaluatorItems.map(e => ({
+        id: e.arn,
+        title: e.name,
+        description: e.type === 'Builtin' ? 'Built-in evaluator' : (e.description ?? 'Custom evaluator'),
+      })),
+    [rawEvaluatorItems]
+  );
+
+  const isAgentStep = wizard.step === 'agent';
+  const isEvaluatorsStep = wizard.step === 'evaluators';
+  const isDaysStep = wizard.step === 'days';
+  const isConfirmStep = wizard.step === 'confirm';
+
+  const agentNav = useListNavigation({
+    items: agentItems,
+    onSelect: item => wizard.setAgent(item.id),
+    onExit,
+    isActive: isAgentStep,
+  });
+
+  const evaluatorsNav = useMultiSelectNavigation({
+    items: evaluatorItems,
+    getId: item => item.id,
+    onConfirm: ids => wizard.setEvaluators(ids),
+    // With at most one agent there is no agent step to return to, so leave the wizard.
+    onExit: () => (agents.length <= 1 ? onExit() : wizard.goBack()),
+    isActive: isEvaluatorsStep,
+    requireSelection: true,
+  });
+
+  // Navigation for the confirm step; the hook's return value is not used.
+  useListNavigation({
+    items: [{ id: 'confirm', title: 'Confirm' }],
+    onSelect: () => onComplete(wizard.config),
+    onExit: () => wizard.goBack(),
+    isActive: isConfirmStep,
+  });
+
+  const helpText = isAgentStep
+    ? HELP_TEXT.NAVIGATE_SELECT
+    : isEvaluatorsStep
+      ? 'Space toggle · Enter confirm · Esc back'
+      : isConfirmStep
+        ? HELP_TEXT.CONFIRM_CANCEL
+        : HELP_TEXT.TEXT_INPUT;
+
+  const headerContent = <StepIndicator steps={wizard.steps} currentStep={wizard.step} labels={RUN_EVAL_STEP_LABELS} />;
+
+  return (
+    <Screen title="Run On-demand Evaluation" onExit={onExit} helpText={helpText} headerContent={headerContent}>
+      <Panel>
+        {isAgentStep && (
+          <WizardSelect
+            title="Select agent to evaluate"
+            description="Choose a project agent"
+            items={agentItems}
+            selectedIndex={agentNav.selectedIndex}
+          />
+        )}
+
+        {isEvaluatorsStep && (
+          <WizardMultiSelect
+            title="Select evaluators"
+            description="Choose evaluators to run against agent traces"
+            items={evaluatorItems}
+            cursorIndex={evaluatorsNav.cursorIndex}
+            selectedIds={evaluatorsNav.selectedIds}
+          />
+        )}
+
+        {isDaysStep && (
+          <TextInput
+            key="days"
+            prompt="Lookback window (days)"
+            initialValue={String(DEFAULT_LOOKBACK_DAYS)}
+            onSubmit={value => {
+              // Re-check here so an invalid value can never reach the wizard state.
+              const days = parseLookbackDays(value);
+              if (days === null || days < MIN_LOOKBACK_DAYS || days > MAX_LOOKBACK_DAYS) return;
+              wizard.setDays(days);
+            }}
+            onCancel={() => wizard.goBack()}
+            customValidation={value => {
+              const days = parseLookbackDays(value);
+              if (days === null) return 'Must be a whole number';
+              if (days < MIN_LOOKBACK_DAYS || days > MAX_LOOKBACK_DAYS) {
+                return `Must be between ${MIN_LOOKBACK_DAYS} and ${MAX_LOOKBACK_DAYS}`;
+              }
+              return true;
+            }}
+          />
+        )}
+
+        {isConfirmStep && (
+          <ConfirmReview
+            fields={[
+              { label: 'Agent', value: wizard.config.agent },
+              { label: 'Evaluators', value: wizard.config.evaluators.join(', ') },
+              { label: 'Lookback', value: `${wizard.config.days} day${wizard.config.days !== 1 ? 's' : ''}` },
+            ]}
+          />
+        )}
+      </Panel>
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/run-eval/RunScreen.tsx b/src/cli/tui/screens/run-eval/RunScreen.tsx
new file mode 100644
index 00000000..63637bdb
--- /dev/null
+++ b/src/cli/tui/screens/run-eval/RunScreen.tsx
@@ -0,0 +1,42 @@
+import { Screen, WizardSelect } from '../../components';
+import type { SelectableItem } from '../../components';
+import { HELP_TEXT } from '../../constants';
+import { useListNavigation } from '../../hooks';
+import React, { useMemo } from 'react';
+
+/** Props for {@link RunScreen}. */
+interface RunScreenProps {
+  /** Invoked when the listed operation is selected. */
+  onRunEval: () => void;
+  /** Exit handler forwarded to the screen and to list navigation. */
+  onExit: () => void;
+}
+
+/** The only operation currently listed on the Run screen. */
+const ON_DEMAND_EVALUATION: SelectableItem = {
+  id: 'run-eval',
+  title: 'On-demand Evaluation',
+  description: 'Evaluate agent traces with selected evaluators',
+};
+
+/**
+ * Menu screen listing the operations that can be run; selecting the single
+ * entry calls `onRunEval`.
+ */
+export function RunScreen(props: RunScreenProps) {
+  const { onRunEval, onExit } = props;
+  const operations = useMemo<SelectableItem[]>(() => [ON_DEMAND_EVALUATION], []);
+
+  const { selectedIndex } = useListNavigation({
+    items: operations,
+    isActive: true,
+    onExit,
+    onSelect: () => onRunEval(),
+  });
+
+  return (
+    <Screen title="Run" onExit={onExit} helpText={HELP_TEXT.NAVIGATE_SELECT} exitEnabled={false}>
+      <WizardSelect title="Choose an operation" items={operations} selectedIndex={selectedIndex} />
+    </Screen>
+  );
+}
diff --git a/src/cli/tui/screens/run-eval/index.ts b/src/cli/tui/screens/run-eval/index.ts
new file mode 100644
index 00000000..d76e0e08
--- /dev/null
+++ b/src/cli/tui/screens/run-eval/index.ts
@@ -0,0 +1,3 @@
+export { RunEvalFlow } from './RunEvalFlow';
+export { RunEvalScreen } from './RunEvalScreen';
+export { RunScreen } from './RunScreen';
diff --git a/src/cli/tui/screens/run-eval/types.ts b/src/cli/tui/screens/run-eval/types.ts
new file mode 100644
index 00000000..24c134b6
--- /dev/null
+++ b/src/cli/tui/screens/run-eval/types.ts
@@ -0,0 +1,38 @@
+import type { EvaluatorItem } from '../online-eval/types';
+
+/** Identifiers of the run-eval wizard steps. */
+export type RunEvalStep = 'agent' | 'evaluators' | 'days' | 'confirm';
+
+/** Values collected by the run-eval wizard. */
+export interface RunEvalConfig {
+  /** Name of the selected agent; empty until one is chosen. */
+  agent: string;
+  /** Identifiers of the selected evaluators. */
+  evaluators: string[];
+  /** Lookback window in days. */
+  days: number;
+}
+
+/** Display label for each wizard step. */
+export const RUN_EVAL_STEP_LABELS: Record<RunEvalStep, string> = {
+  agent: 'Agent',
+  evaluators: 'Evaluators',
+  days: 'Lookback',
+  confirm: 'Confirm',
+};
+
+/** Lookback window, in days, that the wizard starts with. */
+export const DEFAULT_LOOKBACK_DAYS = 7;
+
+/** An agent entry offered for selection. */
+export interface AgentItem {
+  name: string;
+  /** NOTE(review): shown as the item description in the agent list; presumably the agent's build type — confirm. */
+  build: string;
+}
+
+/** Agent and evaluator lists bundled together (presumably the input of the run-eval flow — confirm against RunEvalFlow). */
+export interface RunEvalFlowData {
+  agents: AgentItem[];
+  evaluators: EvaluatorItem[];
+}
diff --git a/src/cli/tui/screens/run-eval/useRunEvalWizard.ts b/src/cli/tui/screens/run-eval/useRunEvalWizard.ts
new file mode 100644
index 00000000..f842cc0c
--- /dev/null
+++ b/src/cli/tui/screens/run-eval/useRunEvalWizard.ts
@@ -0,0 +1,100 @@
+import type { RunEvalConfig, RunEvalStep } from './types';
+import { DEFAULT_LOOKBACK_DAYS } from './types';
+import { useCallback, useMemo, useState } from 'react';
+
+/**
+ * Returns the ordered wizard steps for the given number of agents.
+ * The agent step is omitted when there is at most one agent.
+ */
+function getAllSteps(agentCount: number): RunEvalStep[] {
+  if (agentCount <= 1) {
+    return ['evaluators', 'days', 'confirm'];
+  }
+  return ['agent', 'evaluators', 'days', 'confirm'];
+}
+
+/** Builds a fresh configuration: no agent, no evaluators, default lookback. */
+function getDefaultConfig(): RunEvalConfig {
+  return {
+    agent: '',
+    evaluators: [],
+    days: DEFAULT_LOOKBACK_DAYS,
+  };
+}
+
+/**
+ * Holds the run-eval wizard's state: the collected config and the current step.
+ *
+ * @param agentCount - Number of agents available; decides whether the agent step exists.
+ * @returns The current config and step, the step list, and callbacks that store a
+ *   value and advance (`setAgent`, `setEvaluators`, `setDays`), step back (`goBack`)
+ *   or start over (`reset`).
+ */
+export function useRunEvalWizard(agentCount: number) {
+  // Memoized so the callbacks below keep a stable identity between renders; a fresh
+  // array on every render would invalidate every `useCallback` that depends on it.
+  const allSteps = useMemo(() => getAllSteps(agentCount), [agentCount]);
+  const [config, setConfig] = useState<RunEvalConfig>(getDefaultConfig);
+  const [step, setStep] = useState<RunEvalStep>(allSteps[0]!);
+
+  const currentIndex = allSteps.indexOf(step);
+
+  const goBack = useCallback(() => {
+    // Functional update: no dependency on the current step, and a no-op on the first step.
+    setStep(current => allSteps[allSteps.indexOf(current) - 1] ?? current);
+  }, [allSteps, setStep]);
+
+  // Step following `currentStep`. A step that is not in the list has index -1, so the
+  // result is the first step; `setAgent` relies on this when the agent step is omitted.
+  const nextStep = useCallback(
+    (currentStep: RunEvalStep): RunEvalStep | undefined => {
+      const idx = allSteps.indexOf(currentStep);
+      return allSteps[idx + 1];
+    },
+    [allSteps]
+  );
+
+  const setAgent = useCallback(
+    (agent: string) => {
+      setConfig(c => ({ ...c, agent }));
+      const next = nextStep('agent');
+      if (next) setStep(next);
+    },
+    [nextStep, setConfig, setStep]
+  );
+
+  const setEvaluators = useCallback(
+    (evaluators: string[]) => {
+      setConfig(c => ({ ...c, evaluators }));
+      const next = nextStep('evaluators');
+      if (next) setStep(next);
+    },
+    [nextStep, setConfig, setStep]
+  );
+
+  const setDays = useCallback(
+    (days: number) => {
+      setConfig(c => ({ ...c, days }));
+      const next = nextStep('days');
+      if (next) setStep(next);
+    },
+    [nextStep, setConfig, setStep]
+  );
+
+  const reset = useCallback(() => {
+    setConfig(getDefaultConfig());
+    setStep(allSteps[0]!);
+  }, [allSteps, setConfig, setStep]);
+
+  return {
+    config,
+    step,
+    steps: allSteps,
+    currentIndex,
+    goBack,
+    setAgent,
+    setEvaluators,
+    setDays,
+    reset,
+  };
+}
diff --git a/src/cli/tui/utils/commands.ts b/src/cli/tui/utils/commands.ts
index e1bd4980..7e6a2784 100644
--- a/src/cli/tui/utils/commands.ts
+++ b/src/cli/tui/utils/commands.ts
@@ -11,7 +11,7 @@ export interface CommandMeta {
 /**
  * Commands hidden from TUI help but still available via CLI.
  */
-const HIDDEN_FROM_TUI = ['help', 'update', 'package', 'logs', 'traces', 'run', 'pause', 'resume'] as const;
+const HIDDEN_FROM_TUI = ['help', 'update', 'package', 'logs', 'traces', 'pause', 'resume', 'stop'] as const;
 
 /**
  * Commands hidden from TUI when inside an existing project.
diff --git a/src/schema/schemas/agentcore-project.ts b/src/schema/schemas/agentcore-project.ts
index ad515acb..de6f137a 100644
--- a/src/schema/schemas/agentcore-project.ts
+++ b/src/schema/schemas/agentcore-project.ts
@@ -21,7 +21,7 @@ export type { MemoryStrategy, MemoryStrategyType } from './primitives/memory';
 export type { OnlineEvalConfig } from './primitives/online-eval-config';
 export { OnlineEvalConfigSchema, OnlineEvalConfigNameSchema } from './primitives/online-eval-config';
 export type { EvaluationLevel, EvaluatorConfig, LlmAsAJudgeConfig, RatingScale } from './primitives/evaluator';
-export { BedrockModelIdSchema, EvaluatorNameSchema } from './primitives/evaluator';
+export { BedrockModelIdSchema, isValidBedrockModelId, EvaluatorNameSchema } from './primitives/evaluator';
 
 // ============================================================================
 // Project Name Schema
@@ -140,6 +140,7 @@ export type Evaluator = z.infer<typeof EvaluatorSchema>;
 // ============================================================================
 
 const BUILTIN_EVALUATOR_PREFIX = 'Builtin.';
+const ARN_PREFIX = 'arn:';
 
 export const AgentCoreProjectSpecSchema = z
   .object({
@@ -197,22 +198,23 @@ export const AgentCoreProjectSpecSchema = z
       ),
   })
   .superRefine((spec, ctx) => {
-    // Cross-field validation: onlineEvalConfigs reference valid agents and evaluators
     const agentNames = new Set(spec.agents.map(a => a.name));
     const evaluatorNames = new Set(spec.evaluators.map(e => e.name));
 
     for (const config of spec.onlineEvalConfigs) {
-      for (const agentName of config.agents) {
-        if (!agentNames.has(agentName)) {
-          ctx.addIssue({
-            code: z.ZodIssueCode.custom,
-            message: `Online eval config "${config.name}" references unknown agent "${agentName}"`,
-          });
-        }
+      // Validate agent reference
+      if (!agentNames.has(config.agent)) {
+        ctx.addIssue({
+          code: z.ZodIssueCode.custom,
+          message: `Online eval config "${config.name}" references unknown agent "${config.agent}"`,
+        });
       }
 
+      // Validate evaluator references
       for (const evalName of config.evaluators) {
-        if (!evalName.startsWith(BUILTIN_EVALUATOR_PREFIX) && !evaluatorNames.has(evalName)) {
+        // Skip built-in evaluators and ARN references (externally managed)
+        if (evalName.startsWith(BUILTIN_EVALUATOR_PREFIX) || evalName.startsWith(ARN_PREFIX)) continue;
+        if (!evaluatorNames.has(evalName)) {
           ctx.addIssue({
             code: z.ZodIssueCode.custom,
             message: `Online eval config "${config.name}" references unknown evaluator "${evalName}"`,
diff --git a/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts b/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
index c141b463..1234bd4b 100644
--- a/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
+++ b/src/schema/schemas/primitives/__tests__/online-eval-config.test.ts
@@ -29,7 +29,7 @@ describe('OnlineEvalConfigSchema', () => {
   const validConfig = {
     type: 'OnlineEvaluationConfig' as const,
     name: 'TestConfig',
-    agents: ['agent1'],
+    agent: 'MyAgent',
     evaluators: ['Builtin.GoalSuccessRate'],
     samplingRate: 10,
   };
@@ -38,18 +38,21 @@ describe('OnlineEvalConfigSchema', () => {
     expect(OnlineEvalConfigSchema.safeParse(validConfig).success).toBe(true);
   });
 
-  it('accepts multiple agents and evaluators', () => {
-    const config = { ...validConfig, agents: ['a1', 'a2'], evaluators: ['Builtin.X', 'CustomEval'] };
+  it('accepts multiple evaluators', () => {
+    const config = { ...validConfig, evaluators: ['Builtin.X', 'CustomEval'] };
     expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
   });
 
-  it('rejects wrong type literal', () => {
-    const config = { ...validConfig, type: 'WrongType' };
-    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
+  it('accepts evaluator ARNs', () => {
+    const config = {
+      ...validConfig,
+      evaluators: ['arn:aws:bedrock:us-east-1:123456:evaluator/MyEval-abc'],
+    };
+    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
   });
 
-  it('rejects empty agents array', () => {
-    const config = { ...validConfig, agents: [] };
+  it('rejects wrong type literal', () => {
+    const config = { ...validConfig, type: 'WrongType' };
     expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
   });
 
@@ -78,11 +81,6 @@ describe('OnlineEvalConfigSchema', () => {
     expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(true);
   });
 
-  it('rejects empty string in agents array', () => {
-    const config = { ...validConfig, agents: [''] };
-    expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
-  });
-
   it('rejects empty string in evaluators array', () => {
     const config = { ...validConfig, evaluators: [''] };
     expect(OnlineEvalConfigSchema.safeParse(config).success).toBe(false);
diff --git a/src/schema/schemas/primitives/evaluator.ts b/src/schema/schemas/primitives/evaluator.ts
index c6e29f7e..ced23b53 100644
--- a/src/schema/schemas/primitives/evaluator.ts
+++ b/src/schema/schemas/primitives/evaluator.ts
@@ -55,13 +55,25 @@ export type RatingScale = z.infer<typeof RatingScaleSchema>;
 // LLM-as-a-Judge Config
 // ============================================================================
 
-export const BedrockModelIdSchema = z
-  .string()
-  .min(1, 'Model ID is required')
-  .regex(
-    /^(arn:aws(-[a-z]+)?:bedrock:[a-z0-9-]+:\d{12}:(inference-profile|foundation-model)\/[a-zA-Z0-9._:-]+|([a-z]{2}(-[a-z]+)?\.)?[a-z0-9]+\.[a-zA-Z0-9._-]+(:[0-9]+)?)$/,
-    'Must be a valid Bedrock model ID (e.g. us.anthropic.claude-sonnet-4-5-20250929-v1:0) or model ARN'
-  );
+// Dotted model identifier with an optional numeric `:N` suffix,
+// e.g. us.anthropic.claude-sonnet-4-5-20250929-v1:0.
+// eslint-disable-next-line security/detect-unsafe-regex -- anchored pattern, no backtracking risk
+const BEDROCK_MODEL_ID_PATTERN = /^[a-z][a-z0-9-]*\.[a-zA-Z0-9._-]+(:[0-9]+)?$/;
+// Bedrock ARN for an `inference-profile` or `foundation-model` resource (12-digit account ID).
+const BEDROCK_ARN_PATTERN = /^arn:aws[a-z-]*:bedrock:[a-z0-9-]+:\d{12}:(inference-profile|foundation-model)\/.+$/;
+
+/**
+ * Checks a string against the two accepted Bedrock model reference formats.
+ *
+ * @param value - Candidate model ID or model ARN.
+ * @returns `true` when `value` matches the model ID pattern or the ARN pattern.
+ */
+export function isValidBedrockModelId(value: string): boolean {
+  return [BEDROCK_MODEL_ID_PATTERN, BEDROCK_ARN_PATTERN].some(pattern => pattern.test(value));
+}
+
+/** Non-empty string; the format itself is not validated by this schema. */
+export const BedrockModelIdSchema = z.string().min(1, 'Model ID is required');
 
 export const LlmAsAJudgeConfigSchema = z.object({
   model: BedrockModelIdSchema,
diff --git a/src/schema/schemas/primitives/index.ts b/src/schema/schemas/primitives/index.ts
index f68fa3e4..1d0fb665 100644
--- a/src/schema/schemas/primitives/index.ts
+++ b/src/schema/schemas/primitives/index.ts
@@ -16,6 +16,7 @@ export type {
 } from './evaluator';
 export {
   BedrockModelIdSchema,
+  isValidBedrockModelId,
   EvaluationLevelSchema,
   EvaluatorConfigSchema,
   EvaluatorNameSchema,
diff --git a/src/schema/schemas/primitives/online-eval-config.ts b/src/schema/schemas/primitives/online-eval-config.ts
index 784c3c25..4c87b27c 100644
--- a/src/schema/schemas/primitives/online-eval-config.ts
+++ b/src/schema/schemas/primitives/online-eval-config.ts
@@ -16,9 +16,9 @@ export const OnlineEvalConfigNameSchema = z
 export const OnlineEvalConfigSchema = z.object({
   type: z.literal('OnlineEvaluationConfig'),
   name: OnlineEvalConfigNameSchema,
-  /** Agent names this online eval config monitors */
-  agents: z.array(z.string().min(1)).min(1, 'At least one agent is required'),
-  /** Evaluator names (custom) or Builtin.* IDs */
+  /** Agent name to monitor (must match a project agent) */
+  agent: z.string().min(1, 'Agent name is required'),
+  /** Evaluator names (custom), Builtin.* IDs, or evaluator ARNs */
   evaluators: z.array(z.string().min(1)).min(1, 'At least one evaluator is required'),
   /** Sampling rate as a percentage (0.01 to 100) */
   samplingRate: z.number().min(0.01).max(100),