From 40e8fc658cba33e7277bc56a992ae52d6ecb0e38 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 07:24:35 +0000 Subject: [PATCH 1/9] feat(eval): add definePromptTemplate SDK wrapper for executable prompt templates Add TypeScript/JavaScript support for custom evaluator prompts using the same subprocess pattern as code judges. Changes: - Add PromptTemplateInputSchema and definePromptTemplate to @agentv/eval - Update orchestrator to execute .ts/.js prompt files as subprocesses - Add config and resolvedPromptPath to LlmJudgeEvaluatorConfig - Skip validation for executable prompt templates in evaluator parser - Add unit tests for PromptTemplateInputSchema - Add integration tests for executable prompt templates - Add example in examples/features/prompt-template-sdk/ Co-Authored-By: Claude Opus 4.5 --- .../features/prompt-template-sdk/README.md | 58 ++++++ .../prompt-template-sdk/evals/dataset.yaml | 51 +++++ .../prompts/custom-evaluator.ts | 48 +++++ .../evaluation/loaders/evaluator-parser.ts | 30 ++- packages/core/src/evaluation/orchestrator.ts | 92 ++++++++- packages/core/src/evaluation/types.ts | 4 + .../core/test/evaluation/orchestrator.test.ts | 180 +++++++++++++++++ packages/eval/src/index.ts | 47 +++++ packages/eval/src/prompt-template.ts | 104 ++++++++++ packages/eval/src/schemas.ts | 20 ++ .../eval/test/define-prompt-template.test.ts | 182 ++++++++++++++++++ 11 files changed, 800 insertions(+), 16 deletions(-) create mode 100644 examples/features/prompt-template-sdk/README.md create mode 100644 examples/features/prompt-template-sdk/evals/dataset.yaml create mode 100644 examples/features/prompt-template-sdk/prompts/custom-evaluator.ts create mode 100644 packages/eval/src/prompt-template.ts create mode 100644 packages/eval/test/define-prompt-template.test.ts diff --git a/examples/features/prompt-template-sdk/README.md b/examples/features/prompt-template-sdk/README.md new file mode 100644 index 00000000..3b2347d4 --- /dev/null +++ b/examples/features/prompt-template-sdk/README.md @@ -0,0 +1,58 @@ +# Prompt Template SDK + +This example demonstrates using TypeScript files for custom LLM judge prompts using the `definePromptTemplate` helper from `@agentv/eval`. + +## Features + +- **Type-safe prompt generation**: Full TypeScript support with autocomplete for context fields +- **Conditional logic**: Use JavaScript/TypeScript conditionals for dynamic prompts +- **Config pass-through**: Access custom config from YAML in your prompt template +- **Same pattern as code judges**: Follows the familiar subprocess pattern + +## How It Works + +Instead of static text files with `{{variable}}` placeholders, you can use TypeScript files that export a prompt template: + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` + Question: ${ctx.question} + Answer: ${ctx.candidateAnswer} + + ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''} +`); +``` + +The template receives evaluation context via stdin (JSON) and outputs the prompt string to stdout. 
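+For illustration, the stdin payload is a snake_case JSON object along these lines. This is only a sketch with invented values; the field names follow the payload built in `orchestrator.ts`, and fields that are absent for a given eval case may be omitted or `null`:
+
+```json
+{
+  "question": "What are the main benefits of TypeScript over JavaScript?",
+  "candidate_answer": "TypeScript adds static type checking and better tooling.",
+  "reference_answer": "TypeScript provides static type checking, better IDE support, and improved maintainability.",
+  "expected_outcome": "The CLI provides a clear answer about TypeScript benefits.",
+  "input_messages": [
+    { "role": "user", "content": "What are the main benefits of TypeScript over JavaScript?" }
+  ],
+  "guideline_files": [],
+  "input_files": [],
+  "output_messages": null,
+  "trace_summary": null,
+  "config": { "rubric": "- Must mention static typing" }
+}
+```
+
+After validation, the same fields are exposed to the handler in camelCase (for example, `candidate_answer` becomes `ctx.candidateAnswer`).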
+ +## Available Context Fields + +- `question` - The eval case question +- `candidateAnswer` - The agent's response being evaluated +- `referenceAnswer` - Optional reference answer +- `expectedOutcome` - Optional expected outcome +- `expectedMessages` - Optional expected messages +- `outputMessages` - Optional output messages from agent +- `guidelineFiles` - Paths to guideline files +- `inputFiles` - Paths to input files +- `inputMessages` - Input messages to agent +- `traceSummary` - Optional trace summary with tool usage metrics +- `config` - Optional pass-through config from YAML + +## Running + +```bash +bun agentv eval examples/features/prompt-template-sdk/evals/dataset.yaml --dry-run +``` + +## File Structure + +``` +prompt-template-sdk/ + evals/ + dataset.yaml # Eval cases using TypeScript prompt + prompts/ + custom-evaluator.ts # TypeScript prompt template + README.md +``` diff --git a/examples/features/prompt-template-sdk/evals/dataset.yaml b/examples/features/prompt-template-sdk/evals/dataset.yaml new file mode 100644 index 00000000..a820cd48 --- /dev/null +++ b/examples/features/prompt-template-sdk/evals/dataset.yaml @@ -0,0 +1,51 @@ +# Prompt Template SDK Demo +# Demonstrates using TypeScript/JavaScript files for custom evaluator prompts. + +description: Demonstrates TypeScript prompt templates for custom LLM judge prompts + +# Uses the CLI target defined in .agentv/targets.yaml +execution: + target: local_cli + +evalcases: + - id: prompt-template-basic + expected_outcome: The CLI provides a clear answer about TypeScript benefits. + + input_messages: + - role: user + content: + - type: text + value: What are the main benefits of TypeScript over JavaScript? + + reference_answer: |- + TypeScript provides static type checking, better IDE support, and improved maintainability. + + execution: + evaluators: + - name: custom-prompt-eval + type: llm_judge + prompt: ../prompts/custom-evaluator.ts + + - id: prompt-template-with-config + expected_outcome: The CLI explains async/await correctly. + + input_messages: + - role: user + content: + - type: text + value: Explain async/await in JavaScript. + + reference_answer: |- + Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous. + + execution: + evaluators: + - name: strict-eval + type: llm_judge + prompt: ../prompts/custom-evaluator.ts + config: + rubric: |- + - Must mention Promises + - Must explain the synchronous-looking syntax + - Should provide an example or use case + strictMode: true diff --git a/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts new file mode 100644 index 00000000..f4cabfad --- /dev/null +++ b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts @@ -0,0 +1,48 @@ +#!/usr/bin/env bun +/** + * Custom Prompt Template Demo + * + * Uses the declarative definePromptTemplate helper to generate + * a custom evaluation prompt with full TypeScript support. + */ +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => { + // Access typed config from YAML + const rubric = ctx.config?.rubric as string | undefined; + const strictMode = ctx.config?.strictMode as boolean | undefined; + + // Build conditional sections + const referenceSection = ctx.referenceAnswer + ? `\n## Reference Answer\n${ctx.referenceAnswer}` + : ''; + + const rubricSection = rubric ? `\n## Evaluation Rubric\n${rubric}` : ''; + + const strictWarning = strictMode + ? 
'\n**Note:** Strict mode enabled - minor inaccuracies should result in lower scores.' + : ''; + + return `You are evaluating an AI assistant's response. + +## Question +${ctx.question} + +## Candidate Answer +${ctx.candidateAnswer} +${referenceSection} +${rubricSection} +${strictWarning} + +## Instructions +Evaluate the candidate answer based on: +1. Correctness - Does it accurately answer the question? +2. Completeness - Does it address all parts of the question? +3. Clarity - Is the response clear and well-structured? + +Respond with a JSON object containing: +- score: A number from 0 to 1 +- reasoning: Brief explanation of your evaluation +- hits: Array of positive aspects +- misses: Array of issues or missing elements`; +}); diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 06f4afa6..7b468b6c 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -528,13 +528,18 @@ export async function parseEvaluators( const resolved = await resolveFileReference(prompt, searchRoots); if (resolved.resolvedPath) { promptPath = path.resolve(resolved.resolvedPath); - // Validate custom prompt content upfront - throws error if validation fails - try { - await validateCustomPromptContent(promptPath); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - // Add context and re-throw for the caller to handle - throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); + // Skip validation for executable prompt templates (.ts/.js files) + // These are executed as subprocesses, not parsed as text templates + const ext = path.extname(promptPath).toLowerCase(); + if (ext !== '.ts' && ext !== '.js') { + // Validate custom prompt content upfront - throws error if validation fails + try { + await validateCustomPromptContent(promptPath); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + // Add context and re-throw for the caller to handle + throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); + } } } else { logWarning( @@ -577,13 +582,24 @@ export async function parseEvaluators( const weight = validateWeight(rawEvaluator.weight, name, evalId); + // Collect unrecognized properties as pass-through config (for executable prompt templates) + const knownProps = new Set(['name', 'type', 'prompt', 'model', 'rubrics', 'weight']); + const config: Record = {}; + for (const [key, value] of Object.entries(rawEvaluator)) { + if (!knownProps.has(key) && value !== undefined) { + config[key] = value as JsonValue; + } + } + evaluators.push({ name, type: 'llm_judge', prompt, promptPath, + ...(promptPath ? { resolvedPromptPath: promptPath } : {}), ...(parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}), ...(weight !== undefined ? { weight } : {}), + ...(Object.keys(config).length > 0 ? 
{ config } : {}), }); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 94f24ac6..ce8a73c6 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -3,6 +3,7 @@ import path from 'node:path'; import micromatch from 'micromatch'; import pLimit from 'p-limit'; +import { toSnakeCaseDeep } from './case-conversion.js'; import { type ChildEvaluatorResult, CodeEvaluator, @@ -15,6 +16,7 @@ import { LlmJudgeEvaluator, TokenUsageEvaluator, ToolTrajectoryEvaluator, + executeScript, isNonEmptyString, scoreToVerdict, } from './evaluators.js'; @@ -895,6 +897,8 @@ async function runEvaluatorList(options: { promptInputs, now, judgeProvider, + outputMessages, + traceSummary, }); const weight = evaluator.weight ?? 1.0; scored.push({ score, name: evaluator.name, type: evaluator.type, weight }); @@ -1251,6 +1255,8 @@ async function runLlmJudgeEvaluator(options: { readonly promptInputs: PromptInputs; readonly now: Date; readonly judgeProvider?: Provider; + readonly outputMessages?: readonly OutputMessage[]; + readonly traceSummary?: TraceSummary; }): Promise { const { config, @@ -1263,8 +1269,16 @@ async function runLlmJudgeEvaluator(options: { promptInputs, now, judgeProvider, + outputMessages, + traceSummary, } = options; - const customPrompt = await resolveCustomPrompt(config); + const customPrompt = await resolveCustomPrompt(config, { + evalCase, + candidate, + outputMessages, + traceSummary, + config: config.config, + }); return evaluatorRegistry.llm_judge.evaluate({ evalCase, @@ -1280,20 +1294,80 @@ async function runLlmJudgeEvaluator(options: { }); } -async function resolveCustomPrompt(config: { - readonly prompt?: string; - readonly promptPath?: string; -}): Promise { - if (config.promptPath) { +interface ResolveCustomPromptContext { + readonly evalCase: EvalCase; + readonly candidate: string; + readonly outputMessages?: readonly OutputMessage[]; + readonly traceSummary?: TraceSummary; + readonly config?: Record; +} + +async function resolveCustomPrompt( + promptConfig: { + readonly prompt?: string; + readonly promptPath?: string; + readonly resolvedPromptPath?: string; + readonly config?: Record; + }, + context?: ResolveCustomPromptContext, +): Promise { + const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath; + + if (promptPath) { + const ext = path.extname(promptPath).toLowerCase(); + + // Executable prompt template (same pattern as code judges) + if (ext === '.ts' || ext === '.js') { + if (!context) { + throw new Error('Context required for executable prompt templates (.ts/.js files)'); + } + return executePromptTemplate(promptPath, context, promptConfig.config); + } + + // Static text file (existing behavior) try { - const content = await readTextFile(config.promptPath); + const content = await readTextFile(promptPath); return content; } catch (error) { const message = error instanceof Error ? 
error.message : String(error); - console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`); + console.warn(`Could not read custom prompt at ${promptPath}: ${message}`); } } - return config.prompt; + return promptConfig.prompt; +} + +async function executePromptTemplate( + scriptPath: string, + context: ResolveCustomPromptContext, + config?: Record, +): Promise { + // Build payload matching code judge input format for consistency + const payload = { + question: context.evalCase.question, + expectedOutcome: context.evalCase.expected_outcome, + expectedMessages: context.evalCase.expected_messages, + referenceAnswer: context.evalCase.reference_answer, + candidateAnswer: context.candidate, + outputMessages: context.outputMessages ?? null, + guidelineFiles: context.evalCase.guideline_paths, + inputFiles: context.evalCase.file_paths.filter( + (p) => !context.evalCase.guideline_paths.includes(p), + ), + inputMessages: context.evalCase.input_messages, + traceSummary: context.traceSummary ?? null, + config: config ?? context.config ?? null, + }; + + const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + const cwd = path.dirname(scriptPath); + + try { + const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, undefined, cwd); + return stdout.trim(); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Prompt template execution failed: ${message}`); + } } function filterEvalCases(evalCases: readonly EvalCase[], filter?: string): readonly EvalCase[] { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index f41a63fb..eb1e458c 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -196,8 +196,12 @@ export type LlmJudgeEvaluatorConfig = { readonly type: 'llm_judge'; readonly prompt?: string; readonly promptPath?: string; + /** Resolved absolute path for prompt file (used by executable .ts/.js prompts) */ + readonly resolvedPromptPath?: string; readonly rubrics?: readonly RubricItem[]; readonly weight?: number; + /** Pass-through configuration for custom evaluator prompts */ + readonly config?: Record; }; /** diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 7d27093c..4db19fcf 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -957,4 +957,184 @@ describe('runEvalCase trace integration', () => { expect(result.score).toBe(0); }); }); + + describe('executable prompt templates', () => { + it('executes TypeScript prompt template and uses output as custom prompt', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-')); + const promptPath = path.join(tmpDir, 'my-prompt.ts'); + + // Write a simple TypeScript prompt template that reads stdin manually + // (avoiding dependency on @agentv/eval which won't resolve from temp dir) + writeFileSync( + promptPath, + `import { readFileSync } from 'fs'; +const stdin = readFileSync(0, 'utf8'); +const input = JSON.parse(stdin); +console.log(\`Question: \${input.question} +Candidate: \${input.candidate_answer} +Reference: \${input.reference_answer ?? 
'none'}\`); +`, + ); + + // Custom judge that captures the prompt it receives + let receivedQuestion = ''; + const captureJudge = { + kind: 'llm_judge' as const, + async evaluate(context: { evalCase: EvalCase; evaluatorTemplateOverride?: string }) { + // The evaluatorTemplateOverride should contain our custom prompt + receivedQuestion = context.evaluatorTemplateOverride ?? ''; + return { + score: 1.0, + verdict: 'pass' as const, + hits: ['Test passed'], + misses: [], + expectedAspectCount: 1, + }; + }, + }; + + const provider = new SequenceProvider('mock', { + responses: [ + { + outputMessages: [{ role: 'assistant', content: 'The answer is 4' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: { + ...baseTestCase, + question: 'What is 2+2?', + reference_answer: 'The sum is 4', + evaluators: [ + { + name: 'ts-prompt-eval', + type: 'llm_judge', + promptPath: promptPath, + resolvedPromptPath: promptPath, + }, + ], + }, + provider, + target: baseTarget, + evaluators: { llm_judge: captureJudge }, + }); + + expect(result.score).toBe(1.0); + expect(receivedQuestion).toContain('Question: What is 2+2?'); + expect(receivedQuestion).toContain('Candidate: The answer is 4'); + expect(receivedQuestion).toContain('Reference: The sum is 4'); + }); + + it('executes JavaScript prompt template', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-js-')); + const promptPath = path.join(tmpDir, 'my-prompt.js'); + + // Write a simple JS prompt template that reads stdin manually + writeFileSync( + promptPath, + `const fs = require('fs'); +const stdin = fs.readFileSync(0, 'utf8'); +const input = JSON.parse(stdin); +console.log('Question: ' + input.question + '\\nAnswer: ' + input.candidate_answer); +`, + ); + + let receivedPrompt = ''; + const captureJudge = { + kind: 'llm_judge' as const, + async evaluate(context: { evaluatorTemplateOverride?: string }) { + receivedPrompt = context.evaluatorTemplateOverride ?? ''; + return { + score: 1.0, + verdict: 'pass' as const, + hits: [], + misses: [], + expectedAspectCount: 1, + }; + }, + }; + + const provider = new SequenceProvider('mock', { + responses: [ + { + outputMessages: [{ role: 'assistant', content: 'Test response' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: { + ...baseTestCase, + question: 'Test question', + evaluators: [ + { + name: 'js-prompt-eval', + type: 'llm_judge', + promptPath: promptPath, + resolvedPromptPath: promptPath, + }, + ], + }, + provider, + target: baseTarget, + evaluators: { llm_judge: captureJudge }, + }); + + expect(result.score).toBe(1.0); + expect(receivedPrompt).toContain('Question: Test question'); + expect(receivedPrompt).toContain('Answer: Test response'); + }); + + it('falls back to text file reading for .txt files', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-txt-')); + const promptPath = path.join(tmpDir, 'my-prompt.txt'); + + // Write a static text prompt + writeFileSync(promptPath, 'Static prompt content from text file'); + + let receivedPrompt = ''; + const captureJudge = { + kind: 'llm_judge' as const, + async evaluate(context: { evaluatorTemplateOverride?: string }) { + receivedPrompt = context.evaluatorTemplateOverride ?? 
''; + return { + score: 1.0, + verdict: 'pass' as const, + hits: [], + misses: [], + expectedAspectCount: 1, + }; + }, + }; + + const provider = new SequenceProvider('mock', { + responses: [ + { + outputMessages: [{ role: 'assistant', content: 'Response' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: { + ...baseTestCase, + evaluators: [ + { + name: 'txt-prompt-eval', + type: 'llm_judge', + promptPath: promptPath, + resolvedPromptPath: promptPath, + }, + ], + }, + provider, + target: baseTarget, + evaluators: { llm_judge: captureJudge }, + }); + + expect(result.score).toBe(1.0); + expect(receivedPrompt).toBe('Static prompt content from text file'); + }); + }); }); diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 8eb53afc..ddb3161c 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -47,12 +47,14 @@ export { MessageSchema, ToolCallSchema, TokenUsageSchema, + PromptTemplateInputSchema, type CodeJudgeInput, type CodeJudgeResult, type TraceSummary, type Message, type ToolCall, type TokenUsage, + type PromptTemplateInput, } from './schemas.js'; // Re-export target client @@ -69,10 +71,12 @@ export { // Re-export Zod for typed config support export { z } from 'zod'; +import { type PromptTemplateHandler, runPromptTemplate } from './prompt-template.js'; // Import runtime import { type CodeJudgeHandler, runCodeJudge } from './runtime.js'; export type { CodeJudgeHandler }; +export type { PromptTemplateHandler }; /** * Define a code judge evaluator with automatic stdin/stdout handling. @@ -122,3 +126,46 @@ export function defineCodeJudge(handler: CodeJudgeHandler): void { // Run immediately when module is loaded runCodeJudge(handler); } + +/** + * Define a prompt template with automatic stdin/stdout handling. + * + * This function: + * 1. Reads JSON from stdin (snake_case format) + * 2. Converts to camelCase and validates with Zod + * 3. Calls your handler with typed input + * 4. Outputs the generated prompt string to stdout + * 5. Handles errors gracefully with proper exit codes + * + * @param handler - Function that generates the prompt string from input + * + * @example + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => ` + * Question: ${ctx.question} + * Answer: ${ctx.candidateAnswer} + * + * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''} + * `); + * ``` + * + * @example With conditional logic + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => { + * const rubric = ctx.config?.rubric as string | undefined; + * return ` + * Question: ${ctx.question} + * Candidate Answer: ${ctx.candidateAnswer} + * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} + * `; + * }); + * ``` + */ +export function definePromptTemplate(handler: PromptTemplateHandler): void { + // Run immediately when module is loaded + runPromptTemplate(handler); +} diff --git a/packages/eval/src/prompt-template.ts b/packages/eval/src/prompt-template.ts new file mode 100644 index 00000000..c96b1fdd --- /dev/null +++ b/packages/eval/src/prompt-template.ts @@ -0,0 +1,104 @@ +/** + * Runtime for prompt template evaluators. + * Handles stdin parsing, validation, error handling, and string output. 
+ */ +import { readFileSync } from 'node:fs'; + +import { toCamelCaseDeep } from './case-conversion.js'; +import { type PromptTemplateInput, PromptTemplateInputSchema } from './schemas.js'; + +/** + * Handler function type for prompt templates. + * Returns the prompt string to use for evaluation. + */ +export type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise; + +/** + * Read stdin synchronously (works in both Node.js and Bun). + */ +function readStdin(): string { + return readFileSync(0, 'utf8'); +} + +/** + * Run a prompt template handler with full stdin/stdout handling. + * This is the internal implementation called by definePromptTemplate. + */ +export async function runPromptTemplate(handler: PromptTemplateHandler): Promise { + try { + // 1. Read stdin + const stdin = readStdin(); + + // 2. Parse JSON + const rawInput = JSON.parse(stdin) as Record; + + // 3. Convert snake_case to camelCase + const camelInput = toCamelCaseDeep(rawInput); + + // 4. Validate input with Zod + const input = PromptTemplateInputSchema.parse(camelInput); + + // 5. Run handler + const prompt = await handler(input); + + // 6. Output raw string (not JSON) - the prompt itself + console.log(prompt); + } catch (error) { + // Output error to stderr and exit with non-zero code + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + } +} + +/** + * Define a prompt template with automatic stdin/stdout handling. + * + * This function: + * 1. Reads JSON from stdin (snake_case format) + * 2. Converts to camelCase and validates with Zod + * 3. Calls your handler with typed input + * 4. Outputs the generated prompt string to stdout + * 5. Handles errors gracefully with proper exit codes + * + * @param handler - Function that generates the prompt string from input + * + * @example + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => ` + * Question: ${ctx.question} + * Answer: ${ctx.candidateAnswer} + * + * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''} + * `); + * ``` + * + * @example With conditional logic + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => { + * const rubric = ctx.config?.rubric as string | undefined; + * return ` + * Question: ${ctx.question} + * Candidate Answer: ${ctx.candidateAnswer} + * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} + * `; + * }); + * ``` + * + * @example Async handler + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate(async (ctx) => { + * // Async operations are supported + * return `Question: ${ctx.question}\nAnswer: ${ctx.candidateAnswer}`; + * }); + * ``` + */ +export function definePromptTemplate(handler: PromptTemplateHandler): void { + // Run immediately when module is loaded + runPromptTemplate(handler); +} diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index f9afa6b3..af6bde4f 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -88,3 +88,23 @@ export type TraceSummary = z.infer; export type Message = z.infer; export type ToolCall = z.infer; export type TokenUsage = z.infer; + +/** + * Prompt template input schema (camelCase, converted from snake_case wire format). + * Uses the same fields as CodeJudgeInput for consistency. 
+ */ +export const PromptTemplateInputSchema = z.object({ + question: z.string(), + expectedOutcome: z.string().optional(), + expectedMessages: z.array(MessageSchema).optional(), + referenceAnswer: z.string().optional(), + candidateAnswer: z.string(), + outputMessages: z.array(MessageSchema).nullable().optional(), + guidelineFiles: z.array(z.string()).optional(), + inputFiles: z.array(z.string()).optional(), + inputMessages: z.array(MessageSchema).optional(), + traceSummary: TraceSummarySchema.nullable().optional(), + config: z.record(z.unknown()).nullable().optional(), +}); + +export type PromptTemplateInput = z.infer; diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts new file mode 100644 index 00000000..ab0ab831 --- /dev/null +++ b/packages/eval/test/define-prompt-template.test.ts @@ -0,0 +1,182 @@ +import { describe, expect, it } from 'bun:test'; + +import { type PromptTemplateInput, PromptTemplateInputSchema } from '../src/schemas.js'; + +describe('PromptTemplateInputSchema', () => { + const validInput = { + question: 'What is 2+2?', + candidateAnswer: 'The answer is 4', + }; + + it('parses minimal valid input', () => { + const result = PromptTemplateInputSchema.parse(validInput); + expect(result.question).toBe('What is 2+2?'); + expect(result.candidateAnswer).toBe('The answer is 4'); + }); + + it('accepts optional expectedOutcome', () => { + const inputWithOutcome = { + ...validInput, + expectedOutcome: 'The answer should be 4', + }; + const result = PromptTemplateInputSchema.parse(inputWithOutcome); + expect(result.expectedOutcome).toBe('The answer should be 4'); + }); + + it('accepts optional expectedMessages', () => { + const inputWithMessages = { + ...validInput, + expectedMessages: [{ role: 'assistant', content: '4' }], + }; + const result = PromptTemplateInputSchema.parse(inputWithMessages); + expect(result.expectedMessages?.[0].content).toBe('4'); + }); + + it('accepts optional referenceAnswer', () => { + const inputWithReference = { + ...validInput, + referenceAnswer: 'The sum of 2 and 2 is 4', + }; + const result = PromptTemplateInputSchema.parse(inputWithReference); + expect(result.referenceAnswer).toBe('The sum of 2 and 2 is 4'); + }); + + it('accepts optional traceSummary', () => { + const inputWithTrace = { + ...validInput, + traceSummary: { + eventCount: 3, + toolNames: ['read', 'write'], + toolCallsByName: { read: 2, write: 1 }, + errorCount: 0, + }, + }; + const result = PromptTemplateInputSchema.parse(inputWithTrace); + expect(result.traceSummary?.eventCount).toBe(3); + expect(result.traceSummary?.toolNames).toEqual(['read', 'write']); + }); + + it('accepts null traceSummary', () => { + const inputWithNullTrace = { + ...validInput, + traceSummary: null, + }; + const result = PromptTemplateInputSchema.parse(inputWithNullTrace); + expect(result.traceSummary).toBeNull(); + }); + + it('accepts optional config', () => { + const inputWithConfig = { + ...validInput, + config: { rubric: 'Check for correctness', strictMode: true }, + }; + const result = PromptTemplateInputSchema.parse(inputWithConfig); + expect(result.config).toEqual({ rubric: 'Check for correctness', strictMode: true }); + }); + + it('accepts optional guidelineFiles', () => { + const inputWithGuidelines = { + ...validInput, + guidelineFiles: ['/path/to/guideline1.txt', '/path/to/guideline2.txt'], + }; + const result = PromptTemplateInputSchema.parse(inputWithGuidelines); + expect(result.guidelineFiles).toEqual(['/path/to/guideline1.txt', 
'/path/to/guideline2.txt']); + }); + + it('accepts optional inputFiles', () => { + const inputWithFiles = { + ...validInput, + inputFiles: ['/path/to/input1.txt'], + }; + const result = PromptTemplateInputSchema.parse(inputWithFiles); + expect(result.inputFiles).toEqual(['/path/to/input1.txt']); + }); + + it('accepts optional inputMessages', () => { + const inputWithMessages = { + ...validInput, + inputMessages: [{ role: 'user', content: 'What is 2+2?' }], + }; + const result = PromptTemplateInputSchema.parse(inputWithMessages); + expect(result.inputMessages?.[0].content).toBe('What is 2+2?'); + }); + + it('accepts optional outputMessages with toolCalls', () => { + const inputWithOutput = { + ...validInput, + outputMessages: [ + { + role: 'assistant', + content: 'Reading file...', + toolCalls: [{ tool: 'read', input: { path: 'test.txt' } }], + }, + ], + }; + const result = PromptTemplateInputSchema.parse(inputWithOutput); + expect(result.outputMessages?.[0].toolCalls?.[0].tool).toBe('read'); + }); + + it('accepts full input with all optional fields', () => { + const fullInput = { + question: 'What is 2+2?', + expectedOutcome: 'The answer should be 4', + expectedMessages: [{ role: 'assistant', content: '4' }], + referenceAnswer: 'The sum is 4', + candidateAnswer: 'The answer is 4', + outputMessages: [{ role: 'assistant', content: 'The answer is 4' }], + guidelineFiles: ['/path/to/guideline.txt'], + inputFiles: ['/path/to/input.txt'], + inputMessages: [{ role: 'user', content: 'What is 2+2?' }], + traceSummary: { + eventCount: 1, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + }, + config: { rubric: 'Check correctness' }, + }; + const result = PromptTemplateInputSchema.parse(fullInput); + expect(result.question).toBe('What is 2+2?'); + expect(result.expectedOutcome).toBe('The answer should be 4'); + expect(result.referenceAnswer).toBe('The sum is 4'); + expect(result.candidateAnswer).toBe('The answer is 4'); + expect(result.config).toEqual({ rubric: 'Check correctness' }); + }); +}); + +describe('Schema type inference', () => { + it('PromptTemplateInput has expected shape', () => { + // Type-level test: ensure inferred types have expected properties + const input: PromptTemplateInput = { + question: 'test', + candidateAnswer: 'test', + }; + + // These should all type-check correctly + const _q: string = input.question; + const _c: string = input.candidateAnswer; + const _trace: PromptTemplateInput['traceSummary'] = undefined; + const _config: PromptTemplateInput['config'] = null; + const _ref: PromptTemplateInput['referenceAnswer'] = undefined; + const _outcome: PromptTemplateInput['expectedOutcome'] = undefined; + + expect(input.question).toBe('test'); + }); + + it('PromptTemplateInput allows all optional fields to be omitted', () => { + const minimalInput: PromptTemplateInput = { + question: 'test question', + candidateAnswer: 'test answer', + }; + + expect(minimalInput.expectedOutcome).toBeUndefined(); + expect(minimalInput.expectedMessages).toBeUndefined(); + expect(minimalInput.referenceAnswer).toBeUndefined(); + expect(minimalInput.outputMessages).toBeUndefined(); + expect(minimalInput.guidelineFiles).toBeUndefined(); + expect(minimalInput.inputFiles).toBeUndefined(); + expect(minimalInput.inputMessages).toBeUndefined(); + expect(minimalInput.traceSummary).toBeUndefined(); + expect(minimalInput.config).toBeUndefined(); + }); +}); From 303f5db31d3c188830944d7365daa27730ae677a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 07:31:54 +0000 Subject: [PATCH 
2/9] chore: archive adopt-ts-template-prompts openspec Move OpenSpec to archive after implementation is complete. Co-Authored-By: Claude Opus 4.5 --- .../design.md | 169 ++++++++++++++++++ .../proposal.md | 37 ++++ .../specs/custom-evaluator-prompts/spec.md | 92 ++++++++++ .../tasks.md | 22 +++ 4 files changed, 320 insertions(+) create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md new file mode 100644 index 00000000..e0350d4c --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md @@ -0,0 +1,169 @@ +# Design: TypeScript Template Literals for Evaluator Prompts + +## Architecture + +Follow the established code judge pattern: subprocess execution with an SDK wrapper that handles stdin/stdout. + +### SDK: `definePromptTemplate` + +Add to `@agentv/eval` package, mirroring `defineCodeJudge`: + +```typescript +// packages/eval/src/prompt-template.ts +import { readFileSync } from 'node:fs'; +import { toCamelCaseDeep } from './case-conversion.js'; +import { PromptTemplateInputSchema, type PromptTemplateInput } from './schemas.js'; + +export type PromptTemplateHandler = ( + input: PromptTemplateInput, +) => string | Promise; + +function readStdin(): string { + return readFileSync(0, 'utf8'); +} + +export async function runPromptTemplate(handler: PromptTemplateHandler): Promise { + try { + const stdin = readStdin(); + const rawInput = JSON.parse(stdin) as Record; + const camelInput = toCamelCaseDeep(rawInput); + const input = PromptTemplateInputSchema.parse(camelInput); + + const prompt = await handler(input); + + // Output raw string (not JSON) - the prompt itself + console.log(prompt); + } catch (error) { + console.error(error instanceof Error ? 
error.message : String(error)); + process.exit(1); + } +} + +export function definePromptTemplate(handler: PromptTemplateHandler): void { + runPromptTemplate(handler); +} +``` + +### Input Schema + +Reuse the same input shape as code judges for consistency: + +```typescript +// packages/eval/src/schemas.ts +export const PromptTemplateInputSchema = z.object({ + question: z.string(), + expectedOutcome: z.string().optional(), + expectedMessages: z.array(MessageSchema).optional(), + referenceAnswer: z.string().optional(), + candidateAnswer: z.string(), + outputMessages: z.array(MessageSchema).nullable().optional(), + guidelineFiles: z.array(z.string()).optional(), + inputFiles: z.array(z.string()).optional(), + inputMessages: z.array(MessageSchema).optional(), + traceSummary: z.string().nullable().optional(), + config: z.record(z.unknown()).nullable().optional(), +}); + +export type PromptTemplateInput = z.infer; +``` + +### Core: Loader Changes + +Update `resolveCustomPrompt` in `orchestrator.ts` to detect executable prompt files: + +```typescript +async function resolveCustomPrompt( + promptPath: string, + context: EvaluationContext, + cwd?: string, +): Promise { + const ext = path.extname(promptPath).toLowerCase(); + + // Executable prompt template (same pattern as code judges) + if (ext === '.ts' || ext === '.js') { + return executePromptTemplate(promptPath, context, cwd); + } + + // Static text file (existing behavior) + const content = await readFile(promptPath, 'utf8'); + return substituteVariables(content, context); +} + +async function executePromptTemplate( + scriptPath: string, + context: EvaluationContext, + cwd?: string, +): Promise { + const payload = buildCodeJudgePayload(context); // Reuse existing payload builder + const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + + // Execute using existing infrastructure + const stdout = await executeScript( + ['bun', 'run', scriptPath], + inputJson, + undefined, // timeout + cwd, + ); + + return stdout.trim(); +} +``` + +## User Experience + +### Writing a Prompt Template + +```typescript +// my-evaluator-prompt.ts +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` +You are evaluating a response to the following question: + +Question: ${ctx.question} + +Candidate Answer: +${ctx.candidateAnswer} + +${ctx.referenceAnswer ? `Reference Answer:\n${ctx.referenceAnswer}` : ''} + +${ctx.config?.rubric ? `Evaluation Criteria:\n${ctx.config.rubric}` : ''} + +Evaluate the candidate answer and provide a score from 0 to 1. +`); +``` + +### YAML Configuration + +```yaml +cases: + - id: example + question: "What is the capital of France?" + evaluator: + type: llm_judge + prompt: ./prompts/my-evaluator-prompt.ts # Detected as executable +``` + +## Trade-offs + +| Aspect | Subprocess Pattern | In-process (jiti) | +|--------|-------------------|-------------------| +| Consistency | Same as code judges | New pattern | +| Dependencies | None (existing infra) | Adds jiti | +| Performance | Process spawn overhead | Faster | +| Isolation | Sandboxed | In-process | +| Language support | Any (TS, Python, etc.) | TS/JS only | + +The subprocess pattern is preferred because: +1. **Consistency** - Same mental model as code judges +2. **No new dependencies** - Uses existing `executeScript` infrastructure +3. **Isolation** - User code runs in separate process +4. 
**Language agnostic** - Could support Python prompt templates in future + +## Alternatives Considered + +### In-process loading with jiti +Rejected: Adds dependency, inconsistent with code judge pattern, runs user code in main process. + +### Require pre-compiled JS only +Rejected: Worse DX - users already expect `bun run` to handle `.ts` files. diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md new file mode 100644 index 00000000..034e8a7a --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md @@ -0,0 +1,37 @@ +# Adopt TypeScript Template Literals for Custom Evaluator Prompts + +## Summary +Enable the use of native TypeScript template literals for defining custom evaluator prompts using the same subprocess pattern as code judges. This provides type safety, complex logic support, and a consistent developer experience. + +## Problem +Currently, `LlmJudgeEvaluator` relies on string templates with `{{variable}}` placeholders. This approach: +- Lacks type safety: No compile-time check if variables exist in the context. +- Has limited logic: Conditional logic or loops require complex template syntax or are impossible. +- Is error-prone: Typos in placeholders are only caught at runtime. + +## Solution +Follow the established code judge pattern: + +1. Add a `definePromptTemplate` SDK wrapper to `@agentv/eval` that handles stdin/stdout, mirroring `defineCodeJudge`. +2. Update the evaluator loader to detect `.ts`/`.js` prompt files and execute them as subprocesses. +3. The script receives evaluation context via stdin (JSON), returns the prompt string via stdout. + +Users write prompt templates the same way they write code judges: + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((context) => ` + Question: ${context.question} + Answer: ${context.candidateAnswer} + + ${context.config?.includeRubric ? `Rubric: ${context.referenceAnswer}` : ''} +`); +``` + +## Impact +- **Core**: `orchestrator.ts` loader logic to detect and execute `.ts`/`.js` prompts as subprocesses. +- **SDK**: New `definePromptTemplate` wrapper in `@agentv/eval`. +- **DX**: Consistent pattern with code judges - same mental model. +- **Dependencies**: None - uses existing subprocess infrastructure. +- **Backward Compatibility**: Existing string-based templates and `.txt` prompt files continue to work. diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md new file mode 100644 index 00000000..8f61d3d7 --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md @@ -0,0 +1,92 @@ +# Spec: Custom Evaluator Prompts + +## ADDED Requirements + +### Requirement: SDK Wrapper for Prompt Templates +The `@agentv/eval` package MUST provide a `definePromptTemplate` helper that handles stdin/stdout, mirroring the `defineCodeJudge` pattern. 
+ +#### Scenario: Using definePromptTemplate +Given a TypeScript file that uses `definePromptTemplate` +When the file is executed as a subprocess +Then it should read evaluation context from stdin (JSON) +And output the generated prompt string to stdout + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` + Question: ${ctx.question} + Answer: ${ctx.candidateAnswer} +`); +``` + +#### Scenario: Type safety with PromptTemplateInput +Given a developer writing a prompt template +When they use `definePromptTemplate` +Then TypeScript should provide autocomplete for `ctx.question`, `ctx.candidateAnswer`, `ctx.referenceAnswer`, etc. + +#### Scenario: Async prompt generation +Given a prompt template that needs async operations +When the handler returns a Promise +Then the wrapper should await and output the resolved string + +```typescript +export default definePromptTemplate(async (ctx) => { + const extraContext = await fetchSomeData(); + return `Question: ${ctx.question}\nContext: ${extraContext}`; +}); +``` + +### Requirement: Executable Prompt File Detection +The evaluator loader MUST detect `.ts` and `.js` prompt files and execute them as subprocesses. + +#### Scenario: Loading a TypeScript prompt template +Given an eval case with `prompt: ./my-prompt.ts` +When the evaluator runs +Then it should execute the file as a subprocess using `bun run` +And pass the evaluation context via stdin as JSON +And use stdout as the prompt string + +#### Scenario: Loading a JavaScript prompt template +Given an eval case with `prompt: ./my-prompt.js` +When the evaluator runs +Then it should execute the file as a subprocess +And use stdout as the prompt string + +#### Scenario: Backward compatibility with text files +Given an eval case with `prompt: ./my-prompt.txt` +When the evaluator runs +Then it should read the file as text (existing behavior) +And apply `{{variable}}` substitution + +### Requirement: Consistent Input Schema +The prompt template input MUST use the same schema as code judges for consistency. + +#### Scenario: Input fields available +Given a prompt template handler +Then the input should include: +- `question` - the eval case question +- `candidateAnswer` - the agent's response +- `referenceAnswer` - optional reference answer +- `expectedOutcome` - optional expected outcome +- `expectedMessages` - optional expected messages +- `outputMessages` - optional output messages from agent +- `guidelineFiles` - paths to guideline files +- `inputFiles` - paths to input files +- `inputMessages` - input messages to agent +- `traceSummary` - optional trace summary +- `config` - optional pass-through config from YAML + +### Requirement: Error Handling +The subprocess execution MUST handle errors gracefully. 
+ +#### Scenario: Script exits with non-zero code +Given a prompt template script that throws an error +When it is executed +Then the evaluator should fail with a descriptive error message +And include the script's stderr in the error + +#### Scenario: Script outputs nothing +Given a prompt template script that outputs an empty string +When it is executed +Then the evaluator should use the empty string as the prompt diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md new file mode 100644 index 00000000..6ae3c569 --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md @@ -0,0 +1,22 @@ +# Tasks: Adopt TypeScript Template Literals for Custom Evaluator Prompts + +## SDK (`@agentv/eval`) + +- [x] Add `PromptTemplateInput` type to `packages/eval/src/schemas.ts` (reuse CodeJudgeInput fields) +- [x] Add `definePromptTemplate` wrapper to `packages/eval/src/prompt-template.ts` +- [x] Export `definePromptTemplate` and `PromptTemplateInput` from `packages/eval/src/index.ts` + +## Core (`@agentv/core`) + +- [x] Add `executePromptTemplate` function to execute `.ts`/`.js` prompt files as subprocesses +- [x] Update `resolveCustomPrompt` in `orchestrator.ts` to detect and handle executable prompts + +## Testing + +- [x] Add unit tests for `definePromptTemplate` stdin/stdout handling +- [x] Add integration tests for executable prompt templates in eval runs + +## Documentation + +- [x] Create example prompt template in `examples/features/prompt-template-sdk/` +- [ ] Update skill reference docs with prompt template pattern From 25d23fa330a83e565240603ed16bd5ed96b415d5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 07:34:52 +0000 Subject: [PATCH 3/9] chore: archive adopt-ts-template-prompts openspec - Archive change to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/ - Create new spec openspec/specs/custom-evaluator-prompts/ Co-Authored-By: Claude Opus 4.5 --- .../design.md | 0 .../proposal.md | 0 .../specs/custom-evaluator-prompts/spec.md | 0 .../tasks.md | 16 ++-- .../specs/custom-evaluator-prompts/spec.md | 94 +++++++++++++++++++ 5 files changed, 102 insertions(+), 8 deletions(-) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/design.md (100%) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/proposal.md (100%) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/specs/custom-evaluator-prompts/spec.md (100%) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/tasks.md (55%) create mode 100644 openspec/specs/custom-evaluator-prompts/spec.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md similarity index 100% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/proposal.md similarity 
index 100% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/proposal.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md similarity index 100% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/tasks.md similarity index 55% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/tasks.md index 6ae3c569..2690f6d8 100644 --- a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md +++ b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/tasks.md @@ -2,21 +2,21 @@ ## SDK (`@agentv/eval`) -- [x] Add `PromptTemplateInput` type to `packages/eval/src/schemas.ts` (reuse CodeJudgeInput fields) -- [x] Add `definePromptTemplate` wrapper to `packages/eval/src/prompt-template.ts` -- [x] Export `definePromptTemplate` and `PromptTemplateInput` from `packages/eval/src/index.ts` +- [ ] Add `PromptTemplateInput` type to `packages/eval/src/schemas.ts` (reuse CodeJudgeInput fields) +- [ ] Add `definePromptTemplate` wrapper to `packages/eval/src/prompt-template.ts` +- [ ] Export `definePromptTemplate` and `PromptTemplateInput` from `packages/eval/src/index.ts` ## Core (`@agentv/core`) -- [x] Add `executePromptTemplate` function to execute `.ts`/`.js` prompt files as subprocesses -- [x] Update `resolveCustomPrompt` in `orchestrator.ts` to detect and handle executable prompts +- [ ] Add `executePromptTemplate` function to execute `.ts`/`.js` prompt files as subprocesses +- [ ] Update `resolveCustomPrompt` in `orchestrator.ts` to detect and handle executable prompts ## Testing -- [x] Add unit tests for `definePromptTemplate` stdin/stdout handling -- [x] Add integration tests for executable prompt templates in eval runs +- [ ] Add unit tests for `definePromptTemplate` stdin/stdout handling +- [ ] Add integration tests for executable prompt templates in eval runs ## Documentation -- [x] Create example prompt template in `examples/features/prompt-template-sdk/` +- [ ] Create example prompt template in `examples/features/prompt-template-sdk/` - [ ] Update skill reference docs with prompt template pattern diff --git a/openspec/specs/custom-evaluator-prompts/spec.md b/openspec/specs/custom-evaluator-prompts/spec.md new file mode 100644 index 00000000..5485705b --- /dev/null +++ b/openspec/specs/custom-evaluator-prompts/spec.md @@ -0,0 +1,94 @@ +# custom-evaluator-prompts Specification + +## Purpose +TBD - created by archiving change adopt-ts-template-prompts. Update Purpose after archive. +## Requirements +### Requirement: SDK Wrapper for Prompt Templates +The `@agentv/eval` package MUST provide a `definePromptTemplate` helper that handles stdin/stdout, mirroring the `defineCodeJudge` pattern. 
+ +#### Scenario: Using definePromptTemplate +Given a TypeScript file that uses `definePromptTemplate` +When the file is executed as a subprocess +Then it should read evaluation context from stdin (JSON) +And output the generated prompt string to stdout + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` + Question: ${ctx.question} + Answer: ${ctx.candidateAnswer} +`); +``` + +#### Scenario: Type safety with PromptTemplateInput +Given a developer writing a prompt template +When they use `definePromptTemplate` +Then TypeScript should provide autocomplete for `ctx.question`, `ctx.candidateAnswer`, `ctx.referenceAnswer`, etc. + +#### Scenario: Async prompt generation +Given a prompt template that needs async operations +When the handler returns a Promise +Then the wrapper should await and output the resolved string + +```typescript +export default definePromptTemplate(async (ctx) => { + const extraContext = await fetchSomeData(); + return `Question: ${ctx.question}\nContext: ${extraContext}`; +}); +``` + +### Requirement: Executable Prompt File Detection +The evaluator loader MUST detect `.ts` and `.js` prompt files and execute them as subprocesses. + +#### Scenario: Loading a TypeScript prompt template +Given an eval case with `prompt: ./my-prompt.ts` +When the evaluator runs +Then it should execute the file as a subprocess using `bun run` +And pass the evaluation context via stdin as JSON +And use stdout as the prompt string + +#### Scenario: Loading a JavaScript prompt template +Given an eval case with `prompt: ./my-prompt.js` +When the evaluator runs +Then it should execute the file as a subprocess +And use stdout as the prompt string + +#### Scenario: Backward compatibility with text files +Given an eval case with `prompt: ./my-prompt.txt` +When the evaluator runs +Then it should read the file as text (existing behavior) +And apply `{{variable}}` substitution + +### Requirement: Consistent Input Schema +The prompt template input MUST use the same schema as code judges for consistency. + +#### Scenario: Input fields available +Given a prompt template handler +Then the input should include: +- `question` - the eval case question +- `candidateAnswer` - the agent's response +- `referenceAnswer` - optional reference answer +- `expectedOutcome` - optional expected outcome +- `expectedMessages` - optional expected messages +- `outputMessages` - optional output messages from agent +- `guidelineFiles` - paths to guideline files +- `inputFiles` - paths to input files +- `inputMessages` - input messages to agent +- `traceSummary` - optional trace summary +- `config` - optional pass-through config from YAML + +### Requirement: Error Handling +The subprocess execution MUST handle errors gracefully. 
+ +#### Scenario: Script exits with non-zero code +Given a prompt template script that throws an error +When it is executed +Then the evaluator should fail with a descriptive error message +And include the script's stderr in the error + +#### Scenario: Script outputs nothing +Given a prompt template script that outputs an empty string +When it is executed +Then the evaluator should use the empty string as the prompt + From 3010527dd1d5936f2d08fc9b3974b8c41b764d50 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 08:11:15 +0000 Subject: [PATCH 4/9] fix(eval): improve prompt template robustness and schema consistency - Reuse CodeJudgeInputSchema for PromptTemplateInputSchema (consistent payloads) - Add timeout support to executePromptTemplate (prevents hanging scripts) - Validate non-empty output from prompt templates - Throw error for missing .ts/.js prompt template files (fail-fast) - Update tests to reflect required fields Co-Authored-By: Claude Opus 4.5 --- .../evaluation/loaders/evaluator-parser.ts | 8 ++ packages/core/src/evaluation/orchestrator.ts | 35 +++++--- packages/eval/src/schemas.ts | 18 +--- .../eval/test/define-prompt-template.test.ts | 89 ++++++++++++------- 4 files changed, 93 insertions(+), 57 deletions(-) diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 7b468b6c..d243b60c 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -542,6 +542,14 @@ export async function parseEvaluators( } } } else { + // Check if the prompt looks like an executable template path (.ts/.js) + // These must exist as files - don't fall back to inline prompt + const promptExt = path.extname(prompt).toLowerCase(); + if (promptExt === '.ts' || promptExt === '.js') { + throw new Error( + `Evaluator '${name}' in '${evalId}': prompt template file not found: ${resolved.displayPath}`, + ); + } logWarning( `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`, resolved.attempted.length > 0 diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ce8a73c6..5bbdea0a 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -899,6 +899,7 @@ async function runEvaluatorList(options: { judgeProvider, outputMessages, traceSummary, + agentTimeoutMs, }); const weight = evaluator.weight ?? 
1.0; scored.push({ score, name: evaluator.name, type: evaluator.type, weight }); @@ -1257,6 +1258,7 @@ async function runLlmJudgeEvaluator(options: { readonly judgeProvider?: Provider; readonly outputMessages?: readonly OutputMessage[]; readonly traceSummary?: TraceSummary; + readonly agentTimeoutMs?: number; }): Promise { const { config, @@ -1271,14 +1273,19 @@ async function runLlmJudgeEvaluator(options: { judgeProvider, outputMessages, traceSummary, + agentTimeoutMs, } = options; - const customPrompt = await resolveCustomPrompt(config, { - evalCase, - candidate, - outputMessages, - traceSummary, - config: config.config, - }); + const customPrompt = await resolveCustomPrompt( + config, + { + evalCase, + candidate, + outputMessages, + traceSummary, + config: config.config, + }, + agentTimeoutMs, + ); return evaluatorRegistry.llm_judge.evaluate({ evalCase, @@ -1310,6 +1317,7 @@ async function resolveCustomPrompt( readonly config?: Record; }, context?: ResolveCustomPromptContext, + timeoutMs?: number, ): Promise { const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath; @@ -1321,7 +1329,7 @@ async function resolveCustomPrompt( if (!context) { throw new Error('Context required for executable prompt templates (.ts/.js files)'); } - return executePromptTemplate(promptPath, context, promptConfig.config); + return executePromptTemplate(promptPath, context, promptConfig.config, timeoutMs); } // Static text file (existing behavior) @@ -1340,6 +1348,7 @@ async function executePromptTemplate( scriptPath: string, context: ResolveCustomPromptContext, config?: Record, + timeoutMs?: number, ): Promise { // Build payload matching code judge input format for consistency const payload = { @@ -1362,8 +1371,14 @@ async function executePromptTemplate( const cwd = path.dirname(scriptPath); try { - const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, undefined, cwd); - return stdout.trim(); + const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, timeoutMs, cwd); + const prompt = stdout.trim(); + + if (!prompt) { + throw new Error('Prompt template produced empty output'); + } + + return prompt; } catch (error) { const message = error instanceof Error ? error.message : String(error); throw new Error(`Prompt template execution failed: ${message}`); diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index af6bde4f..61734d71 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -91,20 +91,8 @@ export type TokenUsage = z.infer; /** * Prompt template input schema (camelCase, converted from snake_case wire format). - * Uses the same fields as CodeJudgeInput for consistency. + * Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads. 
*/ -export const PromptTemplateInputSchema = z.object({ - question: z.string(), - expectedOutcome: z.string().optional(), - expectedMessages: z.array(MessageSchema).optional(), - referenceAnswer: z.string().optional(), - candidateAnswer: z.string(), - outputMessages: z.array(MessageSchema).nullable().optional(), - guidelineFiles: z.array(z.string()).optional(), - inputFiles: z.array(z.string()).optional(), - inputMessages: z.array(MessageSchema).optional(), - traceSummary: TraceSummarySchema.nullable().optional(), - config: z.record(z.unknown()).nullable().optional(), -}); +export const PromptTemplateInputSchema = CodeJudgeInputSchema; -export type PromptTemplateInput = z.infer; +export type PromptTemplateInput = CodeJudgeInput; diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts index ab0ab831..525b9d41 100644 --- a/packages/eval/test/define-prompt-template.test.ts +++ b/packages/eval/test/define-prompt-template.test.ts @@ -3,33 +3,34 @@ import { describe, expect, it } from 'bun:test'; import { type PromptTemplateInput, PromptTemplateInputSchema } from '../src/schemas.js'; describe('PromptTemplateInputSchema', () => { + // Minimal valid input with all required fields const validInput = { question: 'What is 2+2?', + expectedOutcome: 'The answer should be 4', + expectedMessages: [], candidateAnswer: 'The answer is 4', + guidelineFiles: [], + inputFiles: [], + inputMessages: [], }; - it('parses minimal valid input', () => { + it('parses valid input with all required fields', () => { const result = PromptTemplateInputSchema.parse(validInput); expect(result.question).toBe('What is 2+2?'); expect(result.candidateAnswer).toBe('The answer is 4'); - }); - - it('accepts optional expectedOutcome', () => { - const inputWithOutcome = { - ...validInput, - expectedOutcome: 'The answer should be 4', - }; - const result = PromptTemplateInputSchema.parse(inputWithOutcome); expect(result.expectedOutcome).toBe('The answer should be 4'); + expect(result.expectedMessages).toEqual([]); + expect(result.guidelineFiles).toEqual([]); + expect(result.inputFiles).toEqual([]); + expect(result.inputMessages).toEqual([]); }); - it('accepts optional expectedMessages', () => { - const inputWithMessages = { - ...validInput, - expectedMessages: [{ role: 'assistant', content: '4' }], + it('rejects input missing required fields', () => { + const minimalInput = { + question: 'What is 2+2?', + candidateAnswer: 'The answer is 4', }; - const result = PromptTemplateInputSchema.parse(inputWithMessages); - expect(result.expectedMessages?.[0].content).toBe('4'); + expect(() => PromptTemplateInputSchema.parse(minimalInput)).toThrow(); }); it('accepts optional referenceAnswer', () => { @@ -74,7 +75,16 @@ describe('PromptTemplateInputSchema', () => { expect(result.config).toEqual({ rubric: 'Check for correctness', strictMode: true }); }); - it('accepts optional guidelineFiles', () => { + it('accepts expectedMessages with content', () => { + const inputWithMessages = { + ...validInput, + expectedMessages: [{ role: 'assistant', content: '4' }], + }; + const result = PromptTemplateInputSchema.parse(inputWithMessages); + expect(result.expectedMessages[0].content).toBe('4'); + }); + + it('accepts guidelineFiles with paths', () => { const inputWithGuidelines = { ...validInput, guidelineFiles: ['/path/to/guideline1.txt', '/path/to/guideline2.txt'], @@ -83,7 +93,7 @@ describe('PromptTemplateInputSchema', () => { expect(result.guidelineFiles).toEqual(['/path/to/guideline1.txt', 
'/path/to/guideline2.txt']); }); - it('accepts optional inputFiles', () => { + it('accepts inputFiles with paths', () => { const inputWithFiles = { ...validInput, inputFiles: ['/path/to/input1.txt'], @@ -92,13 +102,13 @@ describe('PromptTemplateInputSchema', () => { expect(result.inputFiles).toEqual(['/path/to/input1.txt']); }); - it('accepts optional inputMessages', () => { + it('accepts inputMessages with content', () => { const inputWithMessages = { ...validInput, inputMessages: [{ role: 'user', content: 'What is 2+2?' }], }; const result = PromptTemplateInputSchema.parse(inputWithMessages); - expect(result.inputMessages?.[0].content).toBe('What is 2+2?'); + expect(result.inputMessages[0].content).toBe('What is 2+2?'); }); it('accepts optional outputMessages with toolCalls', () => { @@ -116,7 +126,7 @@ describe('PromptTemplateInputSchema', () => { expect(result.outputMessages?.[0].toolCalls?.[0].tool).toBe('read'); }); - it('accepts full input with all optional fields', () => { + it('accepts full input with all fields', () => { const fullInput = { question: 'What is 2+2?', expectedOutcome: 'The answer should be 4', @@ -149,34 +159,49 @@ describe('Schema type inference', () => { // Type-level test: ensure inferred types have expected properties const input: PromptTemplateInput = { question: 'test', + expectedOutcome: 'expected', + expectedMessages: [], candidateAnswer: 'test', + guidelineFiles: [], + inputFiles: [], + inputMessages: [], }; // These should all type-check correctly const _q: string = input.question; const _c: string = input.candidateAnswer; + const _outcome: string = input.expectedOutcome; const _trace: PromptTemplateInput['traceSummary'] = undefined; const _config: PromptTemplateInput['config'] = null; const _ref: PromptTemplateInput['referenceAnswer'] = undefined; - const _outcome: PromptTemplateInput['expectedOutcome'] = undefined; expect(input.question).toBe('test'); }); - it('PromptTemplateInput allows all optional fields to be omitted', () => { - const minimalInput: PromptTemplateInput = { + it('PromptTemplateInput requires core fields', () => { + const input: PromptTemplateInput = { question: 'test question', + expectedOutcome: 'expected outcome', + expectedMessages: [], candidateAnswer: 'test answer', + guidelineFiles: [], + inputFiles: [], + inputMessages: [], }; - expect(minimalInput.expectedOutcome).toBeUndefined(); - expect(minimalInput.expectedMessages).toBeUndefined(); - expect(minimalInput.referenceAnswer).toBeUndefined(); - expect(minimalInput.outputMessages).toBeUndefined(); - expect(minimalInput.guidelineFiles).toBeUndefined(); - expect(minimalInput.inputFiles).toBeUndefined(); - expect(minimalInput.inputMessages).toBeUndefined(); - expect(minimalInput.traceSummary).toBeUndefined(); - expect(minimalInput.config).toBeUndefined(); + // Required fields must be present + expect(input.question).toBe('test question'); + expect(input.expectedOutcome).toBe('expected outcome'); + expect(input.candidateAnswer).toBe('test answer'); + expect(input.expectedMessages).toEqual([]); + expect(input.guidelineFiles).toEqual([]); + expect(input.inputFiles).toEqual([]); + expect(input.inputMessages).toEqual([]); + + // Optional fields can be omitted + expect(input.referenceAnswer).toBeUndefined(); + expect(input.outputMessages).toBeUndefined(); + expect(input.traceSummary).toBeUndefined(); + expect(input.config).toBeUndefined(); }); }); From 6f1a54cd3c00f9391e9342dccab0c9db77af3d9d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 09:41:19 +0000 Subject: 
[PATCH 5/9] fix(examples): use default target for prompt-template-sdk example Co-Authored-By: Claude Opus 4.5 --- examples/features/prompt-template-sdk/evals/dataset.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/features/prompt-template-sdk/evals/dataset.yaml b/examples/features/prompt-template-sdk/evals/dataset.yaml index a820cd48..f899db95 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.yaml @@ -3,9 +3,9 @@ description: Demonstrates TypeScript prompt templates for custom LLM judge prompts -# Uses the CLI target defined in .agentv/targets.yaml +# Uses the default target defined in .agentv/targets.yaml execution: - target: local_cli + target: default evalcases: - id: prompt-template-basic From 7450dc6986174d08790c0e8ed8b9c6941b2659ee Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 10:13:54 +0000 Subject: [PATCH 6/9] refactor(eval): use explicit script arrays for executable prompt templates Change executable prompt templates to use explicit script arrays instead of auto-detecting runtime by file extension. This matches the code_judge pattern for consistency. Before: prompt: ../prompts/custom-evaluator.ts # ambiguous runtime After: prompt: script: [bun, run, ../prompts/custom-evaluator.ts] config: { ... } Benefits: - Consistent with code_judge pattern (one mental model) - No ambiguity about runtime (user explicitly specifies bun/node/python) - Future-proof (works with any runtime without code changes) - Aligns with "Built-ins for Primitives Only" design principle Co-Authored-By: Claude Opus 4.5 --- apps/cli/package.json | 5 +- .../prompt-template-sdk/evals/dataset.yaml | 21 +- .../design.md | 188 ++++++++++++++---- packages/core/package.json | 5 +- .../evaluation/loaders/evaluator-parser.ts | 84 +++++--- packages/core/src/evaluation/orchestrator.ts | 42 ++-- packages/core/src/evaluation/types.ts | 20 +- .../core/test/evaluation/orchestrator.test.ts | 8 +- 8 files changed, 269 insertions(+), 104 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index eadb3a1c..48296e92 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/examples/features/prompt-template-sdk/evals/dataset.yaml b/examples/features/prompt-template-sdk/evals/dataset.yaml index f899db95..876c8299 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.yaml @@ -1,5 +1,6 @@ # Prompt Template SDK Demo # Demonstrates using TypeScript/JavaScript files for custom evaluator prompts. +# Uses the same explicit script pattern as code_judge for consistency. description: Demonstrates TypeScript prompt templates for custom LLM judge prompts @@ -24,7 +25,9 @@ evalcases: evaluators: - name: custom-prompt-eval type: llm_judge - prompt: ../prompts/custom-evaluator.ts + # Executable prompt template using explicit script array (matches code_judge pattern) + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] - id: prompt-template-with-config expected_outcome: The CLI explains async/await correctly. 
@@ -42,10 +45,12 @@ evalcases: evaluators: - name: strict-eval type: llm_judge - prompt: ../prompts/custom-evaluator.ts - config: - rubric: |- - - Must mention Promises - - Must explain the synchronous-looking syntax - - Should provide an example or use case - strictMode: true + # Executable prompt template with config + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: + rubric: |- + - Must mention Promises + - Must explain the synchronous-looking syntax + - Should provide an example or use case + strictMode: true diff --git a/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md index e0350d4c..a169e1bd 100644 --- a/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md +++ b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md @@ -2,7 +2,39 @@ ## Architecture -Follow the established code judge pattern: subprocess execution with an SDK wrapper that handles stdin/stdout. +Follow the established code judge pattern: subprocess execution with an SDK wrapper that handles stdin/stdout, using **explicit script arrays** to specify the runtime. + +### Key Design Decision: Explicit Script Arrays + +Executable prompt templates use the same explicit script array pattern as `code_judge`: + +```yaml +# code_judge pattern (existing) +evaluator: + type: code_judge + script: [bun, run, ../scripts/verify.ts] + +# Executable prompt template (new - same pattern) +evaluator: + type: llm_judge + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: + rubric: "..." +``` + +**Why explicit script arrays instead of auto-detection?** + +| Approach | Pros | Cons | +|----------|------|------| +| Auto-detect by extension (`.ts` → bun) | Less verbose | Ambiguous, magic behavior, limited to known runtimes | +| Explicit script array | Consistent with code_judge, supports any runtime | More verbose | + +We chose explicit script arrays because: +1. **Consistency** - Same pattern as code_judge, one mental model +2. **No ambiguity** - User explicitly chooses bun, node, python, deno, etc. +3. **Future-proof** - Works with any runtime without code changes +4. **Aligns with design principles** - "Built-ins for Primitives Only" - the primitive is "execute a script" ### SDK: `definePromptTemplate` @@ -67,44 +99,68 @@ export const PromptTemplateInputSchema = z.object({ export type PromptTemplateInput = z.infer; ``` +### Core: Type Definitions + +```typescript +// packages/core/src/evaluation/types.ts + +/** + * Executable prompt template configuration. + * Matches code_judge pattern for consistency. + */ +export type PromptScriptConfig = { + /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */ + readonly script: readonly string[]; + /** Pass-through configuration for the prompt template */ + readonly config?: Record; +}; + +export type LlmJudgeEvaluatorConfig = { + readonly name: string; + readonly type: 'llm_judge'; + /** Text prompt (inline or file path) or executable script config */ + readonly prompt?: string | PromptScriptConfig; + // ... 
other fields + /** Resolved script array for executable prompts (matches code_judge pattern) */ + readonly resolvedPromptScript?: readonly string[]; +}; +``` + ### Core: Loader Changes -Update `resolveCustomPrompt` in `orchestrator.ts` to detect executable prompt files: +The evaluator parser resolves `prompt.script` to `resolvedPromptScript`: ```typescript -async function resolveCustomPrompt( - promptPath: string, - context: EvaluationContext, - cwd?: string, -): Promise { - const ext = path.extname(promptPath).toLowerCase(); +// packages/core/src/evaluation/loaders/evaluator-parser.ts +if (isJsonObject(rawPrompt)) { + // Executable prompt template: { script: [...], config: {...} } + const scriptArray = asStringArray(rawPrompt.script, ...); - // Executable prompt template (same pattern as code judges) - if (ext === '.ts' || ext === '.js') { - return executePromptTemplate(promptPath, context, cwd); - } + // Resolve the script path (last element) + const scriptPath = scriptArray[scriptArray.length - 1]; + const resolved = await resolveFileReference(scriptPath, searchRoots); - // Static text file (existing behavior) - const content = await readFile(promptPath, 'utf8'); - return substituteVariables(content, context); + if (resolved.resolvedPath) { + resolvedPromptScript = [...scriptArray.slice(0, -1), path.resolve(resolved.resolvedPath)]; + } } +``` +The orchestrator executes using the resolved script array: + +```typescript +// packages/core/src/evaluation/orchestrator.ts async function executePromptTemplate( - scriptPath: string, - context: EvaluationContext, - cwd?: string, + script: readonly string[], // e.g., ['bun', 'run', '/abs/path/template.ts'] + context: ResolveCustomPromptContext, + config?: Record, + timeoutMs?: number, ): Promise { - const payload = buildCodeJudgePayload(context); // Reuse existing payload builder + const payload = { /* ... same as code judge */ }; const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + const cwd = path.dirname(script[script.length - 1]); - // Execute using existing infrastructure - const stdout = await executeScript( - ['bun', 'run', scriptPath], - inputJson, - undefined, // timeout - cwd, - ); - + const stdout = await executeScript(script, inputJson, timeoutMs, cwd); return stdout.trim(); } ``` @@ -114,7 +170,7 @@ async function executePromptTemplate( ### Writing a Prompt Template ```typescript -// my-evaluator-prompt.ts +// prompts/custom-evaluator.ts import { definePromptTemplate } from '@agentv/eval'; export default definePromptTemplate((ctx) => ` @@ -136,34 +192,82 @@ Evaluate the candidate answer and provide a score from 0 to 1. ### YAML Configuration ```yaml -cases: +evalcases: - id: example question: "What is the capital of France?" 
- evaluator: - type: llm_judge - prompt: ./prompts/my-evaluator-prompt.ts # Detected as executable + execution: + evaluators: + - name: custom-eval + type: llm_judge + # Executable prompt template with explicit script array + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: + rubric: | + - Must be factually correct + - Should be concise +``` + +### Supported Runtimes + +The explicit script array supports any runtime: + +```yaml +# TypeScript with Bun +prompt: + script: [bun, run, ./template.ts] + +# TypeScript with Node + tsx +prompt: + script: [npx, tsx, ./template.ts] + +# JavaScript with Node +prompt: + script: [node, ./template.js] + +# Python (future) +prompt: + script: [python, ./template.py] ``` ## Trade-offs -| Aspect | Subprocess Pattern | In-process (jiti) | -|--------|-------------------|-------------------| -| Consistency | Same as code judges | New pattern | -| Dependencies | None (existing infra) | Adds jiti | -| Performance | Process spawn overhead | Faster | -| Isolation | Sandboxed | In-process | -| Language support | Any (TS, Python, etc.) | TS/JS only | +| Aspect | Subprocess Pattern | In-process (jiti/dynamic import) | +|--------|-------------------|----------------------------------| +| Consistency | Same as code judges | New pattern, different from code_judge | +| Dependencies | None (existing infra) | Adds jiti dependency | +| Performance | Process spawn overhead | Faster execution | +| Isolation | Sandboxed in subprocess | Runs in main process | +| Language support | Any (TS, JS, Python, etc.) | TypeScript/JavaScript only | +| API compatibility | Works with existing SDK | Would require different SDK API | The subprocess pattern is preferred because: 1. **Consistency** - Same mental model as code judges 2. **No new dependencies** - Uses existing `executeScript` infrastructure 3. **Isolation** - User code runs in separate process -4. **Language agnostic** - Could support Python prompt templates in future +4. **Language agnostic** - Supports any runtime (bun, node, python, deno) +5. **SDK compatibility** - The `definePromptTemplate` SDK is designed for stdin/stdout ## Alternatives Considered ### In-process loading with jiti -Rejected: Adds dependency, inconsistent with code judge pattern, runs user code in main process. + +**Rejected.** While jiti provides lighter-weight TypeScript execution without subprocess overhead: +- Adds a new dependency +- Inconsistent with code_judge pattern (subprocess) +- Runs user code in the main process (less isolation) +- Would require a different API - the current SDK reads stdin/writes stdout +- Only works for JS/TS, not other languages + +If there's demand for a lighter-weight in-process option in the future, it could be added as a separate feature (e.g., `prompt_module: ./file.ts`) rather than replacing the subprocess approach. + +### Auto-detect runtime by file extension + +**Rejected.** The original design auto-detected runtime based on file extension (`.ts` → `bun run`). This was changed to explicit script arrays because: +- Ambiguous: What runtime does `.ts` use? bun? node? tsx? +- Inconsistent: code_judge requires explicit `script:` array +- Inflexible: Adding new runtimes requires code changes ### Require pre-compiled JS only -Rejected: Worse DX - users already expect `bun run` to handle `.ts` files. + +**Rejected.** Worse DX - users already expect `bun run` to handle `.ts` files directly. 
diff --git a/packages/core/package.json b/packages/core/package.json index eba6c241..a79c8e90 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index d243b60c..0d5e895b 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -522,34 +522,56 @@ export async function parseEvaluators( continue; } - const prompt = asString(rawEvaluator.prompt); + // Parse prompt field - can be string (text template) or object (executable script) + const rawPrompt = rawEvaluator.prompt; + let prompt: string | undefined; let promptPath: string | undefined; - if (prompt) { + let resolvedPromptScript: string[] | undefined; + let promptScriptConfig: Record | undefined; + + if (isJsonObject(rawPrompt)) { + // Executable prompt template: { script: [...], config: {...} } + const scriptArray = asStringArray( + rawPrompt.script, + `prompt.script for evaluator '${name}' in '${evalId}'`, + ); + + if (!scriptArray) { + throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`); + } + + // Resolve the script path (last element is typically the file path) + const scriptPath = scriptArray[scriptArray.length - 1]; + const resolved = await resolveFileReference(scriptPath, searchRoots); + + if (resolved.resolvedPath) { + // Replace the last element with the resolved path + resolvedPromptScript = [...scriptArray.slice(0, -1), path.resolve(resolved.resolvedPath)]; + } else { + throw new Error( + `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`, + ); + } + + // Extract config from prompt object + if (isJsonObject(rawPrompt.config)) { + promptScriptConfig = rawPrompt.config as Record; + } + } else if (typeof rawPrompt === 'string') { + // Text template prompt (existing behavior) + prompt = rawPrompt; const resolved = await resolveFileReference(prompt, searchRoots); if (resolved.resolvedPath) { promptPath = path.resolve(resolved.resolvedPath); - // Skip validation for executable prompt templates (.ts/.js files) - // These are executed as subprocesses, not parsed as text templates - const ext = path.extname(promptPath).toLowerCase(); - if (ext !== '.ts' && ext !== '.js') { - // Validate custom prompt content upfront - throws error if validation fails - try { - await validateCustomPromptContent(promptPath); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - // Add context and re-throw for the caller to handle - throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); - } + // Validate custom prompt content upfront - throws error if validation fails + try { + await validateCustomPromptContent(promptPath); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + // Add context and re-throw for the caller to handle + throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); } } else { - // Check if the prompt looks like an executable template path (.ts/.js) - // These must exist as files - don't fall back to inline prompt - const promptExt = path.extname(prompt).toLowerCase(); - if (promptExt === '.ts' || promptExt === '.js') { - throw new Error( - `Evaluator '${name}' in '${evalId}': prompt template file not found: ${resolved.displayPath}`, - ); - } logWarning( `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`, resolved.attempted.length > 0 @@ -590,8 +612,9 @@ export async function parseEvaluators( const weight = validateWeight(rawEvaluator.weight, name, evalId); - // Collect unrecognized properties as pass-through config (for executable prompt templates) - const knownProps = new Set(['name', 'type', 'prompt', 'model', 'rubrics', 'weight']); + // Collect unrecognized properties as pass-through config (for text prompt templates) + // Note: For script prompts, config comes from prompt.config instead + const knownProps = new Set(['name', 'type', 'prompt', 'model', 'rubrics', 'weight', 'config']); const config: Record = {}; for (const [key, value] of Object.entries(rawEvaluator)) { if (!knownProps.has(key) && value !== undefined) { @@ -599,15 +622,26 @@ export async function parseEvaluators( } } + // Merge top-level config with any extra properties (top-level config takes precedence) + const topLevelConfig = isJsonObject(rawEvaluator.config) + ? (rawEvaluator.config as Record) + : {}; + const mergedConfig = { ...config, ...topLevelConfig }; + + // Determine final config: prompt.config for script prompts, merged config for text prompts + const finalConfig = + promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : undefined); + evaluators.push({ name, type: 'llm_judge', prompt, promptPath, ...(promptPath ? { resolvedPromptPath: promptPath } : {}), + ...(resolvedPromptScript ? { resolvedPromptScript } : {}), ...(parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}), ...(weight !== undefined ? { weight } : {}), - ...(Object.keys(config).length > 0 ? { config } : {}), + ...(finalConfig ? { config: finalConfig } : {}), }); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 5bbdea0a..e277f535 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1311,27 +1311,31 @@ interface ResolveCustomPromptContext { async function resolveCustomPrompt( promptConfig: { - readonly prompt?: string; + readonly prompt?: string | import('./types.js').PromptScriptConfig; readonly promptPath?: string; readonly resolvedPromptPath?: string; + readonly resolvedPromptScript?: readonly string[]; readonly config?: Record; }, context?: ResolveCustomPromptContext, timeoutMs?: number, ): Promise { + // Executable prompt template using script array (matches code_judge pattern) + if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) { + if (!context) { + throw new Error('Context required for executable prompt templates'); + } + return executePromptTemplate( + promptConfig.resolvedPromptScript, + context, + promptConfig.config, + timeoutMs, + ); + } + const promptPath = promptConfig.resolvedPromptPath ?? 
promptConfig.promptPath; if (promptPath) { - const ext = path.extname(promptPath).toLowerCase(); - - // Executable prompt template (same pattern as code judges) - if (ext === '.ts' || ext === '.js') { - if (!context) { - throw new Error('Context required for executable prompt templates (.ts/.js files)'); - } - return executePromptTemplate(promptPath, context, promptConfig.config, timeoutMs); - } - // Static text file (existing behavior) try { const content = await readTextFile(promptPath); @@ -1341,11 +1345,18 @@ async function resolveCustomPrompt( console.warn(`Could not read custom prompt at ${promptPath}: ${message}`); } } - return promptConfig.prompt; + + // Handle prompt as string - could be inline or the original prompt value + const promptValue = promptConfig.prompt; + if (typeof promptValue === 'string') { + return promptValue; + } + + return undefined; } async function executePromptTemplate( - scriptPath: string, + script: readonly string[], context: ResolveCustomPromptContext, config?: Record, timeoutMs?: number, @@ -1368,10 +1379,13 @@ async function executePromptTemplate( }; const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + + // Derive cwd from the last element of the script array (the script file path) + const scriptPath = script[script.length - 1]; const cwd = path.dirname(scriptPath); try { - const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, timeoutMs, cwd); + const stdout = await executeScript(script, inputJson, timeoutMs, cwd); const prompt = stdout.trim(); if (!prompt) { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index eb1e458c..9f5d6595 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -191,16 +191,30 @@ export type CodeEvaluatorConfig = { readonly target?: TargetAccessConfig; }; +/** + * Executable prompt template configuration. + * Matches code_judge pattern for consistency. + */ +export type PromptScriptConfig = { + /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */ + readonly script: readonly string[]; + /** Pass-through configuration for the prompt template */ + readonly config?: Record; +}; + export type LlmJudgeEvaluatorConfig = { readonly name: string; readonly type: 'llm_judge'; - readonly prompt?: string; + /** Text prompt (inline or file path) or executable script config */ + readonly prompt?: string | PromptScriptConfig; readonly promptPath?: string; - /** Resolved absolute path for prompt file (used by executable .ts/.js prompts) */ + /** Resolved absolute path for prompt file (used for text template prompts) */ readonly resolvedPromptPath?: string; + /** Resolved script array for executable prompts (matches code_judge pattern) */ + readonly resolvedPromptScript?: readonly string[]; readonly rubrics?: readonly RubricItem[]; readonly weight?: number; - /** Pass-through configuration for custom evaluator prompts */ + /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */ readonly config?: Record; }; diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4db19fcf..199aa619 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1010,8 +1010,8 @@ Reference: \${input.reference_answer ?? 
'none'}\`); { name: 'ts-prompt-eval', type: 'llm_judge', - promptPath: promptPath, - resolvedPromptPath: promptPath, + // Use explicit script array (matches code_judge pattern) + resolvedPromptScript: ['bun', 'run', promptPath], }, ], }, @@ -1071,8 +1071,8 @@ console.log('Question: ' + input.question + '\\nAnswer: ' + input.candidate_answ { name: 'js-prompt-eval', type: 'llm_judge', - promptPath: promptPath, - resolvedPromptPath: promptPath, + // Use explicit script array - node for JavaScript files + resolvedPromptScript: ['node', promptPath], }, ], }, From 48140e25249870628eaf37372c36e57fa21d4d3f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 10:28:56 +0000 Subject: [PATCH 7/9] feat(cli): add --eval-id as deprecated alias for --filter Maintains backward compatibility for users who were using --eval-id. Shows deprecation warning when used. Co-Authored-By: Claude Opus 4.5 --- apps/cli/src/commands/eval/index.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 6a53e0c0..70dd84b2 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -40,6 +40,12 @@ export const evalCommand = command({ long: 'filter', description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")', }), + // Deprecated alias for --filter (backward compatibility) + evalId: option({ + type: optional(string), + long: 'eval-id', + description: '[Deprecated: use --filter] Filter eval cases by ID pattern', + }), workers: option({ type: number, long: 'workers', @@ -103,11 +109,17 @@ export const evalCommand = command({ }), }, handler: async (args) => { + // Support deprecated --eval-id as alias for --filter + const filter = args.filter ?? args.evalId; + if (args.evalId && !args.filter) { + console.warn('Warning: --eval-id is deprecated, use --filter instead'); + } + const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd()); const rawOptions: Record = { target: args.target, targets: args.targets, - filter: args.filter, + filter, workers: args.workers, out: args.out, outputFormat: args.outputFormat, From 006250e809626f359324a444dc6d44339f403f0a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 11:11:21 +0000 Subject: [PATCH 8/9] revert(cli): use --eval-id instead of --filter Reverts to --eval-id as the primary flag for filtering eval cases. This aligns with Jest/Vitest convention (--testNamePattern) where the flag name describes what is being filtered, not the action. Removes --filter alias to keep the CLI simple and match existing docs. 
Co-Authored-By: Claude Opus 4.5 --- apps/cli/src/commands/eval/index.ts | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 70dd84b2..345c78e3 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -35,16 +35,10 @@ export const evalCommand = command({ long: 'targets', description: 'Path to targets.yaml (overrides discovery)', }), - filter: option({ - type: optional(string), - long: 'filter', - description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")', - }), - // Deprecated alias for --filter (backward compatibility) evalId: option({ type: optional(string), long: 'eval-id', - description: '[Deprecated: use --filter] Filter eval cases by ID pattern', + description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")', }), workers: option({ type: number, @@ -109,17 +103,11 @@ export const evalCommand = command({ }), }, handler: async (args) => { - // Support deprecated --eval-id as alias for --filter - const filter = args.filter ?? args.evalId; - if (args.evalId && !args.filter) { - console.warn('Warning: --eval-id is deprecated, use --filter instead'); - } - const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd()); const rawOptions: Record = { target: args.target, targets: args.targets, - filter, + filter: args.evalId, workers: args.workers, out: args.out, outputFormat: args.outputFormat, From daa8ccb04ec66aea8f5ca79520b144449333f60f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 12:42:14 +0000 Subject: [PATCH 9/9] docs(skills): add TypeScript prompt template documentation Document the new definePromptTemplate SDK for creating dynamic LLM judge prompts with TypeScript. Includes YAML configuration example and available context fields. Co-Authored-By: Claude Opus 4.5 --- .../references/custom-evaluators.md | 60 ++++++++++++++++++- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/.claude/skills/agentv-eval-builder/references/custom-evaluators.md b/.claude/skills/agentv-eval-builder/references/custom-evaluators.md index 7b6a2958..098a92b1 100644 --- a/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +++ b/.claude/skills/agentv-eval-builder/references/custom-evaluators.md @@ -178,11 +178,65 @@ export default defineCodeJudge(async ({ question, candidateAnswer }) => { **See also:** `examples/features/code-judge-with-llm-calls/` -## LLM Judge Prompt Template +## LLM Judge Prompt Templates -LLM judges use markdown prompts. AgentV handles the output format automatically. +LLM judges support two types of prompt templates: -**Available Template Variables:** +### Text Templates (Markdown) + +Simple markdown files with variable substitution. AgentV handles the output format automatically. + +### TypeScript/JavaScript Templates + +For dynamic prompt generation with full programming capabilities. Uses the same subprocess pattern as code evaluators. 
+ +**YAML Configuration:** + +```yaml +evaluators: + - name: custom-eval + type: llm_judge + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: # Optional, passed to script + rubric: "Your rubric here" + strictMode: true +``` + +**TypeScript Template:** + +```typescript +#!/usr/bin/env bun +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => { + const rubric = ctx.config?.rubric as string | undefined; + + return `You are evaluating an AI assistant's response. + +## Question +${ctx.question} + +## Candidate Answer +${ctx.candidateAnswer} + +${ctx.referenceAnswer ? `## Reference Answer\n${ctx.referenceAnswer}` : ''} + +${rubric ? `## Evaluation Criteria\n${rubric}` : ''} + +Evaluate and provide a score from 0 to 1.`; +}); +``` + +**Available context fields:** `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary` + +**See also:** `examples/features/prompt-template-sdk/` + +--- + +## Text Template Variables + +**Available variables for markdown templates:** - `{{question}}` - The original question/task - `{{expected_outcome}}` - What the answer should accomplish - `{{candidate_answer}}` - The actual output to evaluate