From 40e8fc658cba33e7277bc56a992ae52d6ecb0e38 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 07:24:35 +0000 Subject: [PATCH 1/9] feat(eval): add definePromptTemplate SDK wrapper for executable prompt templates Add TypeScript/JavaScript support for custom evaluator prompts using the same subprocess pattern as code judges. Changes: - Add PromptTemplateInputSchema and definePromptTemplate to @agentv/eval - Update orchestrator to execute .ts/.js prompt files as subprocesses - Add config and resolvedPromptPath to LlmJudgeEvaluatorConfig - Skip validation for executable prompt templates in evaluator parser - Add unit tests for PromptTemplateInputSchema - Add integration tests for executable prompt templates - Add example in examples/features/prompt-template-sdk/ Co-Authored-By: Claude Opus 4.5 --- .../features/prompt-template-sdk/README.md | 58 ++++++ .../prompt-template-sdk/evals/dataset.yaml | 51 +++++ .../prompts/custom-evaluator.ts | 48 +++++ .../evaluation/loaders/evaluator-parser.ts | 30 ++- packages/core/src/evaluation/orchestrator.ts | 92 ++++++++- packages/core/src/evaluation/types.ts | 4 + .../core/test/evaluation/orchestrator.test.ts | 180 +++++++++++++++++ packages/eval/src/index.ts | 47 +++++ packages/eval/src/prompt-template.ts | 104 ++++++++++ packages/eval/src/schemas.ts | 20 ++ .../eval/test/define-prompt-template.test.ts | 182 ++++++++++++++++++ 11 files changed, 800 insertions(+), 16 deletions(-) create mode 100644 examples/features/prompt-template-sdk/README.md create mode 100644 examples/features/prompt-template-sdk/evals/dataset.yaml create mode 100644 examples/features/prompt-template-sdk/prompts/custom-evaluator.ts create mode 100644 packages/eval/src/prompt-template.ts create mode 100644 packages/eval/test/define-prompt-template.test.ts diff --git a/examples/features/prompt-template-sdk/README.md b/examples/features/prompt-template-sdk/README.md new file mode 100644 index 00000000..3b2347d4 --- /dev/null +++ b/examples/features/prompt-template-sdk/README.md @@ -0,0 +1,58 @@ +# Prompt Template SDK + +This example demonstrates using TypeScript files for custom LLM judge prompts using the `definePromptTemplate` helper from `@agentv/eval`. + +## Features + +- **Type-safe prompt generation**: Full TypeScript support with autocomplete for context fields +- **Conditional logic**: Use JavaScript/TypeScript conditionals for dynamic prompts +- **Config pass-through**: Access custom config from YAML in your prompt template +- **Same pattern as code judges**: Follows the familiar subprocess pattern + +## How It Works + +Instead of static text files with `{{variable}}` placeholders, you can use TypeScript files that export a prompt template: + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` + Question: ${ctx.question} + Answer: ${ctx.candidateAnswer} + + ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''} +`); +``` + +The template receives evaluation context via stdin (JSON) and outputs the prompt string to stdout. 
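+For illustration, the stdin payload is a snake_case JSON object along these lines. This is only a sketch with invented values; the field names follow the payload built in `orchestrator.ts`, and fields that are absent for a given eval case may be omitted or `null`:
+
+```json
+{
+  "question": "What are the main benefits of TypeScript over JavaScript?",
+  "candidate_answer": "TypeScript adds static type checking and better tooling.",
+  "reference_answer": "TypeScript provides static type checking, better IDE support, and improved maintainability.",
+  "expected_outcome": "The CLI provides a clear answer about TypeScript benefits.",
+  "input_messages": [
+    { "role": "user", "content": "What are the main benefits of TypeScript over JavaScript?" }
+  ],
+  "guideline_files": [],
+  "input_files": [],
+  "output_messages": null,
+  "trace_summary": null,
+  "config": { "rubric": "- Must mention static typing" }
+}
+```
+
+After validation, the same fields are exposed to the handler in camelCase (for example, `candidate_answer` becomes `ctx.candidateAnswer`).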
+ +## Available Context Fields + +- `question` - The eval case question +- `candidateAnswer` - The agent's response being evaluated +- `referenceAnswer` - Optional reference answer +- `expectedOutcome` - Optional expected outcome +- `expectedMessages` - Optional expected messages +- `outputMessages` - Optional output messages from agent +- `guidelineFiles` - Paths to guideline files +- `inputFiles` - Paths to input files +- `inputMessages` - Input messages to agent +- `traceSummary` - Optional trace summary with tool usage metrics +- `config` - Optional pass-through config from YAML + +## Running + +```bash +bun agentv eval examples/features/prompt-template-sdk/evals/dataset.yaml --dry-run +``` + +## File Structure + +``` +prompt-template-sdk/ + evals/ + dataset.yaml # Eval cases using TypeScript prompt + prompts/ + custom-evaluator.ts # TypeScript prompt template + README.md +``` diff --git a/examples/features/prompt-template-sdk/evals/dataset.yaml b/examples/features/prompt-template-sdk/evals/dataset.yaml new file mode 100644 index 00000000..a820cd48 --- /dev/null +++ b/examples/features/prompt-template-sdk/evals/dataset.yaml @@ -0,0 +1,51 @@ +# Prompt Template SDK Demo +# Demonstrates using TypeScript/JavaScript files for custom evaluator prompts. + +description: Demonstrates TypeScript prompt templates for custom LLM judge prompts + +# Uses the CLI target defined in .agentv/targets.yaml +execution: + target: local_cli + +evalcases: + - id: prompt-template-basic + expected_outcome: The CLI provides a clear answer about TypeScript benefits. + + input_messages: + - role: user + content: + - type: text + value: What are the main benefits of TypeScript over JavaScript? + + reference_answer: |- + TypeScript provides static type checking, better IDE support, and improved maintainability. + + execution: + evaluators: + - name: custom-prompt-eval + type: llm_judge + prompt: ../prompts/custom-evaluator.ts + + - id: prompt-template-with-config + expected_outcome: The CLI explains async/await correctly. + + input_messages: + - role: user + content: + - type: text + value: Explain async/await in JavaScript. + + reference_answer: |- + Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous. + + execution: + evaluators: + - name: strict-eval + type: llm_judge + prompt: ../prompts/custom-evaluator.ts + config: + rubric: |- + - Must mention Promises + - Must explain the synchronous-looking syntax + - Should provide an example or use case + strictMode: true diff --git a/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts new file mode 100644 index 00000000..f4cabfad --- /dev/null +++ b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts @@ -0,0 +1,48 @@ +#!/usr/bin/env bun +/** + * Custom Prompt Template Demo + * + * Uses the declarative definePromptTemplate helper to generate + * a custom evaluation prompt with full TypeScript support. + */ +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => { + // Access typed config from YAML + const rubric = ctx.config?.rubric as string | undefined; + const strictMode = ctx.config?.strictMode as boolean | undefined; + + // Build conditional sections + const referenceSection = ctx.referenceAnswer + ? `\n## Reference Answer\n${ctx.referenceAnswer}` + : ''; + + const rubricSection = rubric ? `\n## Evaluation Rubric\n${rubric}` : ''; + + const strictWarning = strictMode + ? 
'\n**Note:** Strict mode enabled - minor inaccuracies should result in lower scores.' + : ''; + + return `You are evaluating an AI assistant's response. + +## Question +${ctx.question} + +## Candidate Answer +${ctx.candidateAnswer} +${referenceSection} +${rubricSection} +${strictWarning} + +## Instructions +Evaluate the candidate answer based on: +1. Correctness - Does it accurately answer the question? +2. Completeness - Does it address all parts of the question? +3. Clarity - Is the response clear and well-structured? + +Respond with a JSON object containing: +- score: A number from 0 to 1 +- reasoning: Brief explanation of your evaluation +- hits: Array of positive aspects +- misses: Array of issues or missing elements`; +}); diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 06f4afa6..7b468b6c 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -528,13 +528,18 @@ export async function parseEvaluators( const resolved = await resolveFileReference(prompt, searchRoots); if (resolved.resolvedPath) { promptPath = path.resolve(resolved.resolvedPath); - // Validate custom prompt content upfront - throws error if validation fails - try { - await validateCustomPromptContent(promptPath); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - // Add context and re-throw for the caller to handle - throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); + // Skip validation for executable prompt templates (.ts/.js files) + // These are executed as subprocesses, not parsed as text templates + const ext = path.extname(promptPath).toLowerCase(); + if (ext !== '.ts' && ext !== '.js') { + // Validate custom prompt content upfront - throws error if validation fails + try { + await validateCustomPromptContent(promptPath); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + // Add context and re-throw for the caller to handle + throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); + } } } else { logWarning( @@ -577,13 +582,24 @@ export async function parseEvaluators( const weight = validateWeight(rawEvaluator.weight, name, evalId); + // Collect unrecognized properties as pass-through config (for executable prompt templates) + const knownProps = new Set(['name', 'type', 'prompt', 'model', 'rubrics', 'weight']); + const config: Record = {}; + for (const [key, value] of Object.entries(rawEvaluator)) { + if (!knownProps.has(key) && value !== undefined) { + config[key] = value as JsonValue; + } + } + evaluators.push({ name, type: 'llm_judge', prompt, promptPath, + ...(promptPath ? { resolvedPromptPath: promptPath } : {}), ...(parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}), ...(weight !== undefined ? { weight } : {}), + ...(Object.keys(config).length > 0 ? 
{ config } : {}), }); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 94f24ac6..ce8a73c6 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -3,6 +3,7 @@ import path from 'node:path'; import micromatch from 'micromatch'; import pLimit from 'p-limit'; +import { toSnakeCaseDeep } from './case-conversion.js'; import { type ChildEvaluatorResult, CodeEvaluator, @@ -15,6 +16,7 @@ import { LlmJudgeEvaluator, TokenUsageEvaluator, ToolTrajectoryEvaluator, + executeScript, isNonEmptyString, scoreToVerdict, } from './evaluators.js'; @@ -895,6 +897,8 @@ async function runEvaluatorList(options: { promptInputs, now, judgeProvider, + outputMessages, + traceSummary, }); const weight = evaluator.weight ?? 1.0; scored.push({ score, name: evaluator.name, type: evaluator.type, weight }); @@ -1251,6 +1255,8 @@ async function runLlmJudgeEvaluator(options: { readonly promptInputs: PromptInputs; readonly now: Date; readonly judgeProvider?: Provider; + readonly outputMessages?: readonly OutputMessage[]; + readonly traceSummary?: TraceSummary; }): Promise { const { config, @@ -1263,8 +1269,16 @@ async function runLlmJudgeEvaluator(options: { promptInputs, now, judgeProvider, + outputMessages, + traceSummary, } = options; - const customPrompt = await resolveCustomPrompt(config); + const customPrompt = await resolveCustomPrompt(config, { + evalCase, + candidate, + outputMessages, + traceSummary, + config: config.config, + }); return evaluatorRegistry.llm_judge.evaluate({ evalCase, @@ -1280,20 +1294,80 @@ async function runLlmJudgeEvaluator(options: { }); } -async function resolveCustomPrompt(config: { - readonly prompt?: string; - readonly promptPath?: string; -}): Promise { - if (config.promptPath) { +interface ResolveCustomPromptContext { + readonly evalCase: EvalCase; + readonly candidate: string; + readonly outputMessages?: readonly OutputMessage[]; + readonly traceSummary?: TraceSummary; + readonly config?: Record; +} + +async function resolveCustomPrompt( + promptConfig: { + readonly prompt?: string; + readonly promptPath?: string; + readonly resolvedPromptPath?: string; + readonly config?: Record; + }, + context?: ResolveCustomPromptContext, +): Promise { + const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath; + + if (promptPath) { + const ext = path.extname(promptPath).toLowerCase(); + + // Executable prompt template (same pattern as code judges) + if (ext === '.ts' || ext === '.js') { + if (!context) { + throw new Error('Context required for executable prompt templates (.ts/.js files)'); + } + return executePromptTemplate(promptPath, context, promptConfig.config); + } + + // Static text file (existing behavior) try { - const content = await readTextFile(config.promptPath); + const content = await readTextFile(promptPath); return content; } catch (error) { const message = error instanceof Error ? 
error.message : String(error); - console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`); + console.warn(`Could not read custom prompt at ${promptPath}: ${message}`); } } - return config.prompt; + return promptConfig.prompt; +} + +async function executePromptTemplate( + scriptPath: string, + context: ResolveCustomPromptContext, + config?: Record, +): Promise { + // Build payload matching code judge input format for consistency + const payload = { + question: context.evalCase.question, + expectedOutcome: context.evalCase.expected_outcome, + expectedMessages: context.evalCase.expected_messages, + referenceAnswer: context.evalCase.reference_answer, + candidateAnswer: context.candidate, + outputMessages: context.outputMessages ?? null, + guidelineFiles: context.evalCase.guideline_paths, + inputFiles: context.evalCase.file_paths.filter( + (p) => !context.evalCase.guideline_paths.includes(p), + ), + inputMessages: context.evalCase.input_messages, + traceSummary: context.traceSummary ?? null, + config: config ?? context.config ?? null, + }; + + const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + const cwd = path.dirname(scriptPath); + + try { + const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, undefined, cwd); + return stdout.trim(); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Prompt template execution failed: ${message}`); + } } function filterEvalCases(evalCases: readonly EvalCase[], filter?: string): readonly EvalCase[] { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index f41a63fb..eb1e458c 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -196,8 +196,12 @@ export type LlmJudgeEvaluatorConfig = { readonly type: 'llm_judge'; readonly prompt?: string; readonly promptPath?: string; + /** Resolved absolute path for prompt file (used by executable .ts/.js prompts) */ + readonly resolvedPromptPath?: string; readonly rubrics?: readonly RubricItem[]; readonly weight?: number; + /** Pass-through configuration for custom evaluator prompts */ + readonly config?: Record; }; /** diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 7d27093c..4db19fcf 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -957,4 +957,184 @@ describe('runEvalCase trace integration', () => { expect(result.score).toBe(0); }); }); + + describe('executable prompt templates', () => { + it('executes TypeScript prompt template and uses output as custom prompt', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-')); + const promptPath = path.join(tmpDir, 'my-prompt.ts'); + + // Write a simple TypeScript prompt template that reads stdin manually + // (avoiding dependency on @agentv/eval which won't resolve from temp dir) + writeFileSync( + promptPath, + `import { readFileSync } from 'fs'; +const stdin = readFileSync(0, 'utf8'); +const input = JSON.parse(stdin); +console.log(\`Question: \${input.question} +Candidate: \${input.candidate_answer} +Reference: \${input.reference_answer ?? 
'none'}\`); +`, + ); + + // Custom judge that captures the prompt it receives + let receivedQuestion = ''; + const captureJudge = { + kind: 'llm_judge' as const, + async evaluate(context: { evalCase: EvalCase; evaluatorTemplateOverride?: string }) { + // The evaluatorTemplateOverride should contain our custom prompt + receivedQuestion = context.evaluatorTemplateOverride ?? ''; + return { + score: 1.0, + verdict: 'pass' as const, + hits: ['Test passed'], + misses: [], + expectedAspectCount: 1, + }; + }, + }; + + const provider = new SequenceProvider('mock', { + responses: [ + { + outputMessages: [{ role: 'assistant', content: 'The answer is 4' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: { + ...baseTestCase, + question: 'What is 2+2?', + reference_answer: 'The sum is 4', + evaluators: [ + { + name: 'ts-prompt-eval', + type: 'llm_judge', + promptPath: promptPath, + resolvedPromptPath: promptPath, + }, + ], + }, + provider, + target: baseTarget, + evaluators: { llm_judge: captureJudge }, + }); + + expect(result.score).toBe(1.0); + expect(receivedQuestion).toContain('Question: What is 2+2?'); + expect(receivedQuestion).toContain('Candidate: The answer is 4'); + expect(receivedQuestion).toContain('Reference: The sum is 4'); + }); + + it('executes JavaScript prompt template', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-template-js-')); + const promptPath = path.join(tmpDir, 'my-prompt.js'); + + // Write a simple JS prompt template that reads stdin manually + writeFileSync( + promptPath, + `const fs = require('fs'); +const stdin = fs.readFileSync(0, 'utf8'); +const input = JSON.parse(stdin); +console.log('Question: ' + input.question + '\\nAnswer: ' + input.candidate_answer); +`, + ); + + let receivedPrompt = ''; + const captureJudge = { + kind: 'llm_judge' as const, + async evaluate(context: { evaluatorTemplateOverride?: string }) { + receivedPrompt = context.evaluatorTemplateOverride ?? ''; + return { + score: 1.0, + verdict: 'pass' as const, + hits: [], + misses: [], + expectedAspectCount: 1, + }; + }, + }; + + const provider = new SequenceProvider('mock', { + responses: [ + { + outputMessages: [{ role: 'assistant', content: 'Test response' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: { + ...baseTestCase, + question: 'Test question', + evaluators: [ + { + name: 'js-prompt-eval', + type: 'llm_judge', + promptPath: promptPath, + resolvedPromptPath: promptPath, + }, + ], + }, + provider, + target: baseTarget, + evaluators: { llm_judge: captureJudge }, + }); + + expect(result.score).toBe(1.0); + expect(receivedPrompt).toContain('Question: Test question'); + expect(receivedPrompt).toContain('Answer: Test response'); + }); + + it('falls back to text file reading for .txt files', async () => { + const tmpDir = mkdtempSync(path.join(tmpdir(), 'prompt-txt-')); + const promptPath = path.join(tmpDir, 'my-prompt.txt'); + + // Write a static text prompt + writeFileSync(promptPath, 'Static prompt content from text file'); + + let receivedPrompt = ''; + const captureJudge = { + kind: 'llm_judge' as const, + async evaluate(context: { evaluatorTemplateOverride?: string }) { + receivedPrompt = context.evaluatorTemplateOverride ?? 
''; + return { + score: 1.0, + verdict: 'pass' as const, + hits: [], + misses: [], + expectedAspectCount: 1, + }; + }, + }; + + const provider = new SequenceProvider('mock', { + responses: [ + { + outputMessages: [{ role: 'assistant', content: 'Response' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: { + ...baseTestCase, + evaluators: [ + { + name: 'txt-prompt-eval', + type: 'llm_judge', + promptPath: promptPath, + resolvedPromptPath: promptPath, + }, + ], + }, + provider, + target: baseTarget, + evaluators: { llm_judge: captureJudge }, + }); + + expect(result.score).toBe(1.0); + expect(receivedPrompt).toBe('Static prompt content from text file'); + }); + }); }); diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 8eb53afc..ddb3161c 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -47,12 +47,14 @@ export { MessageSchema, ToolCallSchema, TokenUsageSchema, + PromptTemplateInputSchema, type CodeJudgeInput, type CodeJudgeResult, type TraceSummary, type Message, type ToolCall, type TokenUsage, + type PromptTemplateInput, } from './schemas.js'; // Re-export target client @@ -69,10 +71,12 @@ export { // Re-export Zod for typed config support export { z } from 'zod'; +import { type PromptTemplateHandler, runPromptTemplate } from './prompt-template.js'; // Import runtime import { type CodeJudgeHandler, runCodeJudge } from './runtime.js'; export type { CodeJudgeHandler }; +export type { PromptTemplateHandler }; /** * Define a code judge evaluator with automatic stdin/stdout handling. @@ -122,3 +126,46 @@ export function defineCodeJudge(handler: CodeJudgeHandler): void { // Run immediately when module is loaded runCodeJudge(handler); } + +/** + * Define a prompt template with automatic stdin/stdout handling. + * + * This function: + * 1. Reads JSON from stdin (snake_case format) + * 2. Converts to camelCase and validates with Zod + * 3. Calls your handler with typed input + * 4. Outputs the generated prompt string to stdout + * 5. Handles errors gracefully with proper exit codes + * + * @param handler - Function that generates the prompt string from input + * + * @example + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => ` + * Question: ${ctx.question} + * Answer: ${ctx.candidateAnswer} + * + * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''} + * `); + * ``` + * + * @example With conditional logic + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => { + * const rubric = ctx.config?.rubric as string | undefined; + * return ` + * Question: ${ctx.question} + * Candidate Answer: ${ctx.candidateAnswer} + * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} + * `; + * }); + * ``` + */ +export function definePromptTemplate(handler: PromptTemplateHandler): void { + // Run immediately when module is loaded + runPromptTemplate(handler); +} diff --git a/packages/eval/src/prompt-template.ts b/packages/eval/src/prompt-template.ts new file mode 100644 index 00000000..c96b1fdd --- /dev/null +++ b/packages/eval/src/prompt-template.ts @@ -0,0 +1,104 @@ +/** + * Runtime for prompt template evaluators. + * Handles stdin parsing, validation, error handling, and string output. 
+ */ +import { readFileSync } from 'node:fs'; + +import { toCamelCaseDeep } from './case-conversion.js'; +import { type PromptTemplateInput, PromptTemplateInputSchema } from './schemas.js'; + +/** + * Handler function type for prompt templates. + * Returns the prompt string to use for evaluation. + */ +export type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise; + +/** + * Read stdin synchronously (works in both Node.js and Bun). + */ +function readStdin(): string { + return readFileSync(0, 'utf8'); +} + +/** + * Run a prompt template handler with full stdin/stdout handling. + * This is the internal implementation called by definePromptTemplate. + */ +export async function runPromptTemplate(handler: PromptTemplateHandler): Promise { + try { + // 1. Read stdin + const stdin = readStdin(); + + // 2. Parse JSON + const rawInput = JSON.parse(stdin) as Record; + + // 3. Convert snake_case to camelCase + const camelInput = toCamelCaseDeep(rawInput); + + // 4. Validate input with Zod + const input = PromptTemplateInputSchema.parse(camelInput); + + // 5. Run handler + const prompt = await handler(input); + + // 6. Output raw string (not JSON) - the prompt itself + console.log(prompt); + } catch (error) { + // Output error to stderr and exit with non-zero code + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + } +} + +/** + * Define a prompt template with automatic stdin/stdout handling. + * + * This function: + * 1. Reads JSON from stdin (snake_case format) + * 2. Converts to camelCase and validates with Zod + * 3. Calls your handler with typed input + * 4. Outputs the generated prompt string to stdout + * 5. Handles errors gracefully with proper exit codes + * + * @param handler - Function that generates the prompt string from input + * + * @example + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => ` + * Question: ${ctx.question} + * Answer: ${ctx.candidateAnswer} + * + * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''} + * `); + * ``` + * + * @example With conditional logic + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate((ctx) => { + * const rubric = ctx.config?.rubric as string | undefined; + * return ` + * Question: ${ctx.question} + * Candidate Answer: ${ctx.candidateAnswer} + * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} + * `; + * }); + * ``` + * + * @example Async handler + * ```typescript + * import { definePromptTemplate } from '@agentv/eval'; + * + * export default definePromptTemplate(async (ctx) => { + * // Async operations are supported + * return `Question: ${ctx.question}\nAnswer: ${ctx.candidateAnswer}`; + * }); + * ``` + */ +export function definePromptTemplate(handler: PromptTemplateHandler): void { + // Run immediately when module is loaded + runPromptTemplate(handler); +} diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index f9afa6b3..af6bde4f 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -88,3 +88,23 @@ export type TraceSummary = z.infer; export type Message = z.infer; export type ToolCall = z.infer; export type TokenUsage = z.infer; + +/** + * Prompt template input schema (camelCase, converted from snake_case wire format). + * Uses the same fields as CodeJudgeInput for consistency. 
+ */ +export const PromptTemplateInputSchema = z.object({ + question: z.string(), + expectedOutcome: z.string().optional(), + expectedMessages: z.array(MessageSchema).optional(), + referenceAnswer: z.string().optional(), + candidateAnswer: z.string(), + outputMessages: z.array(MessageSchema).nullable().optional(), + guidelineFiles: z.array(z.string()).optional(), + inputFiles: z.array(z.string()).optional(), + inputMessages: z.array(MessageSchema).optional(), + traceSummary: TraceSummarySchema.nullable().optional(), + config: z.record(z.unknown()).nullable().optional(), +}); + +export type PromptTemplateInput = z.infer; diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts new file mode 100644 index 00000000..ab0ab831 --- /dev/null +++ b/packages/eval/test/define-prompt-template.test.ts @@ -0,0 +1,182 @@ +import { describe, expect, it } from 'bun:test'; + +import { type PromptTemplateInput, PromptTemplateInputSchema } from '../src/schemas.js'; + +describe('PromptTemplateInputSchema', () => { + const validInput = { + question: 'What is 2+2?', + candidateAnswer: 'The answer is 4', + }; + + it('parses minimal valid input', () => { + const result = PromptTemplateInputSchema.parse(validInput); + expect(result.question).toBe('What is 2+2?'); + expect(result.candidateAnswer).toBe('The answer is 4'); + }); + + it('accepts optional expectedOutcome', () => { + const inputWithOutcome = { + ...validInput, + expectedOutcome: 'The answer should be 4', + }; + const result = PromptTemplateInputSchema.parse(inputWithOutcome); + expect(result.expectedOutcome).toBe('The answer should be 4'); + }); + + it('accepts optional expectedMessages', () => { + const inputWithMessages = { + ...validInput, + expectedMessages: [{ role: 'assistant', content: '4' }], + }; + const result = PromptTemplateInputSchema.parse(inputWithMessages); + expect(result.expectedMessages?.[0].content).toBe('4'); + }); + + it('accepts optional referenceAnswer', () => { + const inputWithReference = { + ...validInput, + referenceAnswer: 'The sum of 2 and 2 is 4', + }; + const result = PromptTemplateInputSchema.parse(inputWithReference); + expect(result.referenceAnswer).toBe('The sum of 2 and 2 is 4'); + }); + + it('accepts optional traceSummary', () => { + const inputWithTrace = { + ...validInput, + traceSummary: { + eventCount: 3, + toolNames: ['read', 'write'], + toolCallsByName: { read: 2, write: 1 }, + errorCount: 0, + }, + }; + const result = PromptTemplateInputSchema.parse(inputWithTrace); + expect(result.traceSummary?.eventCount).toBe(3); + expect(result.traceSummary?.toolNames).toEqual(['read', 'write']); + }); + + it('accepts null traceSummary', () => { + const inputWithNullTrace = { + ...validInput, + traceSummary: null, + }; + const result = PromptTemplateInputSchema.parse(inputWithNullTrace); + expect(result.traceSummary).toBeNull(); + }); + + it('accepts optional config', () => { + const inputWithConfig = { + ...validInput, + config: { rubric: 'Check for correctness', strictMode: true }, + }; + const result = PromptTemplateInputSchema.parse(inputWithConfig); + expect(result.config).toEqual({ rubric: 'Check for correctness', strictMode: true }); + }); + + it('accepts optional guidelineFiles', () => { + const inputWithGuidelines = { + ...validInput, + guidelineFiles: ['/path/to/guideline1.txt', '/path/to/guideline2.txt'], + }; + const result = PromptTemplateInputSchema.parse(inputWithGuidelines); + expect(result.guidelineFiles).toEqual(['/path/to/guideline1.txt', 
'/path/to/guideline2.txt']); + }); + + it('accepts optional inputFiles', () => { + const inputWithFiles = { + ...validInput, + inputFiles: ['/path/to/input1.txt'], + }; + const result = PromptTemplateInputSchema.parse(inputWithFiles); + expect(result.inputFiles).toEqual(['/path/to/input1.txt']); + }); + + it('accepts optional inputMessages', () => { + const inputWithMessages = { + ...validInput, + inputMessages: [{ role: 'user', content: 'What is 2+2?' }], + }; + const result = PromptTemplateInputSchema.parse(inputWithMessages); + expect(result.inputMessages?.[0].content).toBe('What is 2+2?'); + }); + + it('accepts optional outputMessages with toolCalls', () => { + const inputWithOutput = { + ...validInput, + outputMessages: [ + { + role: 'assistant', + content: 'Reading file...', + toolCalls: [{ tool: 'read', input: { path: 'test.txt' } }], + }, + ], + }; + const result = PromptTemplateInputSchema.parse(inputWithOutput); + expect(result.outputMessages?.[0].toolCalls?.[0].tool).toBe('read'); + }); + + it('accepts full input with all optional fields', () => { + const fullInput = { + question: 'What is 2+2?', + expectedOutcome: 'The answer should be 4', + expectedMessages: [{ role: 'assistant', content: '4' }], + referenceAnswer: 'The sum is 4', + candidateAnswer: 'The answer is 4', + outputMessages: [{ role: 'assistant', content: 'The answer is 4' }], + guidelineFiles: ['/path/to/guideline.txt'], + inputFiles: ['/path/to/input.txt'], + inputMessages: [{ role: 'user', content: 'What is 2+2?' }], + traceSummary: { + eventCount: 1, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + }, + config: { rubric: 'Check correctness' }, + }; + const result = PromptTemplateInputSchema.parse(fullInput); + expect(result.question).toBe('What is 2+2?'); + expect(result.expectedOutcome).toBe('The answer should be 4'); + expect(result.referenceAnswer).toBe('The sum is 4'); + expect(result.candidateAnswer).toBe('The answer is 4'); + expect(result.config).toEqual({ rubric: 'Check correctness' }); + }); +}); + +describe('Schema type inference', () => { + it('PromptTemplateInput has expected shape', () => { + // Type-level test: ensure inferred types have expected properties + const input: PromptTemplateInput = { + question: 'test', + candidateAnswer: 'test', + }; + + // These should all type-check correctly + const _q: string = input.question; + const _c: string = input.candidateAnswer; + const _trace: PromptTemplateInput['traceSummary'] = undefined; + const _config: PromptTemplateInput['config'] = null; + const _ref: PromptTemplateInput['referenceAnswer'] = undefined; + const _outcome: PromptTemplateInput['expectedOutcome'] = undefined; + + expect(input.question).toBe('test'); + }); + + it('PromptTemplateInput allows all optional fields to be omitted', () => { + const minimalInput: PromptTemplateInput = { + question: 'test question', + candidateAnswer: 'test answer', + }; + + expect(minimalInput.expectedOutcome).toBeUndefined(); + expect(minimalInput.expectedMessages).toBeUndefined(); + expect(minimalInput.referenceAnswer).toBeUndefined(); + expect(minimalInput.outputMessages).toBeUndefined(); + expect(minimalInput.guidelineFiles).toBeUndefined(); + expect(minimalInput.inputFiles).toBeUndefined(); + expect(minimalInput.inputMessages).toBeUndefined(); + expect(minimalInput.traceSummary).toBeUndefined(); + expect(minimalInput.config).toBeUndefined(); + }); +}); From 303f5db31d3c188830944d7365daa27730ae677a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 07:31:54 +0000 Subject: [PATCH 
2/9] chore: archive adopt-ts-template-prompts openspec Move OpenSpec to archive after implementation is complete. Co-Authored-By: Claude Opus 4.5 --- .../design.md | 169 ++++++++++++++++++ .../proposal.md | 37 ++++ .../specs/custom-evaluator-prompts/spec.md | 92 ++++++++++ .../tasks.md | 22 +++ 4 files changed, 320 insertions(+) create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md create mode 100644 docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md new file mode 100644 index 00000000..e0350d4c --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md @@ -0,0 +1,169 @@ +# Design: TypeScript Template Literals for Evaluator Prompts + +## Architecture + +Follow the established code judge pattern: subprocess execution with an SDK wrapper that handles stdin/stdout. + +### SDK: `definePromptTemplate` + +Add to `@agentv/eval` package, mirroring `defineCodeJudge`: + +```typescript +// packages/eval/src/prompt-template.ts +import { readFileSync } from 'node:fs'; +import { toCamelCaseDeep } from './case-conversion.js'; +import { PromptTemplateInputSchema, type PromptTemplateInput } from './schemas.js'; + +export type PromptTemplateHandler = ( + input: PromptTemplateInput, +) => string | Promise; + +function readStdin(): string { + return readFileSync(0, 'utf8'); +} + +export async function runPromptTemplate(handler: PromptTemplateHandler): Promise { + try { + const stdin = readStdin(); + const rawInput = JSON.parse(stdin) as Record; + const camelInput = toCamelCaseDeep(rawInput); + const input = PromptTemplateInputSchema.parse(camelInput); + + const prompt = await handler(input); + + // Output raw string (not JSON) - the prompt itself + console.log(prompt); + } catch (error) { + console.error(error instanceof Error ? 
error.message : String(error)); + process.exit(1); + } +} + +export function definePromptTemplate(handler: PromptTemplateHandler): void { + runPromptTemplate(handler); +} +``` + +### Input Schema + +Reuse the same input shape as code judges for consistency: + +```typescript +// packages/eval/src/schemas.ts +export const PromptTemplateInputSchema = z.object({ + question: z.string(), + expectedOutcome: z.string().optional(), + expectedMessages: z.array(MessageSchema).optional(), + referenceAnswer: z.string().optional(), + candidateAnswer: z.string(), + outputMessages: z.array(MessageSchema).nullable().optional(), + guidelineFiles: z.array(z.string()).optional(), + inputFiles: z.array(z.string()).optional(), + inputMessages: z.array(MessageSchema).optional(), + traceSummary: z.string().nullable().optional(), + config: z.record(z.unknown()).nullable().optional(), +}); + +export type PromptTemplateInput = z.infer; +``` + +### Core: Loader Changes + +Update `resolveCustomPrompt` in `orchestrator.ts` to detect executable prompt files: + +```typescript +async function resolveCustomPrompt( + promptPath: string, + context: EvaluationContext, + cwd?: string, +): Promise { + const ext = path.extname(promptPath).toLowerCase(); + + // Executable prompt template (same pattern as code judges) + if (ext === '.ts' || ext === '.js') { + return executePromptTemplate(promptPath, context, cwd); + } + + // Static text file (existing behavior) + const content = await readFile(promptPath, 'utf8'); + return substituteVariables(content, context); +} + +async function executePromptTemplate( + scriptPath: string, + context: EvaluationContext, + cwd?: string, +): Promise { + const payload = buildCodeJudgePayload(context); // Reuse existing payload builder + const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + + // Execute using existing infrastructure + const stdout = await executeScript( + ['bun', 'run', scriptPath], + inputJson, + undefined, // timeout + cwd, + ); + + return stdout.trim(); +} +``` + +## User Experience + +### Writing a Prompt Template + +```typescript +// my-evaluator-prompt.ts +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` +You are evaluating a response to the following question: + +Question: ${ctx.question} + +Candidate Answer: +${ctx.candidateAnswer} + +${ctx.referenceAnswer ? `Reference Answer:\n${ctx.referenceAnswer}` : ''} + +${ctx.config?.rubric ? `Evaluation Criteria:\n${ctx.config.rubric}` : ''} + +Evaluate the candidate answer and provide a score from 0 to 1. +`); +``` + +### YAML Configuration + +```yaml +cases: + - id: example + question: "What is the capital of France?" + evaluator: + type: llm_judge + prompt: ./prompts/my-evaluator-prompt.ts # Detected as executable +``` + +## Trade-offs + +| Aspect | Subprocess Pattern | In-process (jiti) | +|--------|-------------------|-------------------| +| Consistency | Same as code judges | New pattern | +| Dependencies | None (existing infra) | Adds jiti | +| Performance | Process spawn overhead | Faster | +| Isolation | Sandboxed | In-process | +| Language support | Any (TS, Python, etc.) | TS/JS only | + +The subprocess pattern is preferred because: +1. **Consistency** - Same mental model as code judges +2. **No new dependencies** - Uses existing `executeScript` infrastructure +3. **Isolation** - User code runs in separate process +4. 
**Language agnostic** - Could support Python prompt templates in future + +## Alternatives Considered + +### In-process loading with jiti +Rejected: Adds dependency, inconsistent with code judge pattern, runs user code in main process. + +### Require pre-compiled JS only +Rejected: Worse DX - users already expect `bun run` to handle `.ts` files. diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md new file mode 100644 index 00000000..034e8a7a --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md @@ -0,0 +1,37 @@ +# Adopt TypeScript Template Literals for Custom Evaluator Prompts + +## Summary +Enable the use of native TypeScript template literals for defining custom evaluator prompts using the same subprocess pattern as code judges. This provides type safety, complex logic support, and a consistent developer experience. + +## Problem +Currently, `LlmJudgeEvaluator` relies on string templates with `{{variable}}` placeholders. This approach: +- Lacks type safety: No compile-time check if variables exist in the context. +- Has limited logic: Conditional logic or loops require complex template syntax or are impossible. +- Is error-prone: Typos in placeholders are only caught at runtime. + +## Solution +Follow the established code judge pattern: + +1. Add a `definePromptTemplate` SDK wrapper to `@agentv/eval` that handles stdin/stdout, mirroring `defineCodeJudge`. +2. Update the evaluator loader to detect `.ts`/`.js` prompt files and execute them as subprocesses. +3. The script receives evaluation context via stdin (JSON), returns the prompt string via stdout. + +Users write prompt templates the same way they write code judges: + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((context) => ` + Question: ${context.question} + Answer: ${context.candidateAnswer} + + ${context.config?.includeRubric ? `Rubric: ${context.referenceAnswer}` : ''} +`); +``` + +## Impact +- **Core**: `orchestrator.ts` loader logic to detect and execute `.ts`/`.js` prompts as subprocesses. +- **SDK**: New `definePromptTemplate` wrapper in `@agentv/eval`. +- **DX**: Consistent pattern with code judges - same mental model. +- **Dependencies**: None - uses existing subprocess infrastructure. +- **Backward Compatibility**: Existing string-based templates and `.txt` prompt files continue to work. diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md new file mode 100644 index 00000000..8f61d3d7 --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md @@ -0,0 +1,92 @@ +# Spec: Custom Evaluator Prompts + +## ADDED Requirements + +### Requirement: SDK Wrapper for Prompt Templates +The `@agentv/eval` package MUST provide a `definePromptTemplate` helper that handles stdin/stdout, mirroring the `defineCodeJudge` pattern. 
+ +#### Scenario: Using definePromptTemplate +Given a TypeScript file that uses `definePromptTemplate` +When the file is executed as a subprocess +Then it should read evaluation context from stdin (JSON) +And output the generated prompt string to stdout + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` + Question: ${ctx.question} + Answer: ${ctx.candidateAnswer} +`); +``` + +#### Scenario: Type safety with PromptTemplateInput +Given a developer writing a prompt template +When they use `definePromptTemplate` +Then TypeScript should provide autocomplete for `ctx.question`, `ctx.candidateAnswer`, `ctx.referenceAnswer`, etc. + +#### Scenario: Async prompt generation +Given a prompt template that needs async operations +When the handler returns a Promise +Then the wrapper should await and output the resolved string + +```typescript +export default definePromptTemplate(async (ctx) => { + const extraContext = await fetchSomeData(); + return `Question: ${ctx.question}\nContext: ${extraContext}`; +}); +``` + +### Requirement: Executable Prompt File Detection +The evaluator loader MUST detect `.ts` and `.js` prompt files and execute them as subprocesses. + +#### Scenario: Loading a TypeScript prompt template +Given an eval case with `prompt: ./my-prompt.ts` +When the evaluator runs +Then it should execute the file as a subprocess using `bun run` +And pass the evaluation context via stdin as JSON +And use stdout as the prompt string + +#### Scenario: Loading a JavaScript prompt template +Given an eval case with `prompt: ./my-prompt.js` +When the evaluator runs +Then it should execute the file as a subprocess +And use stdout as the prompt string + +#### Scenario: Backward compatibility with text files +Given an eval case with `prompt: ./my-prompt.txt` +When the evaluator runs +Then it should read the file as text (existing behavior) +And apply `{{variable}}` substitution + +### Requirement: Consistent Input Schema +The prompt template input MUST use the same schema as code judges for consistency. + +#### Scenario: Input fields available +Given a prompt template handler +Then the input should include: +- `question` - the eval case question +- `candidateAnswer` - the agent's response +- `referenceAnswer` - optional reference answer +- `expectedOutcome` - optional expected outcome +- `expectedMessages` - optional expected messages +- `outputMessages` - optional output messages from agent +- `guidelineFiles` - paths to guideline files +- `inputFiles` - paths to input files +- `inputMessages` - input messages to agent +- `traceSummary` - optional trace summary +- `config` - optional pass-through config from YAML + +### Requirement: Error Handling +The subprocess execution MUST handle errors gracefully. 
+ +#### Scenario: Script exits with non-zero code +Given a prompt template script that throws an error +When it is executed +Then the evaluator should fail with a descriptive error message +And include the script's stderr in the error + +#### Scenario: Script outputs nothing +Given a prompt template script that outputs an empty string +When it is executed +Then the evaluator should use the empty string as the prompt diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md new file mode 100644 index 00000000..6ae3c569 --- /dev/null +++ b/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md @@ -0,0 +1,22 @@ +# Tasks: Adopt TypeScript Template Literals for Custom Evaluator Prompts + +## SDK (`@agentv/eval`) + +- [x] Add `PromptTemplateInput` type to `packages/eval/src/schemas.ts` (reuse CodeJudgeInput fields) +- [x] Add `definePromptTemplate` wrapper to `packages/eval/src/prompt-template.ts` +- [x] Export `definePromptTemplate` and `PromptTemplateInput` from `packages/eval/src/index.ts` + +## Core (`@agentv/core`) + +- [x] Add `executePromptTemplate` function to execute `.ts`/`.js` prompt files as subprocesses +- [x] Update `resolveCustomPrompt` in `orchestrator.ts` to detect and handle executable prompts + +## Testing + +- [x] Add unit tests for `definePromptTemplate` stdin/stdout handling +- [x] Add integration tests for executable prompt templates in eval runs + +## Documentation + +- [x] Create example prompt template in `examples/features/prompt-template-sdk/` +- [ ] Update skill reference docs with prompt template pattern From 25d23fa330a83e565240603ed16bd5ed96b415d5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 07:34:52 +0000 Subject: [PATCH 3/9] chore: archive adopt-ts-template-prompts openspec - Archive change to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/ - Create new spec openspec/specs/custom-evaluator-prompts/ Co-Authored-By: Claude Opus 4.5 --- .../design.md | 0 .../proposal.md | 0 .../specs/custom-evaluator-prompts/spec.md | 0 .../tasks.md | 16 ++-- .../specs/custom-evaluator-prompts/spec.md | 94 +++++++++++++++++++ 5 files changed, 102 insertions(+), 8 deletions(-) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/design.md (100%) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/proposal.md (100%) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/specs/custom-evaluator-prompts/spec.md (100%) rename {docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts => openspec/changes/archive/2026-01-28-adopt-ts-template-prompts}/tasks.md (55%) create mode 100644 openspec/specs/custom-evaluator-prompts/spec.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md similarity index 100% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/design.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/proposal.md similarity 
index 100% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/proposal.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/proposal.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md similarity index 100% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/specs/custom-evaluator-prompts/spec.md diff --git a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/tasks.md similarity index 55% rename from docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md rename to openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/tasks.md index 6ae3c569..2690f6d8 100644 --- a/docs/openspec/changes/archive/2025-01-28-adopt-ts-template-prompts/tasks.md +++ b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/tasks.md @@ -2,21 +2,21 @@ ## SDK (`@agentv/eval`) -- [x] Add `PromptTemplateInput` type to `packages/eval/src/schemas.ts` (reuse CodeJudgeInput fields) -- [x] Add `definePromptTemplate` wrapper to `packages/eval/src/prompt-template.ts` -- [x] Export `definePromptTemplate` and `PromptTemplateInput` from `packages/eval/src/index.ts` +- [ ] Add `PromptTemplateInput` type to `packages/eval/src/schemas.ts` (reuse CodeJudgeInput fields) +- [ ] Add `definePromptTemplate` wrapper to `packages/eval/src/prompt-template.ts` +- [ ] Export `definePromptTemplate` and `PromptTemplateInput` from `packages/eval/src/index.ts` ## Core (`@agentv/core`) -- [x] Add `executePromptTemplate` function to execute `.ts`/`.js` prompt files as subprocesses -- [x] Update `resolveCustomPrompt` in `orchestrator.ts` to detect and handle executable prompts +- [ ] Add `executePromptTemplate` function to execute `.ts`/`.js` prompt files as subprocesses +- [ ] Update `resolveCustomPrompt` in `orchestrator.ts` to detect and handle executable prompts ## Testing -- [x] Add unit tests for `definePromptTemplate` stdin/stdout handling -- [x] Add integration tests for executable prompt templates in eval runs +- [ ] Add unit tests for `definePromptTemplate` stdin/stdout handling +- [ ] Add integration tests for executable prompt templates in eval runs ## Documentation -- [x] Create example prompt template in `examples/features/prompt-template-sdk/` +- [ ] Create example prompt template in `examples/features/prompt-template-sdk/` - [ ] Update skill reference docs with prompt template pattern diff --git a/openspec/specs/custom-evaluator-prompts/spec.md b/openspec/specs/custom-evaluator-prompts/spec.md new file mode 100644 index 00000000..5485705b --- /dev/null +++ b/openspec/specs/custom-evaluator-prompts/spec.md @@ -0,0 +1,94 @@ +# custom-evaluator-prompts Specification + +## Purpose +TBD - created by archiving change adopt-ts-template-prompts. Update Purpose after archive. +## Requirements +### Requirement: SDK Wrapper for Prompt Templates +The `@agentv/eval` package MUST provide a `definePromptTemplate` helper that handles stdin/stdout, mirroring the `defineCodeJudge` pattern. 
+ +#### Scenario: Using definePromptTemplate +Given a TypeScript file that uses `definePromptTemplate` +When the file is executed as a subprocess +Then it should read evaluation context from stdin (JSON) +And output the generated prompt string to stdout + +```typescript +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => ` + Question: ${ctx.question} + Answer: ${ctx.candidateAnswer} +`); +``` + +#### Scenario: Type safety with PromptTemplateInput +Given a developer writing a prompt template +When they use `definePromptTemplate` +Then TypeScript should provide autocomplete for `ctx.question`, `ctx.candidateAnswer`, `ctx.referenceAnswer`, etc. + +#### Scenario: Async prompt generation +Given a prompt template that needs async operations +When the handler returns a Promise +Then the wrapper should await and output the resolved string + +```typescript +export default definePromptTemplate(async (ctx) => { + const extraContext = await fetchSomeData(); + return `Question: ${ctx.question}\nContext: ${extraContext}`; +}); +``` + +### Requirement: Executable Prompt File Detection +The evaluator loader MUST detect `.ts` and `.js` prompt files and execute them as subprocesses. + +#### Scenario: Loading a TypeScript prompt template +Given an eval case with `prompt: ./my-prompt.ts` +When the evaluator runs +Then it should execute the file as a subprocess using `bun run` +And pass the evaluation context via stdin as JSON +And use stdout as the prompt string + +#### Scenario: Loading a JavaScript prompt template +Given an eval case with `prompt: ./my-prompt.js` +When the evaluator runs +Then it should execute the file as a subprocess +And use stdout as the prompt string + +#### Scenario: Backward compatibility with text files +Given an eval case with `prompt: ./my-prompt.txt` +When the evaluator runs +Then it should read the file as text (existing behavior) +And apply `{{variable}}` substitution + +### Requirement: Consistent Input Schema +The prompt template input MUST use the same schema as code judges for consistency. + +#### Scenario: Input fields available +Given a prompt template handler +Then the input should include: +- `question` - the eval case question +- `candidateAnswer` - the agent's response +- `referenceAnswer` - optional reference answer +- `expectedOutcome` - optional expected outcome +- `expectedMessages` - optional expected messages +- `outputMessages` - optional output messages from agent +- `guidelineFiles` - paths to guideline files +- `inputFiles` - paths to input files +- `inputMessages` - input messages to agent +- `traceSummary` - optional trace summary +- `config` - optional pass-through config from YAML + +### Requirement: Error Handling +The subprocess execution MUST handle errors gracefully. 
+ +#### Scenario: Script exits with non-zero code +Given a prompt template script that throws an error +When it is executed +Then the evaluator should fail with a descriptive error message +And include the script's stderr in the error + +#### Scenario: Script outputs nothing +Given a prompt template script that outputs an empty string +When it is executed +Then the evaluator should use the empty string as the prompt + From 3010527dd1d5936f2d08fc9b3974b8c41b764d50 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 08:11:15 +0000 Subject: [PATCH 4/9] fix(eval): improve prompt template robustness and schema consistency - Reuse CodeJudgeInputSchema for PromptTemplateInputSchema (consistent payloads) - Add timeout support to executePromptTemplate (prevents hanging scripts) - Validate non-empty output from prompt templates - Throw error for missing .ts/.js prompt template files (fail-fast) - Update tests to reflect required fields Co-Authored-By: Claude Opus 4.5 --- .../evaluation/loaders/evaluator-parser.ts | 8 ++ packages/core/src/evaluation/orchestrator.ts | 35 +++++--- packages/eval/src/schemas.ts | 18 +--- .../eval/test/define-prompt-template.test.ts | 89 ++++++++++++------- 4 files changed, 93 insertions(+), 57 deletions(-) diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 7b468b6c..d243b60c 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -542,6 +542,14 @@ export async function parseEvaluators( } } } else { + // Check if the prompt looks like an executable template path (.ts/.js) + // These must exist as files - don't fall back to inline prompt + const promptExt = path.extname(prompt).toLowerCase(); + if (promptExt === '.ts' || promptExt === '.js') { + throw new Error( + `Evaluator '${name}' in '${evalId}': prompt template file not found: ${resolved.displayPath}`, + ); + } logWarning( `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`, resolved.attempted.length > 0 diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ce8a73c6..5bbdea0a 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -899,6 +899,7 @@ async function runEvaluatorList(options: { judgeProvider, outputMessages, traceSummary, + agentTimeoutMs, }); const weight = evaluator.weight ?? 
1.0; scored.push({ score, name: evaluator.name, type: evaluator.type, weight }); @@ -1257,6 +1258,7 @@ async function runLlmJudgeEvaluator(options: { readonly judgeProvider?: Provider; readonly outputMessages?: readonly OutputMessage[]; readonly traceSummary?: TraceSummary; + readonly agentTimeoutMs?: number; }): Promise { const { config, @@ -1271,14 +1273,19 @@ async function runLlmJudgeEvaluator(options: { judgeProvider, outputMessages, traceSummary, + agentTimeoutMs, } = options; - const customPrompt = await resolveCustomPrompt(config, { - evalCase, - candidate, - outputMessages, - traceSummary, - config: config.config, - }); + const customPrompt = await resolveCustomPrompt( + config, + { + evalCase, + candidate, + outputMessages, + traceSummary, + config: config.config, + }, + agentTimeoutMs, + ); return evaluatorRegistry.llm_judge.evaluate({ evalCase, @@ -1310,6 +1317,7 @@ async function resolveCustomPrompt( readonly config?: Record; }, context?: ResolveCustomPromptContext, + timeoutMs?: number, ): Promise { const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath; @@ -1321,7 +1329,7 @@ async function resolveCustomPrompt( if (!context) { throw new Error('Context required for executable prompt templates (.ts/.js files)'); } - return executePromptTemplate(promptPath, context, promptConfig.config); + return executePromptTemplate(promptPath, context, promptConfig.config, timeoutMs); } // Static text file (existing behavior) @@ -1340,6 +1348,7 @@ async function executePromptTemplate( scriptPath: string, context: ResolveCustomPromptContext, config?: Record, + timeoutMs?: number, ): Promise { // Build payload matching code judge input format for consistency const payload = { @@ -1362,8 +1371,14 @@ async function executePromptTemplate( const cwd = path.dirname(scriptPath); try { - const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, undefined, cwd); - return stdout.trim(); + const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, timeoutMs, cwd); + const prompt = stdout.trim(); + + if (!prompt) { + throw new Error('Prompt template produced empty output'); + } + + return prompt; } catch (error) { const message = error instanceof Error ? error.message : String(error); throw new Error(`Prompt template execution failed: ${message}`); diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index af6bde4f..61734d71 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -91,20 +91,8 @@ export type TokenUsage = z.infer; /** * Prompt template input schema (camelCase, converted from snake_case wire format). - * Uses the same fields as CodeJudgeInput for consistency. + * Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads. 
*/ -export const PromptTemplateInputSchema = z.object({ - question: z.string(), - expectedOutcome: z.string().optional(), - expectedMessages: z.array(MessageSchema).optional(), - referenceAnswer: z.string().optional(), - candidateAnswer: z.string(), - outputMessages: z.array(MessageSchema).nullable().optional(), - guidelineFiles: z.array(z.string()).optional(), - inputFiles: z.array(z.string()).optional(), - inputMessages: z.array(MessageSchema).optional(), - traceSummary: TraceSummarySchema.nullable().optional(), - config: z.record(z.unknown()).nullable().optional(), -}); +export const PromptTemplateInputSchema = CodeJudgeInputSchema; -export type PromptTemplateInput = z.infer; +export type PromptTemplateInput = CodeJudgeInput; diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts index ab0ab831..525b9d41 100644 --- a/packages/eval/test/define-prompt-template.test.ts +++ b/packages/eval/test/define-prompt-template.test.ts @@ -3,33 +3,34 @@ import { describe, expect, it } from 'bun:test'; import { type PromptTemplateInput, PromptTemplateInputSchema } from '../src/schemas.js'; describe('PromptTemplateInputSchema', () => { + // Minimal valid input with all required fields const validInput = { question: 'What is 2+2?', + expectedOutcome: 'The answer should be 4', + expectedMessages: [], candidateAnswer: 'The answer is 4', + guidelineFiles: [], + inputFiles: [], + inputMessages: [], }; - it('parses minimal valid input', () => { + it('parses valid input with all required fields', () => { const result = PromptTemplateInputSchema.parse(validInput); expect(result.question).toBe('What is 2+2?'); expect(result.candidateAnswer).toBe('The answer is 4'); - }); - - it('accepts optional expectedOutcome', () => { - const inputWithOutcome = { - ...validInput, - expectedOutcome: 'The answer should be 4', - }; - const result = PromptTemplateInputSchema.parse(inputWithOutcome); expect(result.expectedOutcome).toBe('The answer should be 4'); + expect(result.expectedMessages).toEqual([]); + expect(result.guidelineFiles).toEqual([]); + expect(result.inputFiles).toEqual([]); + expect(result.inputMessages).toEqual([]); }); - it('accepts optional expectedMessages', () => { - const inputWithMessages = { - ...validInput, - expectedMessages: [{ role: 'assistant', content: '4' }], + it('rejects input missing required fields', () => { + const minimalInput = { + question: 'What is 2+2?', + candidateAnswer: 'The answer is 4', }; - const result = PromptTemplateInputSchema.parse(inputWithMessages); - expect(result.expectedMessages?.[0].content).toBe('4'); + expect(() => PromptTemplateInputSchema.parse(minimalInput)).toThrow(); }); it('accepts optional referenceAnswer', () => { @@ -74,7 +75,16 @@ describe('PromptTemplateInputSchema', () => { expect(result.config).toEqual({ rubric: 'Check for correctness', strictMode: true }); }); - it('accepts optional guidelineFiles', () => { + it('accepts expectedMessages with content', () => { + const inputWithMessages = { + ...validInput, + expectedMessages: [{ role: 'assistant', content: '4' }], + }; + const result = PromptTemplateInputSchema.parse(inputWithMessages); + expect(result.expectedMessages[0].content).toBe('4'); + }); + + it('accepts guidelineFiles with paths', () => { const inputWithGuidelines = { ...validInput, guidelineFiles: ['/path/to/guideline1.txt', '/path/to/guideline2.txt'], @@ -83,7 +93,7 @@ describe('PromptTemplateInputSchema', () => { expect(result.guidelineFiles).toEqual(['/path/to/guideline1.txt', 
'/path/to/guideline2.txt']); }); - it('accepts optional inputFiles', () => { + it('accepts inputFiles with paths', () => { const inputWithFiles = { ...validInput, inputFiles: ['/path/to/input1.txt'], @@ -92,13 +102,13 @@ describe('PromptTemplateInputSchema', () => { expect(result.inputFiles).toEqual(['/path/to/input1.txt']); }); - it('accepts optional inputMessages', () => { + it('accepts inputMessages with content', () => { const inputWithMessages = { ...validInput, inputMessages: [{ role: 'user', content: 'What is 2+2?' }], }; const result = PromptTemplateInputSchema.parse(inputWithMessages); - expect(result.inputMessages?.[0].content).toBe('What is 2+2?'); + expect(result.inputMessages[0].content).toBe('What is 2+2?'); }); it('accepts optional outputMessages with toolCalls', () => { @@ -116,7 +126,7 @@ describe('PromptTemplateInputSchema', () => { expect(result.outputMessages?.[0].toolCalls?.[0].tool).toBe('read'); }); - it('accepts full input with all optional fields', () => { + it('accepts full input with all fields', () => { const fullInput = { question: 'What is 2+2?', expectedOutcome: 'The answer should be 4', @@ -149,34 +159,49 @@ describe('Schema type inference', () => { // Type-level test: ensure inferred types have expected properties const input: PromptTemplateInput = { question: 'test', + expectedOutcome: 'expected', + expectedMessages: [], candidateAnswer: 'test', + guidelineFiles: [], + inputFiles: [], + inputMessages: [], }; // These should all type-check correctly const _q: string = input.question; const _c: string = input.candidateAnswer; + const _outcome: string = input.expectedOutcome; const _trace: PromptTemplateInput['traceSummary'] = undefined; const _config: PromptTemplateInput['config'] = null; const _ref: PromptTemplateInput['referenceAnswer'] = undefined; - const _outcome: PromptTemplateInput['expectedOutcome'] = undefined; expect(input.question).toBe('test'); }); - it('PromptTemplateInput allows all optional fields to be omitted', () => { - const minimalInput: PromptTemplateInput = { + it('PromptTemplateInput requires core fields', () => { + const input: PromptTemplateInput = { question: 'test question', + expectedOutcome: 'expected outcome', + expectedMessages: [], candidateAnswer: 'test answer', + guidelineFiles: [], + inputFiles: [], + inputMessages: [], }; - expect(minimalInput.expectedOutcome).toBeUndefined(); - expect(minimalInput.expectedMessages).toBeUndefined(); - expect(minimalInput.referenceAnswer).toBeUndefined(); - expect(minimalInput.outputMessages).toBeUndefined(); - expect(minimalInput.guidelineFiles).toBeUndefined(); - expect(minimalInput.inputFiles).toBeUndefined(); - expect(minimalInput.inputMessages).toBeUndefined(); - expect(minimalInput.traceSummary).toBeUndefined(); - expect(minimalInput.config).toBeUndefined(); + // Required fields must be present + expect(input.question).toBe('test question'); + expect(input.expectedOutcome).toBe('expected outcome'); + expect(input.candidateAnswer).toBe('test answer'); + expect(input.expectedMessages).toEqual([]); + expect(input.guidelineFiles).toEqual([]); + expect(input.inputFiles).toEqual([]); + expect(input.inputMessages).toEqual([]); + + // Optional fields can be omitted + expect(input.referenceAnswer).toBeUndefined(); + expect(input.outputMessages).toBeUndefined(); + expect(input.traceSummary).toBeUndefined(); + expect(input.config).toBeUndefined(); }); }); From 6f1a54cd3c00f9391e9342dccab0c9db77af3d9d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 09:41:19 +0000 Subject: 
[PATCH 5/9] fix(examples): use default target for prompt-template-sdk example Co-Authored-By: Claude Opus 4.5 --- examples/features/prompt-template-sdk/evals/dataset.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/features/prompt-template-sdk/evals/dataset.yaml b/examples/features/prompt-template-sdk/evals/dataset.yaml index a820cd48..f899db95 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.yaml @@ -3,9 +3,9 @@ description: Demonstrates TypeScript prompt templates for custom LLM judge prompts -# Uses the CLI target defined in .agentv/targets.yaml +# Uses the default target defined in .agentv/targets.yaml execution: - target: local_cli + target: default evalcases: - id: prompt-template-basic From 7450dc6986174d08790c0e8ed8b9c6941b2659ee Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 10:13:54 +0000 Subject: [PATCH 6/9] refactor(eval): use explicit script arrays for executable prompt templates Change executable prompt templates to use explicit script arrays instead of auto-detecting runtime by file extension. This matches the code_judge pattern for consistency. Before: prompt: ../prompts/custom-evaluator.ts # ambiguous runtime After: prompt: script: [bun, run, ../prompts/custom-evaluator.ts] config: { ... } Benefits: - Consistent with code_judge pattern (one mental model) - No ambiguity about runtime (user explicitly specifies bun/node/python) - Future-proof (works with any runtime without code changes) - Aligns with "Built-ins for Primitives Only" design principle Co-Authored-By: Claude Opus 4.5 --- apps/cli/package.json | 5 +- .../prompt-template-sdk/evals/dataset.yaml | 21 +- .../design.md | 188 ++++++++++++++---- packages/core/package.json | 5 +- .../evaluation/loaders/evaluator-parser.ts | 84 +++++--- packages/core/src/evaluation/orchestrator.ts | 42 ++-- packages/core/src/evaluation/types.ts | 20 +- .../core/test/evaluation/orchestrator.test.ts | 8 +- 8 files changed, 269 insertions(+), 104 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index eadb3a1c..48296e92 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/examples/features/prompt-template-sdk/evals/dataset.yaml b/examples/features/prompt-template-sdk/evals/dataset.yaml index f899db95..876c8299 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.yaml @@ -1,5 +1,6 @@ # Prompt Template SDK Demo # Demonstrates using TypeScript/JavaScript files for custom evaluator prompts. +# Uses the same explicit script pattern as code_judge for consistency. description: Demonstrates TypeScript prompt templates for custom LLM judge prompts @@ -24,7 +25,9 @@ evalcases: evaluators: - name: custom-prompt-eval type: llm_judge - prompt: ../prompts/custom-evaluator.ts + # Executable prompt template using explicit script array (matches code_judge pattern) + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] - id: prompt-template-with-config expected_outcome: The CLI explains async/await correctly. 
@@ -42,10 +45,12 @@ evalcases: evaluators: - name: strict-eval type: llm_judge - prompt: ../prompts/custom-evaluator.ts - config: - rubric: |- - - Must mention Promises - - Must explain the synchronous-looking syntax - - Should provide an example or use case - strictMode: true + # Executable prompt template with config + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: + rubric: |- + - Must mention Promises + - Must explain the synchronous-looking syntax + - Should provide an example or use case + strictMode: true diff --git a/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md index e0350d4c..a169e1bd 100644 --- a/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md +++ b/openspec/changes/archive/2026-01-28-adopt-ts-template-prompts/design.md @@ -2,7 +2,39 @@ ## Architecture -Follow the established code judge pattern: subprocess execution with an SDK wrapper that handles stdin/stdout. +Follow the established code judge pattern: subprocess execution with an SDK wrapper that handles stdin/stdout, using **explicit script arrays** to specify the runtime. + +### Key Design Decision: Explicit Script Arrays + +Executable prompt templates use the same explicit script array pattern as `code_judge`: + +```yaml +# code_judge pattern (existing) +evaluator: + type: code_judge + script: [bun, run, ../scripts/verify.ts] + +# Executable prompt template (new - same pattern) +evaluator: + type: llm_judge + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: + rubric: "..." +``` + +**Why explicit script arrays instead of auto-detection?** + +| Approach | Pros | Cons | +|----------|------|------| +| Auto-detect by extension (`.ts` → bun) | Less verbose | Ambiguous, magic behavior, limited to known runtimes | +| Explicit script array | Consistent with code_judge, supports any runtime | More verbose | + +We chose explicit script arrays because: +1. **Consistency** - Same pattern as code_judge, one mental model +2. **No ambiguity** - User explicitly chooses bun, node, python, deno, etc. +3. **Future-proof** - Works with any runtime without code changes +4. **Aligns with design principles** - "Built-ins for Primitives Only" - the primitive is "execute a script" ### SDK: `definePromptTemplate` @@ -67,44 +99,68 @@ export const PromptTemplateInputSchema = z.object({ export type PromptTemplateInput = z.infer; ``` +### Core: Type Definitions + +```typescript +// packages/core/src/evaluation/types.ts + +/** + * Executable prompt template configuration. + * Matches code_judge pattern for consistency. + */ +export type PromptScriptConfig = { + /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */ + readonly script: readonly string[]; + /** Pass-through configuration for the prompt template */ + readonly config?: Record; +}; + +export type LlmJudgeEvaluatorConfig = { + readonly name: string; + readonly type: 'llm_judge'; + /** Text prompt (inline or file path) or executable script config */ + readonly prompt?: string | PromptScriptConfig; + // ... 
other fields + /** Resolved script array for executable prompts (matches code_judge pattern) */ + readonly resolvedPromptScript?: readonly string[]; +}; +``` + ### Core: Loader Changes -Update `resolveCustomPrompt` in `orchestrator.ts` to detect executable prompt files: +The evaluator parser resolves `prompt.script` to `resolvedPromptScript`: ```typescript -async function resolveCustomPrompt( - promptPath: string, - context: EvaluationContext, - cwd?: string, -): Promise { - const ext = path.extname(promptPath).toLowerCase(); +// packages/core/src/evaluation/loaders/evaluator-parser.ts +if (isJsonObject(rawPrompt)) { + // Executable prompt template: { script: [...], config: {...} } + const scriptArray = asStringArray(rawPrompt.script, ...); - // Executable prompt template (same pattern as code judges) - if (ext === '.ts' || ext === '.js') { - return executePromptTemplate(promptPath, context, cwd); - } + // Resolve the script path (last element) + const scriptPath = scriptArray[scriptArray.length - 1]; + const resolved = await resolveFileReference(scriptPath, searchRoots); - // Static text file (existing behavior) - const content = await readFile(promptPath, 'utf8'); - return substituteVariables(content, context); + if (resolved.resolvedPath) { + resolvedPromptScript = [...scriptArray.slice(0, -1), path.resolve(resolved.resolvedPath)]; + } } +``` +The orchestrator executes using the resolved script array: + +```typescript +// packages/core/src/evaluation/orchestrator.ts async function executePromptTemplate( - scriptPath: string, - context: EvaluationContext, - cwd?: string, + script: readonly string[], // e.g., ['bun', 'run', '/abs/path/template.ts'] + context: ResolveCustomPromptContext, + config?: Record, + timeoutMs?: number, ): Promise { - const payload = buildCodeJudgePayload(context); // Reuse existing payload builder + const payload = { /* ... same as code judge */ }; const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + const cwd = path.dirname(script[script.length - 1]); - // Execute using existing infrastructure - const stdout = await executeScript( - ['bun', 'run', scriptPath], - inputJson, - undefined, // timeout - cwd, - ); - + const stdout = await executeScript(script, inputJson, timeoutMs, cwd); return stdout.trim(); } ``` @@ -114,7 +170,7 @@ async function executePromptTemplate( ### Writing a Prompt Template ```typescript -// my-evaluator-prompt.ts +// prompts/custom-evaluator.ts import { definePromptTemplate } from '@agentv/eval'; export default definePromptTemplate((ctx) => ` @@ -136,34 +192,82 @@ Evaluate the candidate answer and provide a score from 0 to 1. ### YAML Configuration ```yaml -cases: +evalcases: - id: example question: "What is the capital of France?" 
- evaluator: - type: llm_judge - prompt: ./prompts/my-evaluator-prompt.ts # Detected as executable + execution: + evaluators: + - name: custom-eval + type: llm_judge + # Executable prompt template with explicit script array + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: + rubric: | + - Must be factually correct + - Should be concise +``` + +### Supported Runtimes + +The explicit script array supports any runtime: + +```yaml +# TypeScript with Bun +prompt: + script: [bun, run, ./template.ts] + +# TypeScript with Node + tsx +prompt: + script: [npx, tsx, ./template.ts] + +# JavaScript with Node +prompt: + script: [node, ./template.js] + +# Python (future) +prompt: + script: [python, ./template.py] ``` ## Trade-offs -| Aspect | Subprocess Pattern | In-process (jiti) | -|--------|-------------------|-------------------| -| Consistency | Same as code judges | New pattern | -| Dependencies | None (existing infra) | Adds jiti | -| Performance | Process spawn overhead | Faster | -| Isolation | Sandboxed | In-process | -| Language support | Any (TS, Python, etc.) | TS/JS only | +| Aspect | Subprocess Pattern | In-process (jiti/dynamic import) | +|--------|-------------------|----------------------------------| +| Consistency | Same as code judges | New pattern, different from code_judge | +| Dependencies | None (existing infra) | Adds jiti dependency | +| Performance | Process spawn overhead | Faster execution | +| Isolation | Sandboxed in subprocess | Runs in main process | +| Language support | Any (TS, JS, Python, etc.) | TypeScript/JavaScript only | +| API compatibility | Works with existing SDK | Would require different SDK API | The subprocess pattern is preferred because: 1. **Consistency** - Same mental model as code judges 2. **No new dependencies** - Uses existing `executeScript` infrastructure 3. **Isolation** - User code runs in separate process -4. **Language agnostic** - Could support Python prompt templates in future +4. **Language agnostic** - Supports any runtime (bun, node, python, deno) +5. **SDK compatibility** - The `definePromptTemplate` SDK is designed for stdin/stdout ## Alternatives Considered ### In-process loading with jiti -Rejected: Adds dependency, inconsistent with code judge pattern, runs user code in main process. + +**Rejected.** While jiti provides lighter-weight TypeScript execution without subprocess overhead: +- Adds a new dependency +- Inconsistent with code_judge pattern (subprocess) +- Runs user code in the main process (less isolation) +- Would require a different API - the current SDK reads stdin/writes stdout +- Only works for JS/TS, not other languages + +If there's demand for a lighter-weight in-process option in the future, it could be added as a separate feature (e.g., `prompt_module: ./file.ts`) rather than replacing the subprocess approach. + +### Auto-detect runtime by file extension + +**Rejected.** The original design auto-detected runtime based on file extension (`.ts` → `bun run`). This was changed to explicit script arrays because: +- Ambiguous: What runtime does `.ts` use? bun? node? tsx? +- Inconsistent: code_judge requires explicit `script:` array +- Inflexible: Adding new runtimes requires code changes ### Require pre-compiled JS only -Rejected: Worse DX - users already expect `bun run` to handle `.ts` files. + +**Rejected.** Worse DX - users already expect `bun run` to handle `.ts` files directly. 
diff --git a/packages/core/package.json b/packages/core/package.json index eba6c241..a79c8e90 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index d243b60c..0d5e895b 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -522,34 +522,56 @@ export async function parseEvaluators( continue; } - const prompt = asString(rawEvaluator.prompt); + // Parse prompt field - can be string (text template) or object (executable script) + const rawPrompt = rawEvaluator.prompt; + let prompt: string | undefined; let promptPath: string | undefined; - if (prompt) { + let resolvedPromptScript: string[] | undefined; + let promptScriptConfig: Record | undefined; + + if (isJsonObject(rawPrompt)) { + // Executable prompt template: { script: [...], config: {...} } + const scriptArray = asStringArray( + rawPrompt.script, + `prompt.script for evaluator '${name}' in '${evalId}'`, + ); + + if (!scriptArray) { + throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`); + } + + // Resolve the script path (last element is typically the file path) + const scriptPath = scriptArray[scriptArray.length - 1]; + const resolved = await resolveFileReference(scriptPath, searchRoots); + + if (resolved.resolvedPath) { + // Replace the last element with the resolved path + resolvedPromptScript = [...scriptArray.slice(0, -1), path.resolve(resolved.resolvedPath)]; + } else { + throw new Error( + `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`, + ); + } + + // Extract config from prompt object + if (isJsonObject(rawPrompt.config)) { + promptScriptConfig = rawPrompt.config as Record; + } + } else if (typeof rawPrompt === 'string') { + // Text template prompt (existing behavior) + prompt = rawPrompt; const resolved = await resolveFileReference(prompt, searchRoots); if (resolved.resolvedPath) { promptPath = path.resolve(resolved.resolvedPath); - // Skip validation for executable prompt templates (.ts/.js files) - // These are executed as subprocesses, not parsed as text templates - const ext = path.extname(promptPath).toLowerCase(); - if (ext !== '.ts' && ext !== '.js') { - // Validate custom prompt content upfront - throws error if validation fails - try { - await validateCustomPromptContent(promptPath); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - // Add context and re-throw for the caller to handle - throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); - } + // Validate custom prompt content upfront - throws error if validation fails + try { + await validateCustomPromptContent(promptPath); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + // Add context and re-throw for the caller to handle + throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`); } } else { - // Check if the prompt looks like an executable template path (.ts/.js) - // These must exist as files - don't fall back to inline prompt - const promptExt = path.extname(prompt).toLowerCase(); - if (promptExt === '.ts' || promptExt === '.js') { - throw new Error( - `Evaluator '${name}' in '${evalId}': prompt template file not found: ${resolved.displayPath}`, - ); - } logWarning( `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`, resolved.attempted.length > 0 @@ -590,8 +612,9 @@ export async function parseEvaluators( const weight = validateWeight(rawEvaluator.weight, name, evalId); - // Collect unrecognized properties as pass-through config (for executable prompt templates) - const knownProps = new Set(['name', 'type', 'prompt', 'model', 'rubrics', 'weight']); + // Collect unrecognized properties as pass-through config (for text prompt templates) + // Note: For script prompts, config comes from prompt.config instead + const knownProps = new Set(['name', 'type', 'prompt', 'model', 'rubrics', 'weight', 'config']); const config: Record = {}; for (const [key, value] of Object.entries(rawEvaluator)) { if (!knownProps.has(key) && value !== undefined) { @@ -599,15 +622,26 @@ export async function parseEvaluators( } } + // Merge top-level config with any extra properties (top-level config takes precedence) + const topLevelConfig = isJsonObject(rawEvaluator.config) + ? (rawEvaluator.config as Record) + : {}; + const mergedConfig = { ...config, ...topLevelConfig }; + + // Determine final config: prompt.config for script prompts, merged config for text prompts + const finalConfig = + promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : undefined); + evaluators.push({ name, type: 'llm_judge', prompt, promptPath, ...(promptPath ? { resolvedPromptPath: promptPath } : {}), + ...(resolvedPromptScript ? { resolvedPromptScript } : {}), ...(parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}), ...(weight !== undefined ? { weight } : {}), - ...(Object.keys(config).length > 0 ? { config } : {}), + ...(finalConfig ? { config: finalConfig } : {}), }); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 5bbdea0a..e277f535 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1311,27 +1311,31 @@ interface ResolveCustomPromptContext { async function resolveCustomPrompt( promptConfig: { - readonly prompt?: string; + readonly prompt?: string | import('./types.js').PromptScriptConfig; readonly promptPath?: string; readonly resolvedPromptPath?: string; + readonly resolvedPromptScript?: readonly string[]; readonly config?: Record; }, context?: ResolveCustomPromptContext, timeoutMs?: number, ): Promise { + // Executable prompt template using script array (matches code_judge pattern) + if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) { + if (!context) { + throw new Error('Context required for executable prompt templates'); + } + return executePromptTemplate( + promptConfig.resolvedPromptScript, + context, + promptConfig.config, + timeoutMs, + ); + } + const promptPath = promptConfig.resolvedPromptPath ?? 
promptConfig.promptPath; if (promptPath) { - const ext = path.extname(promptPath).toLowerCase(); - - // Executable prompt template (same pattern as code judges) - if (ext === '.ts' || ext === '.js') { - if (!context) { - throw new Error('Context required for executable prompt templates (.ts/.js files)'); - } - return executePromptTemplate(promptPath, context, promptConfig.config, timeoutMs); - } - // Static text file (existing behavior) try { const content = await readTextFile(promptPath); @@ -1341,11 +1345,18 @@ async function resolveCustomPrompt( console.warn(`Could not read custom prompt at ${promptPath}: ${message}`); } } - return promptConfig.prompt; + + // Handle prompt as string - could be inline or the original prompt value + const promptValue = promptConfig.prompt; + if (typeof promptValue === 'string') { + return promptValue; + } + + return undefined; } async function executePromptTemplate( - scriptPath: string, + script: readonly string[], context: ResolveCustomPromptContext, config?: Record, timeoutMs?: number, @@ -1368,10 +1379,13 @@ async function executePromptTemplate( }; const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); + + // Derive cwd from the last element of the script array (the script file path) + const scriptPath = script[script.length - 1]; const cwd = path.dirname(scriptPath); try { - const stdout = await executeScript(['bun', 'run', scriptPath], inputJson, timeoutMs, cwd); + const stdout = await executeScript(script, inputJson, timeoutMs, cwd); const prompt = stdout.trim(); if (!prompt) { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index eb1e458c..9f5d6595 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -191,16 +191,30 @@ export type CodeEvaluatorConfig = { readonly target?: TargetAccessConfig; }; +/** + * Executable prompt template configuration. + * Matches code_judge pattern for consistency. + */ +export type PromptScriptConfig = { + /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */ + readonly script: readonly string[]; + /** Pass-through configuration for the prompt template */ + readonly config?: Record; +}; + export type LlmJudgeEvaluatorConfig = { readonly name: string; readonly type: 'llm_judge'; - readonly prompt?: string; + /** Text prompt (inline or file path) or executable script config */ + readonly prompt?: string | PromptScriptConfig; readonly promptPath?: string; - /** Resolved absolute path for prompt file (used by executable .ts/.js prompts) */ + /** Resolved absolute path for prompt file (used for text template prompts) */ readonly resolvedPromptPath?: string; + /** Resolved script array for executable prompts (matches code_judge pattern) */ + readonly resolvedPromptScript?: readonly string[]; readonly rubrics?: readonly RubricItem[]; readonly weight?: number; - /** Pass-through configuration for custom evaluator prompts */ + /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */ readonly config?: Record; }; diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4db19fcf..199aa619 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1010,8 +1010,8 @@ Reference: \${input.reference_answer ?? 
'none'}\`); { name: 'ts-prompt-eval', type: 'llm_judge', - promptPath: promptPath, - resolvedPromptPath: promptPath, + // Use explicit script array (matches code_judge pattern) + resolvedPromptScript: ['bun', 'run', promptPath], }, ], }, @@ -1071,8 +1071,8 @@ console.log('Question: ' + input.question + '\\nAnswer: ' + input.candidate_answ { name: 'js-prompt-eval', type: 'llm_judge', - promptPath: promptPath, - resolvedPromptPath: promptPath, + // Use explicit script array - node for JavaScript files + resolvedPromptScript: ['node', promptPath], }, ], }, From 48140e25249870628eaf37372c36e57fa21d4d3f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 10:28:56 +0000 Subject: [PATCH 7/9] feat(cli): add --eval-id as deprecated alias for --filter Maintains backward compatibility for users who were using --eval-id. Shows deprecation warning when used. Co-Authored-By: Claude Opus 4.5 --- apps/cli/src/commands/eval/index.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 6a53e0c0..70dd84b2 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -40,6 +40,12 @@ export const evalCommand = command({ long: 'filter', description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")', }), + // Deprecated alias for --filter (backward compatibility) + evalId: option({ + type: optional(string), + long: 'eval-id', + description: '[Deprecated: use --filter] Filter eval cases by ID pattern', + }), workers: option({ type: number, long: 'workers', @@ -103,11 +109,17 @@ export const evalCommand = command({ }), }, handler: async (args) => { + // Support deprecated --eval-id as alias for --filter + const filter = args.filter ?? args.evalId; + if (args.evalId && !args.filter) { + console.warn('Warning: --eval-id is deprecated, use --filter instead'); + } + const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd()); const rawOptions: Record = { target: args.target, targets: args.targets, - filter: args.filter, + filter, workers: args.workers, out: args.out, outputFormat: args.outputFormat, From 006250e809626f359324a444dc6d44339f403f0a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 11:11:21 +0000 Subject: [PATCH 8/9] revert(cli): use --eval-id instead of --filter Reverts to --eval-id as the primary flag for filtering eval cases. This aligns with Jest/Vitest convention (--testNamePattern) where the flag name describes what is being filtered, not the action. Removes --filter alias to keep the CLI simple and match existing docs. 
Co-Authored-By: Claude Opus 4.5 --- apps/cli/src/commands/eval/index.ts | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 70dd84b2..345c78e3 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -35,16 +35,10 @@ export const evalCommand = command({ long: 'targets', description: 'Path to targets.yaml (overrides discovery)', }), - filter: option({ - type: optional(string), - long: 'filter', - description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")', - }), - // Deprecated alias for --filter (backward compatibility) evalId: option({ type: optional(string), long: 'eval-id', - description: '[Deprecated: use --filter] Filter eval cases by ID pattern', + description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")', }), workers: option({ type: number, @@ -109,17 +103,11 @@ export const evalCommand = command({ }), }, handler: async (args) => { - // Support deprecated --eval-id as alias for --filter - const filter = args.filter ?? args.evalId; - if (args.evalId && !args.filter) { - console.warn('Warning: --eval-id is deprecated, use --filter instead'); - } - const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd()); const rawOptions: Record = { target: args.target, targets: args.targets, - filter, + filter: args.evalId, workers: args.workers, out: args.out, outputFormat: args.outputFormat, From daa8ccb04ec66aea8f5ca79520b144449333f60f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 28 Jan 2026 12:42:14 +0000 Subject: [PATCH 9/9] docs(skills): add TypeScript prompt template documentation Document the new definePromptTemplate SDK for creating dynamic LLM judge prompts with TypeScript. Includes YAML configuration example and available context fields. Co-Authored-By: Claude Opus 4.5 --- .../references/custom-evaluators.md | 60 ++++++++++++++++++- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/.claude/skills/agentv-eval-builder/references/custom-evaluators.md b/.claude/skills/agentv-eval-builder/references/custom-evaluators.md index 7b6a2958..098a92b1 100644 --- a/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +++ b/.claude/skills/agentv-eval-builder/references/custom-evaluators.md @@ -178,11 +178,65 @@ export default defineCodeJudge(async ({ question, candidateAnswer }) => { **See also:** `examples/features/code-judge-with-llm-calls/` -## LLM Judge Prompt Template +## LLM Judge Prompt Templates -LLM judges use markdown prompts. AgentV handles the output format automatically. +LLM judges support two types of prompt templates: -**Available Template Variables:** +### Text Templates (Markdown) + +Simple markdown files with variable substitution. AgentV handles the output format automatically. + +### TypeScript/JavaScript Templates + +For dynamic prompt generation with full programming capabilities. Uses the same subprocess pattern as code evaluators. 
+ +**YAML Configuration:** + +```yaml +evaluators: + - name: custom-eval + type: llm_judge + prompt: + script: [bun, run, ../prompts/custom-evaluator.ts] + config: # Optional, passed to script + rubric: "Your rubric here" + strictMode: true +``` + +**TypeScript Template:** + +```typescript +#!/usr/bin/env bun +import { definePromptTemplate } from '@agentv/eval'; + +export default definePromptTemplate((ctx) => { + const rubric = ctx.config?.rubric as string | undefined; + + return `You are evaluating an AI assistant's response. + +## Question +${ctx.question} + +## Candidate Answer +${ctx.candidateAnswer} + +${ctx.referenceAnswer ? `## Reference Answer\n${ctx.referenceAnswer}` : ''} + +${rubric ? `## Evaluation Criteria\n${rubric}` : ''} + +Evaluate and provide a score from 0 to 1.`; +}); +``` + +**Available context fields:** `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary` + +**See also:** `examples/features/prompt-template-sdk/` + +--- + +## Text Template Variables + +**Available variables for markdown templates:** - `{{question}}` - The original question/task - `{{expected_outcome}}` - What the answer should accomplish - `{{candidate_answer}}` - The actual output to evaluate