Merged
60 changes: 57 additions & 3 deletions .claude/skills/agentv-eval-builder/references/custom-evaluators.md
@@ -178,11 +178,65 @@ export default defineCodeJudge(async ({ question, candidateAnswer }) => {

**See also:** `examples/features/code-judge-with-llm-calls/`

## LLM Judge Prompt Template
## LLM Judge Prompt Templates

LLM judges use markdown prompts. AgentV handles the output format automatically.
LLM judges support two types of prompt templates:

**Available Template Variables:**
### Text Templates (Markdown)

Simple markdown files with variable substitution. AgentV handles the output format automatically.
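
A minimal sketch of what such a template might contain, using the placeholders documented under "Text Template Variables" below (the wording itself is illustrative, not required):

```markdown
You are evaluating an AI assistant's response.

## Question
{{question}}

## Expected Outcome
{{expected_outcome}}

## Candidate Answer
{{candidate_answer}}

Judge how well the candidate answer achieves the expected outcome.
```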

### TypeScript/JavaScript Templates

TypeScript or JavaScript files enable dynamic prompt generation with full programming capabilities. They use the same subprocess pattern as code evaluators.

**YAML Configuration:**

```yaml
evaluators:
  - name: custom-eval
    type: llm_judge
    prompt:
      script: [bun, run, ../prompts/custom-evaluator.ts]
      config: # Optional, passed to script
        rubric: "Your rubric here"
        strictMode: true
```

**TypeScript Template:**

```typescript
#!/usr/bin/env bun
import { definePromptTemplate } from '@agentv/eval';

export default definePromptTemplate((ctx) => {
  const rubric = ctx.config?.rubric as string | undefined;

  return `You are evaluating an AI assistant's response.

## Question
${ctx.question}

## Candidate Answer
${ctx.candidateAnswer}

${ctx.referenceAnswer ? `## Reference Answer\n${ctx.referenceAnswer}` : ''}

${rubric ? `## Evaluation Criteria\n${rubric}` : ''}

Evaluate and provide a score from 0 to 1.`;
});
```

**Available context fields:** `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary`

**See also:** `examples/features/prompt-template-sdk/`

---

## Text Template Variables

**Available variables for markdown templates:**
- `{{question}}` - The original question/task
- `{{expected_outcome}}` - What the answer should accomplish
- `{{candidate_answer}}` - The actual output to evaluate
5 changes: 1 addition & 4 deletions apps/cli/package.json
@@ -14,10 +14,7 @@
"bin": {
"agentv": "./dist/cli.js"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"scripts": {
"dev": "bun --watch src/index.ts",
"build": "tsup && bun run copy-readme",
6 changes: 3 additions & 3 deletions apps/cli/src/commands/eval/index.ts
@@ -35,9 +35,9 @@ export const evalCommand = command({
      long: 'targets',
      description: 'Path to targets.yaml (overrides discovery)',
    }),
    filter: option({
    evalId: option({
      type: optional(string),
      long: 'filter',
      long: 'eval-id',
      description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")',
    }),
    workers: option({
@@ -107,7 +107,7 @@ export const evalCommand = command({
  const rawOptions: Record<string, unknown> = {
    target: args.target,
    targets: args.targets,
    filter: args.filter,
    filter: args.evalId,
    workers: args.workers,
    out: args.out,
    outputFormat: args.outputFormat,
58 changes: 58 additions & 0 deletions examples/features/prompt-template-sdk/README.md
@@ -0,0 +1,58 @@
# Prompt Template SDK

This example demonstrates writing custom LLM judge prompts as TypeScript files with the `definePromptTemplate` helper from `@agentv/eval`.

## Features

- **Type-safe prompt generation**: Full TypeScript support with autocomplete for context fields
- **Conditional logic**: Use JavaScript/TypeScript conditionals for dynamic prompts
- **Config pass-through**: Access custom config from YAML in your prompt template
- **Same pattern as code judges**: Follows the familiar subprocess pattern

## How It Works

Instead of static text files with `{{variable}}` placeholders, you can use TypeScript files that export a prompt template:

```typescript
import { definePromptTemplate } from '@agentv/eval';

export default definePromptTemplate((ctx) => `
Question: ${ctx.question}
Answer: ${ctx.candidateAnswer}

${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}
`);
```

The template receives evaluation context via stdin (JSON) and outputs the prompt string to stdout.
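
As a rough mental model, the subprocess contract could be sketched like this (illustrative only; the field names and types here are assumptions, and real templates should just use `definePromptTemplate`):

```typescript
#!/usr/bin/env bun
// Sketch of the assumed contract: the JSON evaluation context arrives on stdin,
// and whatever the script prints to stdout becomes the judge prompt.
const ctx = JSON.parse(await Bun.stdin.text()) as {
  question?: string;
  candidateAnswer?: string;
};

const prompt = `Question: ${ctx.question}\nAnswer: ${ctx.candidateAnswer}`;
process.stdout.write(prompt); // the emitted string is used as the LLM judge prompt
```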

## Available Context Fields

- `question` - The eval case question
- `candidateAnswer` - The agent's response being evaluated
- `referenceAnswer` - Optional reference answer
- `expectedOutcome` - Optional expected outcome
- `expectedMessages` - Optional expected messages
- `outputMessages` - Optional output messages from agent
- `guidelineFiles` - Paths to guideline files
- `inputFiles` - Paths to input files
- `inputMessages` - Input messages to agent
- `traceSummary` - Optional trace summary with tool usage metrics
- `config` - Optional pass-through config from YAML
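
As a rough guide, the context can be thought of as a type like the one below. The field names come from the list above; the concrete types are assumptions, so check the `@agentv/eval` type definitions for the authoritative shape:

```typescript
// Assumed shape of the evaluation context. Names come from the list above;
// the types are illustrative guesses rather than the exported definitions.
interface EvalPromptContext {
  question: string;
  candidateAnswer: string;
  referenceAnswer?: string;
  expectedOutcome?: string;
  expectedMessages?: unknown[];
  outputMessages?: unknown[];
  guidelineFiles?: string[];
  inputFiles?: string[];
  inputMessages?: unknown[];
  traceSummary?: unknown;
  config?: Record<string, unknown>;
}
```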

## Running

```bash
bun agentv eval examples/features/prompt-template-sdk/evals/dataset.yaml --dry-run
```

## File Structure

```
prompt-template-sdk/
  evals/
    dataset.yaml          # Eval cases using TypeScript prompt
  prompts/
    custom-evaluator.ts   # TypeScript prompt template
  README.md
```
56 changes: 56 additions & 0 deletions examples/features/prompt-template-sdk/evals/dataset.yaml
@@ -0,0 +1,56 @@
# Prompt Template SDK Demo
# Demonstrates using TypeScript/JavaScript files for custom evaluator prompts.
# Uses the same explicit script pattern as code_judge for consistency.

description: Demonstrates TypeScript prompt templates for custom LLM judge prompts

# Uses the default target defined in .agentv/targets.yaml
execution:
  target: default

evalcases:
  - id: prompt-template-basic
    expected_outcome: The CLI provides a clear answer about TypeScript benefits.

    input_messages:
      - role: user
        content:
          - type: text
            value: What are the main benefits of TypeScript over JavaScript?

    reference_answer: |-
      TypeScript provides static type checking, better IDE support, and improved maintainability.

    execution:
      evaluators:
        - name: custom-prompt-eval
          type: llm_judge
          # Executable prompt template using explicit script array (matches code_judge pattern)
          prompt:
            script: [bun, run, ../prompts/custom-evaluator.ts]

  - id: prompt-template-with-config
    expected_outcome: The CLI explains async/await correctly.

    input_messages:
      - role: user
        content:
          - type: text
            value: Explain async/await in JavaScript.

    reference_answer: |-
      Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous.

    execution:
      evaluators:
        - name: strict-eval
          type: llm_judge
          # Executable prompt template with config
          prompt:
            script: [bun, run, ../prompts/custom-evaluator.ts]
            config:
              rubric: |-
                - Must mention Promises
                - Must explain the synchronous-looking syntax
                - Should provide an example or use case
              strictMode: true
48 changes: 48 additions & 0 deletions examples/features/prompt-template-sdk/prompts/custom-evaluator.ts
@@ -0,0 +1,48 @@
#!/usr/bin/env bun
/**
 * Custom Prompt Template Demo
 *
 * Uses the declarative definePromptTemplate helper to generate
 * a custom evaluation prompt with full TypeScript support.
 */
import { definePromptTemplate } from '@agentv/eval';

export default definePromptTemplate((ctx) => {
  // Access typed config from YAML
  const rubric = ctx.config?.rubric as string | undefined;
  const strictMode = ctx.config?.strictMode as boolean | undefined;

  // Build conditional sections
  const referenceSection = ctx.referenceAnswer
    ? `\n## Reference Answer\n${ctx.referenceAnswer}`
    : '';

  const rubricSection = rubric ? `\n## Evaluation Rubric\n${rubric}` : '';

  const strictWarning = strictMode
    ? '\n**Note:** Strict mode enabled - minor inaccuracies should result in lower scores.'
    : '';

  return `You are evaluating an AI assistant's response.

## Question
${ctx.question}

## Candidate Answer
${ctx.candidateAnswer}
${referenceSection}
${rubricSection}
${strictWarning}

## Instructions
Evaluate the candidate answer based on:
1. Correctness - Does it accurately answer the question?
2. Completeness - Does it address all parts of the question?
3. Clarity - Is the response clear and well-structured?

Respond with a JSON object containing:
- score: A number from 0 to 1
- reasoning: Brief explanation of your evaluation
- hits: Array of positive aspects
- misses: Array of issues or missing elements`;
});