Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/benchmarks/locomo/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,9 @@ export class LoCoMoBenchmark implements Benchmark {

const unifiedMessages: UnifiedMessage[] = messages.map((m) => ({
role: m.speaker === speakerA ? ("user" as const) : ("assistant" as const),
content: m.text,
content: m.blip_caption
? `${m.text} [shared image: ${m.blip_caption}]`
: m.text,
speaker: m.speaker,
}))

Expand Down
2 changes: 2 additions & 0 deletions src/benchmarks/locomo/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ export interface LoCoMoMessage {
speaker: string
dia_id: string
text: string
blip_caption?: string
img_url?: string[]
}

export interface LoCoMoQA {
Expand Down
79 changes: 79 additions & 0 deletions src/judges/cli.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import { spawn } from "child_process"
import type { Judge, JudgeConfig, JudgeInput, JudgeResult } from "../types/judge"
import type { ProviderPrompts } from "../types/prompts"
import { buildJudgePrompt, parseJudgeResponse, getJudgePrompt } from "./base"
import { logger } from "../utils/logger"
import { getModelConfig, ModelConfig } from "../utils/models"

/**
 * Call Claude CLI in print mode for text generation.
 * Uses subprocess to avoid API key requirements — the locally
 * authenticated `claude` command does the work.
 *
 * @param prompt     Full prompt text passed via `-p`.
 * @param modelAlias Model alias/id forwarded to `--model`.
 * @returns The trimmed `result` field of the CLI's JSON output, or raw
 *          trimmed stdout if the output is not valid JSON.
 * @throws Error when the CLI exits non-zero or is killed (e.g. by timeout).
 */
async function generateTextViaCli(prompt: string, modelAlias: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const claude = spawn('claude', [
      '-p', prompt,
      '--output-format', 'json',
      '--model', modelAlias,
      '--max-budget-usd', '1.00', // Allow $1 per evaluation (generous)
    ], {
      timeout: 180000, // 3 minute timeout — process is killed with SIGTERM on expiry
      cwd: process.cwd(),
    })

    let stdout = ''
    let stderr = ''

    // Accumulate output as text, not Buffer chunks.
    claude.stdout.setEncoding('utf8')
    claude.stderr.setEncoding('utf8')
    claude.stdout.on('data', (data) => { stdout += data })
    claude.stderr.on('data', (data) => { stderr += data })

    claude.on('close', (code, signal) => {
      if (code === 0) {
        try {
          // `--output-format json` wraps the answer as { result: string, ... }.
          const response = JSON.parse(stdout)
          resolve(response.result?.trim() || '')
        } catch {
          // CLI emitted plain text — return it as-is.
          resolve(stdout.trim())
        }
      } else if (signal) {
        // Killed by the spawn timeout or an external signal; `code` is null here,
        // so report the signal instead of "exited with code null".
        reject(new Error(`Claude CLI killed by ${signal} (likely timeout): ${stderr}`))
      } else {
        reject(new Error(`Claude CLI exited with code ${code}: ${stderr}`))
      }
    })

    claude.on('error', reject)
  })
}

/**
 * Judge backed by the locally installed `claude` CLI.
 * Authentication comes from the machine's CLI session, so no API key
 * is required; `config.apiKey` is ignored.
 */
export class CliJudge implements Judge {
  name = "cli"
  private modelConfig: ModelConfig | null = null
  private modelAlias: string = "sonnet"

  async initialize(config: JudgeConfig): Promise<void> {
    // Resolve the requested model alias (default "sonnet") to a full config.
    this.modelAlias = config.model || "sonnet"
    this.modelConfig = getModelConfig(this.modelAlias)
    logger.info(`Initialized CLI judge with model: ${this.modelConfig.displayName} (${this.modelConfig.id})`)
  }

  async evaluate(input: JudgeInput): Promise<JudgeResult> {
    const cfg = this.modelConfig
    if (!cfg) throw new Error("Judge not initialized")

    // Build the judging prompt, run it through the CLI, parse the verdict.
    const response = await generateTextViaCli(buildJudgePrompt(input), cfg.id)
    return parseJudgeResponse(response)
  }

  getPromptForQuestionType(questionType: string, providerPrompts?: ProviderPrompts): string {
    return getJudgePrompt(questionType, providerPrompts)
  }

  getModel(): import("ai").LanguageModel {
    // No AI SDK model backs this judge — callers must not request one.
    throw new Error("CLI judge does not expose an AI SDK model")
  }
}

export default CliJudge
4 changes: 3 additions & 1 deletion src/judges/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ import type { Judge, JudgeName } from "../types/judge"
import { OpenAIJudge } from "./openai"
import { AnthropicJudge } from "./anthropic"
import { GoogleJudge } from "./google"
import { CliJudge } from "./cli"

// Registry mapping each judge name to its constructor; consumed by createJudge().
const judges: Record<JudgeName, new () => Judge> = {
openai: OpenAIJudge,
anthropic: AnthropicJudge,
google: GoogleJudge,
cli: CliJudge,
}

export function createJudge(name: JudgeName): Judge {
Expand All @@ -21,5 +23,5 @@ export function getAvailableJudges(): JudgeName[] {
return Object.keys(judges) as JudgeName[]
}

export { OpenAIJudge, AnthropicJudge, GoogleJudge }
export { OpenAIJudge, AnthropicJudge, GoogleJudge, CliJudge }
export { buildJudgePrompt, parseJudgeResponse, getJudgePrompt } from "./base"
77 changes: 66 additions & 11 deletions src/orchestrator/phases/answer.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { readFileSync, existsSync } from "fs"
import { spawn } from "child_process"
import { createOpenAI } from "@ai-sdk/openai"
import { createAnthropic } from "@ai-sdk/anthropic"
import { createGoogleGenerativeAI } from "@ai-sdk/google"
Expand All @@ -15,13 +16,52 @@ import { buildContextString } from "../../types/prompts"
import { ConcurrentExecutor } from "../concurrent"
import { resolveConcurrency } from "../../types/concurrency"

/**
* Call Claude CLI in print mode for text generation.
* Uses subprocess to avoid API key requirements.
*/
async function generateTextViaCli(prompt: string, modelAlias: string): Promise<string> {
return new Promise((resolve, reject) => {
const claude = spawn("claude", [
"-p", prompt,
"--output-format", "json",
"--model", modelAlias,
"--max-budget-usd", "1.00",
], {
timeout: 600000, // 10 minutes for larger models like Opus
cwd: process.cwd(),
Comment on lines +23 to +32

This comment was marked as outdated.

})

let stdout = ""
let stderr = ""

claude.stdout.on("data", (data) => { stdout += data })
claude.stderr.on("data", (data) => { stderr += data })

claude.on("close", (code) => {
if (code === 0) {
try {
const response = JSON.parse(stdout)
resolve(response.result?.trim() || "")
} catch {
resolve(stdout.trim())
}
} else {
reject(new Error(`Claude CLI exited with code ${code}: ${stderr}`))
}
})

claude.on("error", reject)
})
}

type LanguageModel =
| ReturnType<typeof createOpenAI>
| ReturnType<typeof createAnthropic>
| ReturnType<typeof createGoogleGenerativeAI>

function getAnsweringModel(modelAlias: string): {
client: LanguageModel
client: LanguageModel | null
modelConfig: ModelConfig
} {
const modelConfig = getModelConfig(modelAlias || DEFAULT_ANSWERING_MODEL)
Expand All @@ -42,6 +82,12 @@ function getAnsweringModel(modelAlias: string): {
client: createGoogleGenerativeAI({ apiKey: config.googleApiKey }),
modelConfig,
}
case "cli":
// CLI uses subprocess instead of API client
return {
client: null,
modelConfig,
}
}
}

Expand Down Expand Up @@ -120,18 +166,27 @@ export async function runAnswerPhase(

const prompt = buildAnswerPrompt(question.question, context, questionDate, provider)

const params: Record<string, unknown> = {
model: client(modelConfig.id),
prompt,
maxTokens: modelConfig.defaultMaxTokens,
let text: string

if (modelConfig.provider === "cli") {
// Use CLI subprocess for Claude models
text = await generateTextViaCli(prompt, modelConfig.id)
} else {
// Use AI SDK for API-based models
const params: Record<string, unknown> = {
model: client!(modelConfig.id),
prompt,
maxTokens: modelConfig.defaultMaxTokens,
}

if (modelConfig.supportsTemperature) {
params.temperature = modelConfig.defaultTemperature
}

const result = await generateText(params as Parameters<typeof generateText>[0])
text = result.text
}

if (modelConfig.supportsTemperature) {
params.temperature = modelConfig.defaultTemperature
}

const { text } = await generateText(params as Parameters<typeof generateText>[0])

const durationMs = Date.now() - startTime
checkpointManager.updatePhase(checkpoint, question.questionId, "answer", {
status: "completed",
Expand Down
2 changes: 1 addition & 1 deletion src/orchestrator/phases/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export async function runSearchPhase(
try {
const results = await provider.search(question.question, {
containerTag,
limit: 10,
limit: 19,
threshold: 0.3,
})

Expand Down
103 changes: 103 additions & 0 deletions src/providers/anamnesis/embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""Embed benchmark observations into ChromaDB.
Called by the anamnesis provider's awaitIndexing phase.
Reads observation IDs from argv, fetches narratives from SQLite,
and adds them to the cm__claude-mem ChromaDB collection.
Uses the same Python environment as chroma-mcp to avoid version mismatches.
"""
import json
import os
import sqlite3
import sys

# ChromaDB — must match the version used by chroma-mcp (1.5.x)
import chromadb

DB_PATH = os.environ.get("ANAMNESIS_DB", os.path.expanduser("~/.claude-mem/claude-mem.db"))
VECTOR_PATH = os.environ.get("CHROMA_PATH", os.path.expanduser("~/.claude-mem/vector-db"))
COLLECTION = "cm__claude-mem"
BATCH_SIZE = 50 # ChromaDB handles batches well


def embed_observations(ids: list[int]) -> dict:
    """Fetch observations from SQLite and embed them into ChromaDB.

    Args:
        ids: Primary keys of rows in the ``observations`` table.

    Returns:
        Counts dict: ``{"embedded": int, "skipped": int, "errors": int}``.
    """
    if not ids:
        return {"embedded": 0, "skipped": 0, "errors": 0}

    # Read observations from SQLite; close the handle even if the query fails.
    db = sqlite3.connect(DB_PATH)
    try:
        db.row_factory = sqlite3.Row
        placeholders = ",".join("?" for _ in ids)
        # NOTE(review): `facts` is selected but not used in the document text below.
        rows = db.execute(
            f"SELECT id, title, subtitle, narrative, facts, namespace FROM observations WHERE id IN ({placeholders})",
            ids,
        ).fetchall()
    finally:
        db.close()

    if not rows:
        return {"embedded": 0, "skipped": 0, "errors": 0}

    # Connect to ChromaDB. get_or_create_collection avoids the ValueError that
    # get_collection raises on a fresh install where the collection is missing,
    # which would otherwise silently disable semantic search.
    client = chromadb.PersistentClient(path=VECTOR_PATH)
    col = client.get_or_create_collection(COLLECTION)

    embedded = 0
    skipped = 0
    errors = 0

    # Upsert in batches to bound memory use and request size.
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i : i + BATCH_SIZE]
        batch_ids = []
        batch_docs = []
        batch_metas = []

        for row in batch:
            obs_id = str(row["id"])
            # Build document text for embedding (same as what search would match against)
            parts = []
            if row["title"]:
                parts.append(row["title"])
            if row["subtitle"]:
                parts.append(row["subtitle"])
            if row["narrative"]:
                parts.append(row["narrative"])

            doc = "\n".join(parts)
            if not doc.strip():
                # Nothing meaningful to embed for this row.
                skipped += 1
                continue

            batch_ids.append(obs_id)
            batch_docs.append(doc[:8000])  # ChromaDB has doc size limits
            batch_metas.append({
                "source": "memorybench",
                "namespace": row["namespace"] or "",
                "title": row["title"] or "",
            })

        if batch_ids:
            try:
                col.upsert(
                    ids=batch_ids,
                    documents=batch_docs,
                    metadatas=batch_metas,
                )
                embedded += len(batch_ids)
            except Exception as e:
                # Report the failing batch on stderr and keep processing the rest.
                print(json.dumps({"error": str(e), "batch_start": i}), file=sys.stderr)
                errors += len(batch_ids)

    return {"embedded": embedded, "skipped": skipped, "errors": errors}


if __name__ == "__main__":
    # CLI entry point: embed.py <id1,id2,...>
    # Errors go to stderr so stdout stays machine-parseable JSON, matching the
    # per-batch error reporting inside embed_observations().
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: embed.py <id1,id2,...>"}), file=sys.stderr)
        sys.exit(1)

    try:
        ids = [int(x) for x in sys.argv[1].split(",") if x.strip()]
    except ValueError:
        # Non-integer id in the argument list — fail cleanly instead of tracebacking.
        print(json.dumps({"error": "IDs must be comma-separated integers"}), file=sys.stderr)
        sys.exit(1)

    result = embed_observations(ids)
    print(json.dumps(result))
Loading