Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/benchmarks/locomo/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,9 @@ export class LoCoMoBenchmark implements Benchmark {

const unifiedMessages: UnifiedMessage[] = messages.map((m) => ({
role: m.speaker === speakerA ? ("user" as const) : ("assistant" as const),
content: m.text,
content: m.blip_caption
? `${m.text} [shared image: ${m.blip_caption}]`
: m.text,
speaker: m.speaker,
}))

Expand Down
2 changes: 2 additions & 0 deletions src/benchmarks/locomo/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ export interface LoCoMoMessage {
speaker: string
dia_id: string
text: string
blip_caption?: string
img_url?: string[]
}

export interface LoCoMoQA {
Expand Down
79 changes: 79 additions & 0 deletions src/judges/cli.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import { spawn } from "child_process"
import type { Judge, JudgeConfig, JudgeInput, JudgeResult } from "../types/judge"
import type { ProviderPrompts } from "../types/prompts"
import { buildJudgePrompt, parseJudgeResponse, getJudgePrompt } from "./base"
import { logger } from "../utils/logger"
import { getModelConfig, ModelConfig } from "../utils/models"

/**
 * Call Claude CLI in print mode for text generation.
 * Uses subprocess to avoid API key requirements — the locally
 * authenticated `claude` command does the work.
 *
 * @param prompt     Full prompt text passed via `-p`.
 * @param modelAlias Model alias/id forwarded to `--model`.
 * @returns The trimmed `result` field of the CLI's JSON output, or raw
 *          trimmed stdout if the output is not valid JSON.
 * @throws Error when the CLI exits non-zero or is killed (e.g. by timeout).
 */
async function generateTextViaCli(prompt: string, modelAlias: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const claude = spawn('claude', [
      '-p', prompt,
      '--output-format', 'json',
      '--model', modelAlias,
      '--max-budget-usd', '1.00', // Allow $1 per evaluation (generous)
    ], {
      timeout: 180000, // 3 minute timeout — process is killed with SIGTERM on expiry
      cwd: process.cwd(),
    })

    let stdout = ''
    let stderr = ''

    // Accumulate output as text, not Buffer chunks.
    claude.stdout.setEncoding('utf8')
    claude.stderr.setEncoding('utf8')
    claude.stdout.on('data', (data) => { stdout += data })
    claude.stderr.on('data', (data) => { stderr += data })

    claude.on('close', (code, signal) => {
      if (code === 0) {
        try {
          // `--output-format json` wraps the answer as { result: string, ... }.
          const response = JSON.parse(stdout)
          resolve(response.result?.trim() || '')
        } catch {
          // CLI emitted plain text — return it as-is.
          resolve(stdout.trim())
        }
      } else if (signal) {
        // Killed by the spawn timeout or an external signal; `code` is null here,
        // so report the signal instead of "exited with code null".
        reject(new Error(`Claude CLI killed by ${signal} (likely timeout): ${stderr}`))
      } else {
        reject(new Error(`Claude CLI exited with code ${code}: ${stderr}`))
      }
    })

    claude.on('error', reject)
  })
}

/**
 * Judge backed by the locally installed `claude` CLI.
 * Authentication comes from the machine's CLI session, so no API key
 * is required; `config.apiKey` is ignored.
 */
export class CliJudge implements Judge {
  name = "cli"
  private modelConfig: ModelConfig | null = null
  private modelAlias: string = "sonnet"

  async initialize(config: JudgeConfig): Promise<void> {
    // Resolve the requested model alias (default "sonnet") to a full config.
    this.modelAlias = config.model || "sonnet"
    this.modelConfig = getModelConfig(this.modelAlias)
    logger.info(`Initialized CLI judge with model: ${this.modelConfig.displayName} (${this.modelConfig.id})`)
  }

  async evaluate(input: JudgeInput): Promise<JudgeResult> {
    const cfg = this.modelConfig
    if (!cfg) throw new Error("Judge not initialized")

    // Build the judging prompt, run it through the CLI, parse the verdict.
    const response = await generateTextViaCli(buildJudgePrompt(input), cfg.id)
    return parseJudgeResponse(response)
  }

  getPromptForQuestionType(questionType: string, providerPrompts?: ProviderPrompts): string {
    return getJudgePrompt(questionType, providerPrompts)
  }

  getModel(): import("ai").LanguageModel {
    // No AI SDK model backs this judge — callers must not request one.
    throw new Error("CLI judge does not expose an AI SDK model")
  }
}

export default CliJudge
4 changes: 3 additions & 1 deletion src/judges/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ import type { Judge, JudgeName } from "../types/judge"
import { OpenAIJudge } from "./openai"
import { AnthropicJudge } from "./anthropic"
import { GoogleJudge } from "./google"
import { CliJudge } from "./cli"

// Registry mapping each judge name to its constructor; consumed by createJudge().
const judges: Record<JudgeName, new () => Judge> = {
openai: OpenAIJudge,
anthropic: AnthropicJudge,
google: GoogleJudge,
cli: CliJudge,
}

export function createJudge(name: JudgeName): Judge {
Expand All @@ -21,5 +23,5 @@ export function getAvailableJudges(): JudgeName[] {
return Object.keys(judges) as JudgeName[]
}

export { OpenAIJudge, AnthropicJudge, GoogleJudge }
export { OpenAIJudge, AnthropicJudge, GoogleJudge, CliJudge }
export { buildJudgePrompt, parseJudgeResponse, getJudgePrompt } from "./base"
77 changes: 66 additions & 11 deletions src/orchestrator/phases/answer.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { readFileSync, existsSync } from "fs"
import { spawn } from "child_process"
import { createOpenAI } from "@ai-sdk/openai"
import { createAnthropic } from "@ai-sdk/anthropic"
import { createGoogleGenerativeAI } from "@ai-sdk/google"
Expand All @@ -15,13 +16,52 @@ import { buildContextString } from "../../types/prompts"
import { ConcurrentExecutor } from "../concurrent"
import { resolveConcurrency } from "../../types/concurrency"

/**
* Call Claude CLI in print mode for text generation.
* Uses subprocess to avoid API key requirements.
*/
async function generateTextViaCli(prompt: string, modelAlias: string): Promise<string> {
return new Promise((resolve, reject) => {
const claude = spawn("claude", [
"-p", prompt,
"--output-format", "json",
"--model", modelAlias,
"--max-budget-usd", "1.00",
], {
timeout: 600000, // 10 minutes for larger models like Opus
cwd: process.cwd(),
Comment on lines +23 to +32

This comment was marked as outdated.

})

let stdout = ""
let stderr = ""

claude.stdout.on("data", (data) => { stdout += data })
claude.stderr.on("data", (data) => { stderr += data })

claude.on("close", (code) => {
if (code === 0) {
try {
const response = JSON.parse(stdout)
resolve(response.result?.trim() || "")
} catch {
resolve(stdout.trim())
}
} else {
reject(new Error(`Claude CLI exited with code ${code}: ${stderr}`))
}
})

claude.on("error", reject)
})
}

type LanguageModel =
| ReturnType<typeof createOpenAI>
| ReturnType<typeof createAnthropic>
| ReturnType<typeof createGoogleGenerativeAI>

function getAnsweringModel(modelAlias: string): {
client: LanguageModel
client: LanguageModel | null
modelConfig: ModelConfig
} {
const modelConfig = getModelConfig(modelAlias || DEFAULT_ANSWERING_MODEL)
Expand All @@ -42,6 +82,12 @@ function getAnsweringModel(modelAlias: string): {
client: createGoogleGenerativeAI({ apiKey: config.googleApiKey }),
modelConfig,
}
case "cli":
// CLI uses subprocess instead of API client
return {
client: null,
modelConfig,
}
}
}

Expand Down Expand Up @@ -120,18 +166,27 @@ export async function runAnswerPhase(

const prompt = buildAnswerPrompt(question.question, context, questionDate, provider)

const params: Record<string, unknown> = {
model: client(modelConfig.id),
prompt,
maxTokens: modelConfig.defaultMaxTokens,
let text: string

if (modelConfig.provider === "cli") {
// Use CLI subprocess for Claude models
text = await generateTextViaCli(prompt, modelConfig.id)
} else {
// Use AI SDK for API-based models
const params: Record<string, unknown> = {
model: client!(modelConfig.id),
prompt,
maxTokens: modelConfig.defaultMaxTokens,
}

if (modelConfig.supportsTemperature) {
params.temperature = modelConfig.defaultTemperature
}

const result = await generateText(params as Parameters<typeof generateText>[0])
text = result.text
}

if (modelConfig.supportsTemperature) {
params.temperature = modelConfig.defaultTemperature
}

const { text } = await generateText(params as Parameters<typeof generateText>[0])

const durationMs = Date.now() - startTime
checkpointManager.updatePhase(checkpoint, question.questionId, "answer", {
status: "completed",
Expand Down
2 changes: 1 addition & 1 deletion src/orchestrator/phases/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export async function runSearchPhase(
try {
const results = await provider.search(question.question, {
containerTag,
limit: 10,
limit: 19,
threshold: 0.3,
})

Expand Down
103 changes: 103 additions & 0 deletions src/providers/anamnesis/embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""Embed benchmark observations into ChromaDB.
Called by the anamnesis provider's awaitIndexing phase.
Reads observation IDs from argv, fetches narratives from SQLite,
and adds them to the cm__claude-mem ChromaDB collection.
Uses the same Python environment as chroma-mcp to avoid version mismatches.
"""
import json
import os
import sqlite3
import sys

# ChromaDB — must match the version used by chroma-mcp (1.5.x)
import chromadb

DB_PATH = os.environ.get("ANAMNESIS_DB", os.path.expanduser("~/.claude-mem/claude-mem.db"))
VECTOR_PATH = os.environ.get("CHROMA_PATH", os.path.expanduser("~/.claude-mem/vector-db"))
COLLECTION = "cm__claude-mem"
BATCH_SIZE = 50 # ChromaDB handles batches well


def embed_observations(ids: list[int]) -> dict:
    """Fetch observations from SQLite and embed them into ChromaDB.

    Args:
        ids: Primary keys of rows in the ``observations`` table.

    Returns:
        Counts dict: ``{"embedded": int, "skipped": int, "errors": int}``.
    """
    if not ids:
        return {"embedded": 0, "skipped": 0, "errors": 0}

    # Read observations from SQLite; close the handle even if the query fails.
    db = sqlite3.connect(DB_PATH)
    try:
        db.row_factory = sqlite3.Row
        placeholders = ",".join("?" for _ in ids)
        # NOTE(review): `facts` is selected but not used in the document text below.
        rows = db.execute(
            f"SELECT id, title, subtitle, narrative, facts, namespace FROM observations WHERE id IN ({placeholders})",
            ids,
        ).fetchall()
    finally:
        db.close()

    if not rows:
        return {"embedded": 0, "skipped": 0, "errors": 0}

    # Connect to ChromaDB. get_or_create_collection avoids the ValueError that
    # get_collection raises on a fresh install where the collection is missing,
    # which would otherwise silently disable semantic search.
    client = chromadb.PersistentClient(path=VECTOR_PATH)
    col = client.get_or_create_collection(COLLECTION)

    embedded = 0
    skipped = 0
    errors = 0

    # Upsert in batches to bound memory use and request size.
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i : i + BATCH_SIZE]
        batch_ids = []
        batch_docs = []
        batch_metas = []

        for row in batch:
            obs_id = str(row["id"])
            # Build document text for embedding (same as what search would match against)
            parts = []
            if row["title"]:
                parts.append(row["title"])
            if row["subtitle"]:
                parts.append(row["subtitle"])
            if row["narrative"]:
                parts.append(row["narrative"])

            doc = "\n".join(parts)
            if not doc.strip():
                # Nothing meaningful to embed for this row.
                skipped += 1
                continue

            batch_ids.append(obs_id)
            batch_docs.append(doc[:8000])  # ChromaDB has doc size limits
            batch_metas.append({
                "source": "memorybench",
                "namespace": row["namespace"] or "",
                "title": row["title"] or "",
            })

        if batch_ids:
            try:
                col.upsert(
                    ids=batch_ids,
                    documents=batch_docs,
                    metadatas=batch_metas,
                )
                embedded += len(batch_ids)
            except Exception as e:
                # Report the failing batch on stderr and keep processing the rest.
                print(json.dumps({"error": str(e), "batch_start": i}), file=sys.stderr)
                errors += len(batch_ids)

    return {"embedded": embedded, "skipped": skipped, "errors": errors}


if __name__ == "__main__":
    # CLI entry point: embed.py <id1,id2,...>
    # Errors go to stderr so stdout stays machine-parseable JSON, matching the
    # per-batch error reporting inside embed_observations().
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: embed.py <id1,id2,...>"}), file=sys.stderr)
        sys.exit(1)

    try:
        ids = [int(x) for x in sys.argv[1].split(",") if x.strip()]
    except ValueError:
        # Non-integer id in the argument list — fail cleanly instead of tracebacking.
        print(json.dumps({"error": "IDs must be comma-separated integers"}), file=sys.stderr)
        sys.exit(1)

    result = embed_observations(ids)
    print(json.dumps(result))
Loading