From 7c81a0e7d4a07619b61a96b4ba014fb335bd3b61 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 29 Sep 2025 10:23:17 -0700 Subject: [PATCH 1/7] Switch to sonnet 4.5 --- .agents/base.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.agents/base.ts b/.agents/base.ts index eddc9d3162..394d66a79f 100644 --- a/.agents/base.ts +++ b/.agents/base.ts @@ -6,7 +6,7 @@ import type { SecretAgentDefinition } from './types/secret-agent-definition' const definition: SecretAgentDefinition = { id: 'base', publisher, - ...base('anthropic/claude-4-sonnet-20250522', 'normal'), + ...base('anthropic/claude-4.5-sonnet', 'normal'), } export default definition From 74330606e484c5d2f2390d5a79e03c279db47024 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 29 Sep 2025 10:23:50 -0700 Subject: [PATCH 2/7] [buffbench] Sonnet 4.5 --- evals/git-evals/run-eval-set.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/git-evals/run-eval-set.ts b/evals/git-evals/run-eval-set.ts index 2b6c6c7c2e..71ff206777 100644 --- a/evals/git-evals/run-eval-set.ts +++ b/evals/git-evals/run-eval-set.ts @@ -72,7 +72,7 @@ class RunEvalSetCommand extends Command { }), agent: Flags.string({ description: 'Codebuff agent id to use', - default: 'base-lite', + default: 'base', }), help: Flags.help({ char: 'h' }), } From 1ce959278da0528da2f4d3a24ac075016212c890 Mon Sep 17 00:00:00 2001 From: Charles Lien Date: Mon, 29 Sep 2025 11:01:50 -0700 Subject: [PATCH 3/7] [buffbench] add more logging to eval errors --- evals/git-evals/run-git-evals.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts index 696deb4181..fd82e72398 100644 --- a/evals/git-evals/run-git-evals.ts +++ b/evals/git-evals/run-git-evals.ts @@ -4,6 +4,7 @@ import path from 'path' import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs' import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk' +import { errorToObject } from '@codebuff/common/util/object' import { withTimeout } from '@codebuff/common/util/promise' import { generateCompactId } from '@codebuff/common/util/string' import { cloneDeep } from 'lodash' @@ -247,7 +248,7 @@ Explain your reasoning in detail.`, return { ...evalRun, judging_results: { - analysis: 'Judging failed due to error', + analysis: `Judging failed due to error:\n${judgingError instanceof Error ? errorToObject(judgingError) : JSON.stringify(judgingError)}`, strengths: [], weaknesses: ['Judging process encountered an error'], metrics: { From dfac77379e3ee4d419e743e21e0b28bf65e4c838 Mon Sep 17 00:00:00 2001 From: Charles Lien Date: Mon, 29 Sep 2025 11:05:57 -0700 Subject: [PATCH 4/7] [buffbench] fix error message --- evals/git-evals/run-git-evals.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts index fd82e72398..06e401b423 100644 --- a/evals/git-evals/run-git-evals.ts +++ b/evals/git-evals/run-git-evals.ts @@ -248,7 +248,11 @@ Explain your reasoning in detail.`, return { ...evalRun, judging_results: { - analysis: `Judging failed due to error:\n${judgingError instanceof Error ? errorToObject(judgingError) : JSON.stringify(judgingError)}`, + analysis: `Judging failed due to error:\n${JSON.stringify( + judgingError instanceof Error + ? errorToObject(judgingError) + : judgingError, + )}`, strengths: [], weaknesses: ['Judging process encountered an error'], metrics: { From 3fe6485bd1e67ddef571097c98a1020860e8851b Mon Sep 17 00:00:00 2001 From: Charles Lien Date: Mon, 29 Sep 2025 11:22:06 -0700 Subject: [PATCH 5/7] add routing priority for 4.5 --- backend/src/llm-apis/openrouter.ts | 5 +++++ common/src/old-constants.ts | 2 ++ 2 files changed, 7 insertions(+) diff --git a/backend/src/llm-apis/openrouter.ts b/backend/src/llm-apis/openrouter.ts index b2c72c6100..4e0d296aa0 100644 --- a/backend/src/llm-apis/openrouter.ts +++ b/backend/src/llm-apis/openrouter.ts @@ -12,6 +12,11 @@ const providerOrder = { 'Anthropic', 'Amazon Bedrock', ], + [models.openrouter_claude_sonnet_4_5]: [ + 'Google', + 'Anthropic', + 'Amazon Bedrock', + ], [models.openrouter_claude_opus_4]: ['Google', 'Anthropic'], } as const diff --git a/common/src/old-constants.ts b/common/src/old-constants.ts index 6bd48bf16b..41d49af0d1 100644 --- a/common/src/old-constants.ts +++ b/common/src/old-constants.ts @@ -194,6 +194,7 @@ export const geminiModels = { export type GeminiModel = (typeof geminiModels)[keyof typeof geminiModels] export const openrouterModels = { + openrouter_claude_sonnet_4_5: 'anthropic/claude-4.5-sonnet', openrouter_claude_sonnet_4: 'anthropic/claude-4-sonnet-20250522', openrouter_claude_opus_4: 'anthropic/claude-opus-4.1', openrouter_claude_3_5_haiku: 'anthropic/claude-3.5-haiku-20241022', @@ -259,6 +260,7 @@ export const shortModelNames = { 'gemini-2.5-pro': models.openrouter_gemini2_5_pro_preview, 'flash-2.5': models.openrouter_gemini2_5_flash, 'opus-4': models.openrouter_claude_opus_4, + 'sonnet-4.5': models.openrouter_claude_sonnet_4_5, 'sonnet-4': models.openrouter_claude_sonnet_4, 'sonnet-3.7': models.openrouter_claude_sonnet_4, 'sonnet-3.6': models.openrouter_claude_3_5_sonnet, From e6f5d09febf2de7d43c8ed327c6e9cc499bceec7 Mon Sep 17 00:00:00 2001 From: Charles Lien Date: Mon, 29 Sep 2025 12:52:59 -0700 Subject: [PATCH 6/7] tweak end_turn prompt examples --- backend/src/tools/definitions/tool/end-turn.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/backend/src/tools/definitions/tool/end-turn.ts b/backend/src/tools/definitions/tool/end-turn.ts index 7b4c3a2656..86b18938b2 100644 --- a/backend/src/tools/definitions/tool/end-turn.ts +++ b/backend/src/tools/definitions/tool/end-turn.ts @@ -13,7 +13,15 @@ Only use this tool to hand control back to the user. - Before calling: finish all pending steps, resolve tool results, and include any outputs the user needs to review. - Effect: Signals the UI to wait for the user's reply; any pending tool results will be ignored. -Correct usage: +*INCORRECT USAGE*: +${getToolCallString('some_tool_that_produces_results', { query: 'some example search term' }, false)} + ${getToolCallString(toolName, {})} + +*CORRECT USAGE*: +All done! Would you like some more help with xyz? + +${getToolCallString(toolName, {})} + `.trim(), } satisfies ToolDescription From ed8f94940bdc19843d9bd879a568cf29b5cd1726 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 29 Sep 2025 12:54:06 -0700 Subject: [PATCH 7/7] Revert default eval agent to base-lite --- evals/git-evals/run-eval-set.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/git-evals/run-eval-set.ts b/evals/git-evals/run-eval-set.ts index 71ff206777..2b6c6c7c2e 100644 --- a/evals/git-evals/run-eval-set.ts +++ b/evals/git-evals/run-eval-set.ts @@ -72,7 +72,7 @@ class RunEvalSetCommand extends Command { }), agent: Flags.string({ description: 'Codebuff agent id to use', - default: 'base', + default: 'base-lite', }), help: Flags.help({ char: 'h' }), }