CodebuffAI
diff --git a/.agents/base/ask.ts b/.agents/base/ask.ts
Lines changed: 1 addition & 23 deletions
diff --git a/.agents/base/base-prompts.ts b/.agents/base/base-prompts.ts
Lines changed: 1 addition & 19 deletions
diff --git a/.agents/base2/base2-evals.ts b/.agents/base2/base2-evals.ts
Lines changed: 8 additions & 0 deletions
diff --git a/.agents/base2/base2.ts b/.agents/base2/base2.ts
Lines changed: 10 additions & 7 deletions
diff --git a/.agents/commander.ts b/.agents/commander.ts
Lines changed: 21 additions & 2 deletions
diff --git a/.agents/editor/best-of-n/editor-implementor.ts b/.agents/editor/best-of-n/editor-implementor.ts
Lines changed: 61 additions & 13 deletions
@@ -99,17 +99,7 @@ Messages from the system are surrounded by <system>${closeXml('system')} or <sys
 
 ## Verifying Your Changes at the End of Your Response
 
-### User has a \`codebuff.json\`
-
-If the user has a \`codebuff.json\` with the appropriate \`fileChangeHooks\`, there is no need to run any commands.
-
-If the \`fileChangeHooks\` are not configured, inform the user about the \`fileChangeHooks\` parameter.
-
-### User has no \`codebuff.json\`
-
-If this is the case, inform the user know about the \`/init\` command (within Codebuff, not a terminal command).
-
-Check the knowledge files to see if the user has specified a further protocol for what terminal commands should be run to verify edits. For example, a \`knowledge.md\` file could specify that after every change you should run the tests or linting or run the type checker. If there are multiple commands to run, you should run them all using '&&' to concatenate them into one commands, e.g. \`npm run lint && npm run test\`.
+Check the knowledge files to see if the user has specified a protocol for what commands should be run to verify edits. If so, follow it. Otherwise, suggest the most relevant checks for the change.
 
 ## Example Response (Simplified - Demonstrating Rules)
 
@@ -152,18 +142,6 @@ What is included in knowledge files:
 
 If the user sends you the url to a page that is helpful now or could be helpful in the future (e.g. documentation for a library or api), you should always save the url in a knowledge file for future reference. Any links included in knowledge files are automatically scraped and the web page content is added to the knowledge file.
 
-# Codebuff Configuration (codebuff.json)
-
-## Schema
-
-The following describes the structure of the \`./codebuff.json\` configuration file that users might have in their project root. You can use this to understand user settings if they mention them.
-
-${PLACEHOLDER.CONFIG_SCHEMA}
-
-## Background Processes
-
-The user does not have access to these outputs. Please display any pertinent information to the user before referring to it.
-
 ${PLACEHOLDER.FILE_TREE_PROMPT}
 
 ${PLACEHOLDER.SYSTEM_INFO_PROMPT}
 
@@ -104,16 +104,6 @@ ${
 
 ## Verifying Your Changes at the End of Your Response
 
-### User has a \`codebuff.json\`
-
-If the user has a \`codebuff.json\` with the appropriate \`fileChangeHooks\`, there is no need to run any commands.
-
-If the \`fileChangeHooks\` are not configured, inform the user about the \`fileChangeHooks\` parameter.
-
-### User has no \`codebuff.json\`
-
-If this is the case, inform the user know about the \`/init\` command (within Codebuff, not a terminal command).
-
 Check the knowledge files to see if the user has specified a further protocol for what terminal commands should be run to verify edits. For example, a \`knowledge.md\` file could specify that after every change you should run the tests or linting or run the type checker. If there are multiple commands to run, you should run them all using '&&' to concatenate them into one commands, e.g. \`npm run lint && npm run test\`.
 
 ## Example Response (Simplified - Demonstrating Rules)
@@ -202,15 +192,7 @@ Once again: BE CONCISE!
 
 If the user sends you the url to a page that is helpful now or could be helpful in the future (e.g. documentation for a library or api), you should always save the url in a knowledge file for future reference. Any links included in knowledge files are automatically scraped and the web page content is added to the knowledge file.
 
-# Codebuff Configuration (codebuff.json)
-
-## Schema
-
-The following describes the structure of the \`./codebuff.json\` configuration file that users might have in their project root. You can use this to understand user settings if they mention them.
-
-${PLACEHOLDER.CONFIG_SCHEMA}
-
-## Background Processes
+# Background Processes
 
 The user does not have access to these outputs. Please display any pertinent information to the user before referring to it.
 
 
@@ -0,0 +1,8 @@
+import { createBase2 } from './base2'
+
+const definition = { // Agent definition for evals: the 'default' base2 definition built with noAskUser: true
+  ...createBase2('default', { noAskUser: true }), // NOTE(review): noAskUser presumably removes the ask_user tool/instructions — confirm in createBase2
+  id: 'base2-evals', // listed after the spread, so it replaces any id createBase2 returned
+  displayName: 'Buffy the Evals Orchestrator', // likewise replaces any displayName from the spread
+}
+export default definition
@@ -75,7 +75,7 @@ export function createBase2(
       isDefault && 'thinker',
       isLite && 'editor-gpt-5',
       isDefault && 'editor',
-      isMax && 'editor-multi-prompt2',
+      isMax && 'editor-multi-prompt',
       isMax && 'thinker-best-of-n-opus',
       !isLite && 'code-reviewer',
       'context-pruner',
@@ -138,7 +138,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u
     (isDefault || isMax) &&
       `- Spawn the ${isDefault ? 'thinker' : 'thinker-best-of-n-opus'} after gathering context to solve complex problems or when the user asks you to think about a problem.`,
     isMax &&
-      `- IMPORTANT: You must spawn the editor-multi-prompt2 agent to implement the changes after you have gathered all the context you need. You must spawn this agent for non-trivial changes, since it writes much better code than you would with the str_replace or write_file tools. Don't spawn the editor in parallel with context-gathering agents.`,
+      `- IMPORTANT: You must spawn the editor-multi-prompt agent to implement the changes after you have gathered all the context you need. You must spawn this agent for non-trivial changes, since it writes much better code than you would with the str_replace or write_file tools. Don't spawn the editor in parallel with context-gathering agents.`,
     '- Spawn commanders sequentially if the second command depends on the the first.',
     !isFast &&
       !isLite &&
@@ -165,6 +165,7 @@ ${buildArray(
   isFast &&
     '- Prioritize speed: quickly getting the user request done is your first priority. Do not call any unnecessary tools. Spawn more agents in parallel to speed up the process. Be extremely concise in your responses. Use 2 words where you would have used 2 sentences.',
   '- If a tool fails, try again, or try a different tool or approach.',
+  '- **Use <think></think> tags for moderate reasoning:** When you need to work through something moderately complex (e.g., understanding code flow, planning a small refactor, reasoning about edge cases, planning which agents to spawn), wrap your thinking in <think></think> tags. Spawn the thinker agent for anything more complex.',
   '- Context is managed for you. The context-pruner agent will automatically run as needed. Gather as much context as you need without worrying about it.',
   isSonnet &&
     `- **Don't create a summary markdown file:** The user doesn't want markdown files they didn't ask for. Don't create them.`,
@@ -192,7 +193,7 @@ ${
       ? '[ You implement the changes using the str_replace or write_file tools ]'
       : isLite
         ? '[ You implement the changes using the editor-gpt-5 agent ]'
-        : '[ You implement the changes using the editor-multi-prompt2 agent ]'
+        : '[ You implement the changes using the editor-multi-prompt agent ]'
 }
 
 ${
@@ -311,14 +312,16 @@ ${buildArray(
     'If needed, use the ask_user tool to ask the user for clarification on their request or alternate implementation strategies. It is good to get context on the codebase before asking questions so you can ask informed questions.',
   (isDefault || isMax) &&
     `- For any task requiring 3+ steps, use the write_todos tool to write out your step-by-step implementation plan. Include ALL of the applicable tasks in the list.${isFast ? '' : ' You should include a step to review the changes after you have implemented the changes.'}:${hasNoValidation ? '' : ' You should include at least one step to validate/test your changes: be specific about whether to typecheck, run tests, run lints, etc.'} You may be able to do reviewing and validation in parallel in the same step. Skip write_todos for simple tasks like quick edits or answering questions.`,
-  isDefault &&
-    `- For complex problems, spawn the thinker agent to help find the best solution, or when the user asks you to think about a problem.`,
+  (isDefault || isMax) &&
+    `- For quick problems, use <think></think> tags to think through the problem. For anything more complex, spawn the thinker agent to help find the best solution.`,
+  isMax &&
+    'You should use <think></think> tags all the time to help arrive at the best solution!',
   isLite &&
     '- IMPORTANT: You must spawn the editor-gpt-5 agent to implement the changes after you have gathered all the context you need. This agent will do the best job of implementing the changes so you must spawn it for all changes. Do not pass any prompt or params to the editor agent when spawning it. It will make its own best choices of what to do.',
   isDefault &&
     '- IMPORTANT: You must spawn the editor agent to implement the changes after you have gathered all the context you need. This agent will do the best job of implementing the changes so you must spawn it for all non-trivial changes. Do not pass any prompt or params to the editor agent when spawning it. It will make its own best choices of what to do.',
   isMax &&
-    `- IMPORTANT: You must spawn the editor-multi-prompt2 agent to implement non-trivial code changes, since it will generate the best code changes from multiple implementation proposals. This is the best way to make high quality code changes -- strongly prefer using this agent over the str_replace or write_file tools, unless the change is very straightforward and obvious.`,
+    `- IMPORTANT: You must spawn the editor-multi-prompt agent to implement non-trivial code changes, since it will generate the best code changes from multiple implementation proposals. This is the best way to make high quality code changes -- strongly prefer using this agent over the str_replace or write_file tools, unless the change is very straightforward and obvious.`,
   isFast &&
     '- Implement the changes using the str_replace or write_file tools. Implement all the changes in one go.',
   isFast &&
@@ -355,7 +358,7 @@ function buildImplementationStepPrompt({
     isMax &&
       `Keep working until the user's request is completely satisfied${!hasNoValidation ? ' and validated' : ''}, or until you require more information from the user.`,
     isMax &&
-      `You must spawn the 'editor-multi-prompt2' agent to implement code changes, since it will generate the best code changes.`,
+      `You must spawn the 'editor-multi-prompt' agent to implement code changes, since it will generate the best code changes.`,
     (isDefault || isMax) &&
       'Spawn code-reviewer to review the changes after you have implemented the changes and in parallel with typechecking or testing.',
     `After completing the user request, summarize your changes in a sentence${isFast ? '' : ' or a few short bullet points'}.${isSonnet ? " Don't create any summary markdown files or example documentation files, unless asked by the user." : ''} Don't repeat yourself, especially if you have already concluded and summarized the changes in a previous step -- just end your turn.`,
 
@@ -10,7 +10,8 @@ const commander: AgentDefinition = {
   model: 'anthropic/claude-haiku-4.5',
   displayName: 'Commander',
   spawnerPrompt:
-    'Runs a single terminal command and describes its output based on what information is requested.',
+    'Runs a single terminal command and describes its output using an LLM based on what information is requested.',
+
   inputSchema: {
     prompt: {
       type: 'string',
@@ -28,6 +29,11 @@ const commander: AgentDefinition = {
           type: 'number',
           description: 'Set to -1 for no timeout. Default 30',
         },
+        rawOutput: {
+          type: 'boolean',
+          description:
+            'If true, returns the full command output without summarization. Defaults to false.',
+        },
       },
       required: ['command'],
     },
@@ -60,16 +66,29 @@ Do not use any tools! Only analyze the output of the command.`,
     }
 
     const timeout_seconds = params?.timeout_seconds as number | undefined
+    const rawOutput = params?.rawOutput as boolean | undefined
 
     // Run the command
-    yield {
+    const { toolResult } = yield {
       toolName: 'run_terminal_command',
       input: {
         command,
         ...(timeout_seconds !== undefined && { timeout_seconds }),
       },
     }
 
+    if (rawOutput) {
+      // Return the raw command output without summarization
+      const result = toolResult?.[0]
+      const output = result?.type === 'json' ? result.value : ''
+      yield {
+        toolName: 'set_output',
+        input: { output },
+        includeToolCall: false,
+      }
+      return
+    }
+
     // Let the model analyze and describe the output
     yield 'STEP'
   },
 
@@ -19,28 +19,27 @@ export const createBestOfNImplementor = (options: {
         ? 'anthropic/claude-opus-4.5'
         : isGemini
           ? 'google/gemini-3-pro-preview'
-          : 'openai/gpt-5.2',
+          : 'openai/gpt-5.1',
     displayName: 'Implementation Generator',
     spawnerPrompt:
-      'Generates a complete implementation plan with all code changes',
+      'Generates a complete implementation using propose_* tools that draft changes without applying them',
 
     includeMessageHistory: true,
     inheritParentSystemPrompt: true,
 
-    toolNames: [],
+    toolNames: ['propose_write_file', 'propose_str_replace'],
     spawnableAgents: [],
 
     inputSchema: {},
-    outputMode: 'last_message',
+    outputMode: 'structured_output',
 
     instructionsPrompt: `You are an expert code editor with deep understanding of software engineering principles. You were spawned to generate an implementation for the user's request.
     
-Your task is to write out ALL the code changes needed to complete the user's request in a single comprehensive response.
+Your task is to write out ALL the code changes needed to complete the user's request.
 
-Important: You can not make any other tool calls besides editing files. You cannot read more files, write todos, spawn agents, or set output. Do not call any of these tools!
-
-Write out what changes you would make using the tool call format below. Use this exact format for each file change:
+IMPORTANT: Use propose_str_replace and propose_write_file tools to make your edits. These tools draft changes without actually applying them - they will be reviewed first. DO NOT use any other tools. Do not spawn any agents, read files, or set output.
 
+You can make multiple tool calls across multiple steps to complete the implementation. Only the file changes will be passed on, so you can say whatever you want to help you think. Do not write any final summary as that would be a waste of tokens because no one is reading it.
 <codebuff_tool_call>
 {
   "cb_tool_name": "str_replace",
@@ -116,15 +115,64 @@ More style notes:
 - Optional arguments are code smell and worse than required arguments.
 - New components often should be added to a new file, not added to an existing file.
 
-Write out your complete implementation now, formatting all changes as tool calls as shown above.`,
-
-    handleSteps: function* () {
-      yield 'STEP'
+Write out your complete implementation now. Do not write any final summary.`,
+
+    handleSteps: function* ({ agentState: initialAgentState }) { // Collects the tool calls/results produced after this point and emits them via set_output
+      const initialMessageHistoryLength =
+        initialAgentState.messageHistory.length
+
+      const { agentState } = yield 'STEP_ALL' // NOTE(review): presumably resumes once the model has finished all its steps — confirm against the handleSteps contract
+
+      const postMessages = agentState.messageHistory.slice( // only the messages appended after the handler started
+        initialMessageHistoryLength,
+      )
+
+      // Collect every 'tool-call' part of the new assistant messages as { toolName, input }
+      const toolCalls: { toolName: string; input: any }[] = []
+      for (const message of postMessages) {
+        if (message.role !== 'assistant' || !Array.isArray(message.content))
+          continue
+        for (const part of message.content) {
+          if (part.type === 'tool-call') {
+            toolCalls.push({
+              toolName: part.toolName,
+              input: part.input ?? (part as any).args ?? {}, // falls back to `args`, then to an empty object, when `input` is null/undefined
+            })
+          }
+        }
+      }
+
+      // Collect the truthy values of 'json' parts from the new tool messages (the unified diffs are read from these below)
+      const toolResults: any[] = []
+      for (const message of postMessages) {
+        if (message.role !== 'tool' || !Array.isArray(message.content)) continue
+        for (const part of message.content) {
+          if (part.type === 'json' && part.value) {
+            toolResults.push(part.value)
+          }
+        }
+      }
+
+      // Join the results that carry a unifiedDiff into one string, each under a `--- <file> ---` heading (presumably for a downstream selector to review)
+      const unifiedDiffs = toolResults
+        .filter((result: any) => result.unifiedDiff)
+        .map((result: any) => `--- ${result.file} ---\n${result.unifiedDiff}`)
+        .join('\n\n')
+
+      yield {
+        toolName: 'set_output',
+        input: {
+          toolCalls,
+          toolResults,
+          unifiedDiffs,
+        },
+        includeToolCall: false, // NOTE(review): presumably keeps this set_output call out of the message history — confirm
+      }
     },
   }
 }
 const definition = {
-  ...createBestOfNImplementor({ model: 'sonnet' }),
+  ...createBestOfNImplementor({ model: 'opus' }), // the default 'editor-implementor' export is built with the 'opus' option instead of 'sonnet'
   id: 'editor-implementor',
 }
 export default definition