Skip to content

Commit 5c8a0a3

Browse files
authored
Preserve in-progress message history when agent run errors (#517)
1 parent 7608629 commit 5c8a0a3

4 files changed

Lines changed: 354 additions & 13 deletions

File tree

packages/agent-runtime/src/__tests__/main-prompt.test.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ describe('mainPrompt', () => {
375375
it('should update consecutiveAssistantMessages when new prompt is received', async () => {
376376
const sessionState = getInitialSessionState(mockFileContext)
377377
sessionState.mainAgentState.stepsRemaining = 12
378+
const initialStepsRemaining = sessionState.mainAgentState.stepsRemaining
378379

379380
const action = {
380381
type: 'prompt' as const,
@@ -394,7 +395,7 @@ describe('mainPrompt', () => {
394395

395396
// When there's a new prompt, stepsRemaining should be exactly 1 less than its initial value
396397
expect(newSessionState.mainAgentState.stepsRemaining).toBe(
397-
sessionState.mainAgentState.stepsRemaining - 1,
398+
initialStepsRemaining - 1,
398399
)
399400
})
400401

packages/agent-runtime/src/run-agent-step.ts

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,17 @@ export const runAgentStep = async (
536536
}
537537
}
538538

539+
/**
540+
* Runs the agent loop.
541+
*
542+
* IMPORTANT: This function mutates `params.agentState` in place throughout the
543+
* run (not just at return time). Fields like `messageHistory`, `systemPrompt`,
544+
* `toolDefinitions`, `creditsUsed`, and `output` are updated as work progresses
545+
* so that callers holding a reference to the same object (e.g. the SDK's
546+
* `sessionState.mainAgentState`) see in-progress work immediately — which
547+
* matters when an error is thrown mid-run and the normal return path is
548+
* skipped.
549+
*/
539550
export async function loopAgentSteps(
540551
params: {
541552
addAgentStep: AddAgentStepFn
@@ -800,12 +811,13 @@ export async function loopAgentSteps(
800811
return cachedAdditionalToolDefinitions
801812
}
802813

803-
let currentAgentState: AgentState = {
804-
...initialAgentState,
805-
messageHistory: initialMessages,
806-
systemPrompt: system,
807-
toolDefinitions,
808-
}
814+
// Mutate initialAgentState so that in-progress work propagates back to the
815+
// caller's shared reference (e.g. SDK's sessionState.mainAgentState) even if
816+
// an error is thrown before we return.
817+
initialAgentState.messageHistory = initialMessages
818+
initialAgentState.systemPrompt = system
819+
initialAgentState.toolDefinitions = toolDefinitions
820+
let currentAgentState: AgentState = initialAgentState
809821

810822
// Convert tool definitions to Anthropic format for accurate token counting
811823
// Tool definitions are stored as { [name]: { description, inputSchema } }
@@ -908,7 +920,8 @@ export async function loopAgentSteps(
908920
} = programmaticResult
909921
n = generateN
910922

911-
currentAgentState = programmaticAgentState
923+
Object.assign(initialAgentState, programmaticAgentState)
924+
currentAgentState = initialAgentState
912925
totalSteps = stepNumber
913926

914927
shouldEndTurn = endTurn
@@ -989,7 +1002,8 @@ export async function loopAgentSteps(
9891002
logger.error('No runId found for agent state after finishing agent run')
9901003
}
9911004

992-
currentAgentState = newAgentState
1005+
Object.assign(initialAgentState, newAgentState)
1006+
currentAgentState = initialAgentState
9931007
shouldEndTurn = llmShouldEndTurn
9941008
nResponses = generatedResponses
9951009

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
import * as mainPromptModule from '@codebuff/agent-runtime/main-prompt'
2+
import { getInitialSessionState } from '@codebuff/common/types/session-state'
3+
import { getStubProjectFileContext } from '@codebuff/common/util/file'
4+
import { assistantMessage, userMessage } from '@codebuff/common/util/messages'
5+
import { afterEach, describe, expect, it, mock, spyOn } from 'bun:test'
6+
7+
import { CodebuffClient } from '../client'
8+
import * as databaseModule from '../impl/database'
9+
10+
interface ToolCallContentBlock {
11+
type: 'tool-call'
12+
toolCallId: string
13+
toolName: string
14+
input: Record<string, unknown>
15+
}
16+
17+
const setupDatabaseMocks = () => {
18+
spyOn(databaseModule, 'getUserInfoFromApiKey').mockResolvedValue({
19+
id: 'user-123',
20+
email: 'test@example.com',
21+
discord_id: null,
22+
referral_code: null,
23+
stripe_customer_id: null,
24+
banned: false,
25+
created_at: new Date('2024-01-01T00:00:00Z'),
26+
})
27+
spyOn(databaseModule, 'fetchAgentFromDatabase').mockResolvedValue(null)
28+
spyOn(databaseModule, 'startAgentRun').mockResolvedValue('run-1')
29+
spyOn(databaseModule, 'finishAgentRun').mockResolvedValue(undefined)
30+
spyOn(databaseModule, 'addAgentStep').mockResolvedValue('step-1')
31+
}
32+
33+
describe('Error preserves in-progress message history', () => {
34+
afterEach(() => {
35+
mock.restore()
36+
})
37+
38+
it('preserves in-progress assistant work on error (simulated via shared state mutation)', async () => {
39+
setupDatabaseMocks()
40+
41+
// Simulate the agent runtime:
42+
// 1. Mutates the shared session state with the user message and partial work
43+
// 2. Then throws due to a downstream timeout/service error
44+
spyOn(mainPromptModule, 'callMainPrompt').mockImplementation(
45+
async (params: Parameters<typeof mainPromptModule.callMainPrompt>[0]) => {
46+
const mainAgentState = params.action.sessionState.mainAgentState
47+
48+
// Match the real runtime's behavior: replace messageHistory with a new
49+
// array that includes the user prompt as its first entry. The SDK
50+
// detects runtime progress via reference inequality, so we must
51+
// reassign the array rather than pushing into it.
52+
mainAgentState.messageHistory = [
53+
...mainAgentState.messageHistory,
54+
{
55+
role: 'user',
56+
content: [{ type: 'text', text: 'Fix the bug in auth.ts' }],
57+
tags: ['USER_PROMPT'],
58+
},
59+
{
60+
role: 'assistant',
61+
content: [
62+
{ type: 'text', text: 'Let me read the auth file first.' },
63+
{
64+
type: 'tool-call',
65+
toolCallId: 'read-1',
66+
toolName: 'read_files',
67+
input: { paths: ['auth.ts'] },
68+
} as ToolCallContentBlock,
69+
],
70+
},
71+
{
72+
role: 'tool',
73+
toolCallId: 'read-1',
74+
toolName: 'read_files',
75+
content: [
76+
{
77+
type: 'json',
78+
value: [{ path: 'auth.ts', content: 'const auth = ...' }],
79+
},
80+
],
81+
},
82+
{
83+
role: 'assistant',
84+
content: [
85+
{ type: 'text', text: 'Found the issue, writing the fix now.' },
86+
{
87+
type: 'tool-call',
88+
toolCallId: 'write-1',
89+
toolName: 'write_file',
90+
input: { path: 'auth.ts', content: 'const auth = fixed' },
91+
} as ToolCallContentBlock,
92+
],
93+
},
94+
{
95+
role: 'tool',
96+
toolCallId: 'write-1',
97+
toolName: 'write_file',
98+
content: [{ type: 'json', value: { file: 'auth.ts', message: 'File written' } }],
99+
},
100+
]
101+
102+
// Now simulate a server timeout on the next LLM call
103+
const timeoutError = new Error('Service Unavailable') as Error & {
104+
statusCode: number
105+
responseBody: string
106+
}
107+
timeoutError.statusCode = 503
108+
timeoutError.responseBody = JSON.stringify({
109+
message: 'Request timeout after 30s',
110+
})
111+
throw timeoutError
112+
},
113+
)
114+
115+
const client = new CodebuffClient({ apiKey: 'test-key' })
116+
const result = await client.run({
117+
agent: 'base2',
118+
prompt: 'Fix the bug in auth.ts',
119+
})
120+
121+
// Error output with correct status code
122+
expect(result.output.type).toBe('error')
123+
const errorOutput = result.output as {
124+
type: 'error'
125+
message: string
126+
statusCode?: number
127+
}
128+
expect(errorOutput.statusCode).toBe(503)
129+
130+
const history = result.sessionState!.mainAgentState.messageHistory
131+
132+
// The user's prompt should appear exactly once
133+
const userPromptMessages = history.filter(
134+
(m) =>
135+
m.role === 'user' &&
136+
(m.content as Array<{ type: string; text?: string }>).some(
137+
(c) => c.type === 'text' && c.text?.includes('Fix the bug'),
138+
),
139+
)
140+
expect(userPromptMessages.length).toBe(1)
141+
142+
// Assistant text messages from both steps should be preserved
143+
const firstAssistantText = history.find(
144+
(m) =>
145+
m.role === 'assistant' &&
146+
(m.content as Array<{ type: string; text?: string }>).some(
147+
(c) => c.type === 'text' && c.text?.includes('read the auth file'),
148+
),
149+
)
150+
expect(firstAssistantText).toBeDefined()
151+
152+
const secondAssistantText = history.find(
153+
(m) =>
154+
m.role === 'assistant' &&
155+
(m.content as Array<{ type: string; text?: string }>).some(
156+
(c) => c.type === 'text' && c.text?.includes('writing the fix'),
157+
),
158+
)
159+
expect(secondAssistantText).toBeDefined()
160+
161+
// Both tool calls and both tool results should be preserved
162+
const readToolCall = history.find(
163+
(m) =>
164+
m.role === 'assistant' &&
165+
(m.content as Array<{ type: string; toolCallId?: string }>).some(
166+
(c) => c.type === 'tool-call' && c.toolCallId === 'read-1',
167+
),
168+
)
169+
expect(readToolCall).toBeDefined()
170+
171+
const writeToolCall = history.find(
172+
(m) =>
173+
m.role === 'assistant' &&
174+
(m.content as Array<{ type: string; toolCallId?: string }>).some(
175+
(c) => c.type === 'tool-call' && c.toolCallId === 'write-1',
176+
),
177+
)
178+
expect(writeToolCall).toBeDefined()
179+
180+
const readToolResult = history.find(
181+
(m) => m.role === 'tool' && m.toolCallId === 'read-1',
182+
)
183+
expect(readToolResult).toBeDefined()
184+
185+
const writeToolResult = history.find(
186+
(m) => m.role === 'tool' && m.toolCallId === 'write-1',
187+
)
188+
expect(writeToolResult).toBeDefined()
189+
})
190+
191+
it('a subsequent run after error includes the preserved in-progress history', async () => {
192+
setupDatabaseMocks()
193+
194+
// Run 1: agent does some work then hits an error
195+
spyOn(mainPromptModule, 'callMainPrompt').mockImplementation(
196+
async (params: Parameters<typeof mainPromptModule.callMainPrompt>[0]) => {
197+
const mainAgentState = params.action.sessionState.mainAgentState
198+
199+
mainAgentState.messageHistory = [
200+
...mainAgentState.messageHistory,
201+
{
202+
role: 'user',
203+
content: [{ type: 'text', text: 'Investigate the login bug' }],
204+
tags: ['USER_PROMPT'],
205+
},
206+
assistantMessage('I found the problem in auth.ts on line 42.'),
207+
{
208+
role: 'assistant',
209+
content: [
210+
{
211+
type: 'tool-call',
212+
toolCallId: 'read-login',
213+
toolName: 'read_files',
214+
input: { paths: ['login.ts'] },
215+
} as ToolCallContentBlock,
216+
],
217+
},
218+
{
219+
role: 'tool',
220+
toolCallId: 'read-login',
221+
toolName: 'read_files',
222+
content: [{ type: 'json', value: [{ path: 'login.ts', content: 'login code' }] }],
223+
},
224+
]
225+
226+
const error = new Error('Service Unavailable') as Error & {
227+
statusCode: number
228+
}
229+
error.statusCode = 503
230+
throw error
231+
},
232+
)
233+
234+
const client = new CodebuffClient({ apiKey: 'test-key' })
235+
const firstResult = await client.run({
236+
agent: 'base2',
237+
prompt: 'Investigate the login bug',
238+
})
239+
240+
expect(firstResult.output.type).toBe('error')
241+
242+
// Run 2: use the failed run as previousRun
243+
mock.restore()
244+
setupDatabaseMocks()
245+
246+
let historyReceivedByRuntime: unknown[] | undefined
247+
spyOn(mainPromptModule, 'callMainPrompt').mockImplementation(
248+
async (params: Parameters<typeof mainPromptModule.callMainPrompt>[0]) => {
249+
const { sendAction, promptId } = params
250+
historyReceivedByRuntime = [
251+
...params.action.sessionState.mainAgentState.messageHistory,
252+
]
253+
254+
const responseSessionState = getInitialSessionState(
255+
getStubProjectFileContext(),
256+
)
257+
responseSessionState.mainAgentState.messageHistory = [
258+
...params.action.sessionState.mainAgentState.messageHistory,
259+
userMessage('Now try again'),
260+
assistantMessage('Continuing with the fix.'),
261+
]
262+
263+
await sendAction({
264+
action: {
265+
type: 'prompt-response',
266+
promptId,
267+
sessionState: responseSessionState,
268+
output: { type: 'lastMessage', value: [] },
269+
},
270+
})
271+
272+
return {
273+
sessionState: responseSessionState,
274+
output: { type: 'lastMessage' as const, value: [] },
275+
}
276+
},
277+
)
278+
279+
const secondResult = await client.run({
280+
agent: 'base2',
281+
prompt: 'Now try again',
282+
previousRun: firstResult,
283+
})
284+
285+
// The runtime should have received history containing the work from the first run
286+
expect(historyReceivedByRuntime).toBeDefined()
287+
const receivedReadCall = historyReceivedByRuntime!.find(
288+
(m) =>
289+
(m as { role: string }).role === 'assistant' &&
290+
((m as { content: Array<{ type: string; toolCallId?: string }> })
291+
.content ?? []).some(
292+
(c) => c.type === 'tool-call' && c.toolCallId === 'read-login',
293+
),
294+
)
295+
expect(receivedReadCall).toBeDefined()
296+
297+
const receivedToolResult = historyReceivedByRuntime!.find(
298+
(m) =>
299+
(m as { role: string }).role === 'tool' &&
300+
(m as { toolCallId: string }).toolCallId === 'read-login',
301+
)
302+
expect(receivedToolResult).toBeDefined()
303+
304+
// Final result should preserve history
305+
const finalHistory = secondResult.sessionState!.mainAgentState.messageHistory
306+
const finalReadCall = finalHistory.find(
307+
(m) =>
308+
m.role === 'assistant' &&
309+
(m.content as Array<{ type: string; toolCallId?: string }>).some(
310+
(c) => c.type === 'tool-call' && c.toolCallId === 'read-login',
311+
),
312+
)
313+
expect(finalReadCall).toBeDefined()
314+
})
315+
})

0 commit comments

Comments
 (0)