
Commit e5d3d6a

Merge pull request #5 from ObolNetwork/fix/streaming-sse-passthrough
fix: handle streaming requests in chat_handler
2 parents: 7331044 + 6b5a049

1 file changed: llms/main.py (41 additions, 2 deletions)
@@ -544,8 +544,15 @@ def read_binary_file(url):
 async def process_chat(chat, provider_id=None):
     if not chat:
         raise Exception("No chat provided")
-    if "stream" not in chat:
-        chat["stream"] = False
+    # WORKAROUND: Force stream=false for all provider requests.
+    # chat_handler() (the /v1/chat/completions endpoint) does not natively support
+    # streaming — it always collects the full response and returns JSON. If a client
+    # sends stream=true and we pass it through, the upstream provider (e.g. Ollama)
+    # returns SSE (text/event-stream), which response_json() cannot parse as JSON,
+    # causing "Expecting value: line 1 column 1 (char 0)" errors and HTTP 500s.
+    # Instead we force non-streaming here, and chat_handler converts the JSON response
+    # back to SSE chunks if the client originally requested streaming.
+    chat["stream"] = False
     # Some providers don't support empty tools
     if "tools" in chat and (chat["tools"] is None or len(chat["tools"]) == 0):
         del chat["tools"]
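
For context on the error cited in the comment above: an SSE body begins with a "data:" prefix rather than a JSON value, so any plain json.loads call on it fails at the first character. A minimal sketch reproducing the failure (the payload string is illustrative, not taken from the repo):

import json

# What an SSE-speaking provider sends back when stream=true is passed through.
# (Illustrative payload; real chunks carry a full chat.completion.chunk object.)
sse_body = 'data: {"choices": [{"delta": {"content": "Hi"}}]}\n\n'

try:
    json.loads(sse_body)  # response_json() attempts something like this
except json.JSONDecodeError as e:
    # The first character is "d", not a JSON value, hence:
    print(e)  # Expecting value: line 1 column 1 (char 0)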
@@ -4127,11 +4134,43 @@ async def chat_handler(request):
 
     try:
         chat = await request.json()
+        # Remember whether the client requested streaming before process_chat
+        # forces it to False (see WORKAROUND in process_chat).
+        client_wants_stream = chat.get("stream", False)
         context = {"chat": chat, "request": request, "user": g_app.get_username(request)}
         metadata = chat.get("metadata", {})
         context["threadId"] = metadata.get("threadId", None)
         context["tools"] = metadata.get("tools", "all")
         response = await g_app.chat_completion(chat, context)
+        if client_wants_stream:
+            # WORKAROUND: Convert the non-streaming JSON response to SSE chunks.
+            # Clients like the Vercel AI SDK (@ai-sdk/openai-compatible) send
+            # stream=true and parse the response as Server-Sent Events with
+            # "chat.completion.chunk" objects containing a "delta" field.
+            # Since process_chat forces stream=false and we always get back a
+            # regular "chat.completion" JSON object, we re-wrap it here as a
+            # single SSE chunk so streaming clients can parse it correctly.
+            sse_response = web.StreamResponse(
+                status=200,
+                headers={"Content-Type": "text/event-stream", "Cache-Control": "no-cache"},
+            )
+            await sse_response.prepare(request)
+            for choice in response.get("choices", []):
+                chunk = {
+                    "id": response.get("id", ""),
+                    "object": "chat.completion.chunk",
+                    "created": response.get("created", 0),
+                    "model": response.get("model", ""),
+                    "choices": [{
+                        "index": choice.get("index", 0),
+                        "delta": {"role": "assistant", "content": choice.get("message", {}).get("content", "")},
+                        "finish_reason": choice.get("finish_reason"),
+                    }],
+                    "usage": response.get("usage"),
+                }
+                await sse_response.write(f"data: {json.dumps(chunk)}\n\n".encode())
+            await sse_response.write(b"data: [DONE]\n\n")
+            return sse_response
         return web.json_response(response)
     except Exception as e:
         return web.json_response(to_error_response(e), status=500)
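
To sanity-check the new behaviour from the client side, here is a minimal consumer sketch. The endpoint URL, port, and model name are assumptions for illustration; the parsing logic matches what SSE frames of this shape require:

import asyncio
import json
import aiohttp

async def main():
    # Hypothetical local endpoint; adjust host/port and model to your deployment.
    payload = {"model": "llama3", "stream": True,
               "messages": [{"role": "user", "content": "Hello"}]}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/v1/chat/completions",
                                json=payload) as resp:
            assert resp.headers["Content-Type"].startswith("text/event-stream")
            async for raw in resp.content:          # one SSE line at a time
                line = raw.decode().strip()
                if not line.startswith("data: "):
                    continue
                data = line[len("data: "):]
                if data == "[DONE]":                # terminator written by chat_handler
                    break
                chunk = json.loads(data)            # a chat.completion.chunk object
                delta = chunk["choices"][0]["delta"]
                print(delta.get("content", ""), end="", flush=True)

asyncio.run(main())

Note that because chat_handler writes one chunk per choice followed by the [DONE] terminator, a streaming client receives the entire completion in a single frame rather than token by token: the wire format is valid SSE, only the incremental delivery is collapsed.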
