@@ -544,8 +544,15 @@ def read_binary_file(url):
 async def process_chat(chat, provider_id=None):
     if not chat:
         raise Exception("No chat provided")
-    if "stream" not in chat:
-        chat["stream"] = False
+    # WORKAROUND: Force stream=false for all provider requests.
+    # chat_handler() (the /v1/chat/completions endpoint) does not natively support
+    # streaming: it always collects the full response and returns JSON. If a client
+    # sends stream=true and we pass it through, the upstream provider (e.g. Ollama)
+    # returns SSE (text/event-stream), which response_json() cannot parse as JSON,
+    # causing "Expecting value: line 1 column 1 (char 0)" errors and HTTP 500s.
+    # Instead we force non-streaming here, and chat_handler converts the JSON response
+    # back to SSE chunks if the client originally requested streaming.
+    chat["stream"] = False
     # Some providers don't support empty tools
     if "tools" in chat and (chat["tools"] is None or len(chat["tools"]) == 0):
         del chat["tools"]
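
The parse failure the comment above refers to is the generic error a JSON decoder raises when pointed at an SSE body. A minimal sketch; the SSE payload below is illustrative, not captured from a real provider:

    import json

    # Shape of a typical SSE line from a streaming provider (illustrative).
    sse_body = 'data: {"choices":[{"delta":{"content":"Hi"}}]}\n\n'

    try:
        json.loads(sse_body)  # roughly what a response_json()-style helper does
    except json.JSONDecodeError as e:
        print(e)  # Expecting value: line 1 column 1 (char 0)
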
@@ -4127,11 +4134,43 @@ async def chat_handler(request):
 
     try:
         chat = await request.json()
+        # Remember whether the client requested streaming before process_chat
+        # forces it to False (see WORKAROUND in process_chat).
+        client_wants_stream = chat.get("stream", False)
         context = {"chat": chat, "request": request, "user": g_app.get_username(request)}
         metadata = chat.get("metadata", {})
         context["threadId"] = metadata.get("threadId", None)
         context["tools"] = metadata.get("tools", "all")
         response = await g_app.chat_completion(chat, context)
+        if client_wants_stream:
+            # WORKAROUND: Convert the non-streaming JSON response to SSE chunks.
+            # Clients like the Vercel AI SDK (@ai-sdk/openai-compatible) send
+            # stream=true and parse the response as Server-Sent Events with
+            # "chat.completion.chunk" objects containing a "delta" field.
+            # Since process_chat forces stream=false and we always get back a
+            # regular "chat.completion" JSON object, we re-wrap it here as a
+            # single SSE chunk so streaming clients can parse it correctly.
+            sse_response = web.StreamResponse(
+                status=200,
+                headers={"Content-Type": "text/event-stream", "Cache-Control": "no-cache"},
+            )
+            await sse_response.prepare(request)
+            for choice in response.get("choices", []):
+                chunk = {
+                    "id": response.get("id", ""),
+                    "object": "chat.completion.chunk",
+                    "created": response.get("created", 0),
+                    "model": response.get("model", ""),
+                    "choices": [{
+                        "index": choice.get("index", 0),
+                        "delta": {"role": "assistant", "content": choice.get("message", {}).get("content", "")},
+                        "finish_reason": choice.get("finish_reason"),
+                    }],
+                    "usage": response.get("usage"),
+                }
+                await sse_response.write(f"data: {json.dumps(chunk)}\n\n".encode())
+            await sse_response.write(b"data: [DONE]\n\n")
+            return sse_response
         return web.json_response(response)
     except Exception as e:
         return web.json_response(to_error_response(e), status=500)
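
A streaming client sees the converted response as ordinary OpenAI-style SSE, just delivered in one chunk. A minimal consumer sketch, assuming aiohttp on the client side; the URL, port, and model name are placeholders, not values from this codebase:

    import asyncio
    import json

    import aiohttp

    async def main():
        url = "http://localhost:8000/v1/chat/completions"  # placeholder endpoint
        body = {"model": "llama3", "stream": True,  # placeholder model name
                "messages": [{"role": "user", "content": "Hello"}]}
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=body) as resp:
                async for raw in resp.content:  # iterate the SSE body line by line
                    line = raw.decode().strip()
                    if not line.startswith("data: "):
                        continue  # skip blank separator lines
                    payload = line[len("data: "):]
                    if payload == "[DONE]":
                        break
                    chunk = json.loads(payload)
                    delta = chunk["choices"][0]["delta"]
                    print(delta.get("content", ""), end="", flush=True)

    asyncio.run(main())

Because the whole completion arrives as a single chunk, clients get no incremental tokens, but the wire format stays protocol-compatible with what stream-aware SDKs expect.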