diff --git a/extras/chat_template_examples/chat_template_gemma.jinja b/extras/chat_template_examples/chat_template_gemma.jinja new file mode 100644 index 0000000000..4652e1a524 --- /dev/null +++ b/extras/chat_template_examples/chat_template_gemma.jinja @@ -0,0 +1,352 @@ +{#- + Modifications to original chat template: + * Ignoring "response" field from tool definition (it was broken, but also it is inline now with other servings) + * Whenever template renders tool_call arguments, those are converted from string to dictlike format using from_json +#} +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {#- Patched out: Here was "response" field rendering for function declaration, however broken and not inline with other serving such as llama.cpp #} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {#- Patch: Usage of from_json to convert tool_call arguments from string to dictlike #} + {%- set function = tool_call['function'] -%} + {%- set args = function['arguments'] | from_json -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if args is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in args | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- if not enable_thinking | default(false) -%} + {{- '<|channel>thought\n' -}} + {%- endif -%} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/extras/chat_template_examples/chat_template_qwen35.jinja b/extras/chat_template_examples/chat_template_qwen35.jinja new file mode 100644 index 0000000000..a49778eaa4 --- /dev/null +++ b/extras/chat_template_examples/chat_template_qwen35.jinja @@ -0,0 +1,159 @@ +{#- + Modifications to original chat template: + * Whenever template renders tool_call arguments, those are converted from string to dictlike format using from_json +#} +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('') and content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- if loop.index0 > ns.last_query_index %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} + {%- else %} + {{- '\n\n\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {#- Patch: Convert to JSON before calling items, OVMS provides arguments as string #} + {%- for args_name, args_value in tool_call.arguments|from_json|items %} + {{- '\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/extras/chat_template_examples/chat_template_qwen36.jinja b/extras/chat_template_examples/chat_template_qwen36.jinja index 7cc994bb05..fece90ac2f 100644 --- a/extras/chat_template_examples/chat_template_qwen36.jinja +++ b/extras/chat_template_examples/chat_template_qwen36.jinja @@ -3,7 +3,9 @@ * Expanded tool_call args serialization to handle booleans/None correctly — Jinja's tojson outputs JSON (true/false/null) but the model expects Python literals (True/False/None), so we patch them explicitly. -#}{%- set image_count = namespace(value=0) %} + * Whenever template renders tool_call arguments, those are converted from string to dictlike format using from_json +#} +{%- set image_count = namespace(value=0) %} {%- set video_count = namespace(value=0) %} {%- macro render_content(content, do_vision_count, is_system_content=false) %} {%- if content is string %} @@ -121,7 +123,8 @@ {{- '\n\n\n' }} {%- endif %} {%- if tool_call.arguments is defined %} - {%- for args_name, args_value in tool_call.arguments|items %} + {#- Patch: Convert to JSON before calling items, OVMS provides arguments as string #} + {%- for args_name, args_value in tool_call.arguments|from_json|items %} {{- '\n' }} {#- BEGINNING OF PATCH #} {#- {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %} #} @@ -173,4 +176,4 @@ {%- else %} {{- '\n' }} {%- endif %} -{%- endif %} +{%- endif %} \ No newline at end of file diff --git a/src/llm/servable_initializer.cpp b/src/llm/servable_initializer.cpp index 68913ee66e..11d796b3e0 100644 --- a/src/llm/servable_initializer.cpp +++ b/src/llm/servable_initializer.cpp @@ -150,14 +150,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptrtokenizer.get_original_chat_template(); std::string bosToken = properties->tokenizer.get_bos_token(); std::string eosToken = properties->tokenizer.get_eos_token(); - if (bosToken.empty()) { - SPDLOG_ERROR("BOS token was not found in model files."); - return; - } - if (eosToken.empty()) { - SPDLOG_ERROR("EOS token was not found in model files."); - return; - } if (chatTemplate.empty()) { SPDLOG_ERROR("Chat template was not found in model files."); return; diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 13b7e73a62..ce7e7a6928 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -22,9 +22,16 @@ #include #include +#include "src/port/rapidjson_document.hpp" +#include "src/port/rapidjson_stringbuffer.hpp" +#include "src/port/rapidjson_writer.hpp" + #include "../../../logging.hpp" -#include "../../text_utils.hpp" #include "../../../tokenize/tokenize_parser.hpp" +#include "../../text_utils.hpp" +#if (PYTHON_DISABLE == 0) +#include "../../py_jinja_template_processor.hpp" +#endif namespace ovms { @@ -94,6 +101,40 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrapiHandler->getProcessedJson().size() > 0) { + jsonForTemplate = vlmExecutionContext->apiHandler->getProcessedJson(); + } else { + jsonForTemplate = vlmExecutionContext->payload.body; + } + // Inject image tags into the JSON messages for Python Jinja template processing + if (!imageTags.empty()) { + rapidjson::Document jsonDoc; + jsonDoc.Parse(jsonForTemplate.c_str()); + if (!jsonDoc.HasParseError() && jsonDoc.IsObject() && jsonDoc.HasMember("messages") && jsonDoc["messages"].IsArray()) { + auto& messages = jsonDoc["messages"]; + for (const auto& [chatTurnIndex, imageTagString] : imageTags) { + if (chatTurnIndex < messages.Size()) { + auto& msg = messages[chatTurnIndex]; + if (msg.IsObject() && msg.HasMember("content") && msg["content"].IsString()) { + std::string newContent = imageTagString + msg["content"].GetString(); + msg["content"].SetString(newContent.c_str(), newContent.length(), jsonDoc.GetAllocator()); + } + } + } + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + jsonDoc.Accept(writer); + jsonForTemplate = buffer.GetString(); + } + } + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "VLM CB: Applying chat template using Python Jinja processor"); + bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, jsonForTemplate, vlmExecutionContext->inputText); + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText); + } +#else constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); if (!toolsStatus.ok()) { @@ -113,7 +154,16 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrto_json_string() : std::string("")); SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", addGenerationPrompt); } - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + try { + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); + return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + } +#endif + if (vlmExecutionContext->inputText.empty()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); } diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 033cb8641d..65bd99ae97 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -34,6 +34,10 @@ #pragma GCC diagnostic pop #pragma warning(pop) +#include "src/port/rapidjson_document.hpp" +#include "src/port/rapidjson_stringbuffer.hpp" +#include "src/port/rapidjson_writer.hpp" + #include "../../../config.hpp" #include "../../../http_payload.hpp" #include "../../../mediapipe_internal/mediapipe_utils.hpp" @@ -297,6 +301,40 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrapiHandler->getProcessedJson().size() > 0) { + jsonForTemplate = vlmExecutionContext->apiHandler->getProcessedJson(); + } else { + jsonForTemplate = vlmExecutionContext->payload.body; + } + // Inject image tags into the JSON messages for Python Jinja template processing + if (!imageTags.empty()) { + rapidjson::Document jsonDoc; + jsonDoc.Parse(jsonForTemplate.c_str()); + if (!jsonDoc.HasParseError() && jsonDoc.IsObject() && jsonDoc.HasMember("messages") && jsonDoc["messages"].IsArray()) { + auto& messages = jsonDoc["messages"]; + for (const auto& [chatTurnIndex, imageTagString] : imageTags) { + if (chatTurnIndex < messages.Size()) { + auto& msg = messages[chatTurnIndex]; + if (msg.IsObject() && msg.HasMember("content") && msg["content"].IsString()) { + std::string newContent = imageTagString + msg["content"].GetString(); + msg["content"].SetString(newContent.c_str(), newContent.length(), jsonDoc.GetAllocator()); + } + } + } + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + jsonDoc.Accept(writer); + jsonForTemplate = buffer.GetString(); + } + } + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "VLM Legacy: Applying chat template using Python Jinja processor"); + bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, jsonForTemplate, vlmExecutionContext->inputText); + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText); + } +#else constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); if (!toolsStatus.ok()) { @@ -308,7 +346,16 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + try { + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); + return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + } +#endif + if (vlmExecutionContext->inputText.empty()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); }