From ce69cf2df2976ac7f9e79ab257dcae6e59d56b61 Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 2 Mar 2026 14:21:50 +0100 Subject: [PATCH 01/13] test --- demos/continuous_batching/agentic_ai/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/continuous_batching/agentic_ai/requirements.txt b/demos/continuous_batching/agentic_ai/requirements.txt index 5147552b66..09560739b5 100644 --- a/demos/continuous_batching/agentic_ai/requirements.txt +++ b/demos/continuous_batching/agentic_ai/requirements.txt @@ -1,4 +1,4 @@ -openai-agents==0.2.11 -openai==1.107.0 +openai-agents +openai python-dateutil mcp_weather_server \ No newline at end of file From ce36006bcde3c3377cd747408bfe6fb2526e58f8 Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 13 Mar 2026 10:17:29 +0100 Subject: [PATCH 02/13] save --- demos/vlm_npu/generation_config.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 demos/vlm_npu/generation_config.json diff --git a/demos/vlm_npu/generation_config.json b/demos/vlm_npu/generation_config.json new file mode 100644 index 0000000000..febad2198f --- /dev/null +++ b/demos/vlm_npu/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 32000, + "pad_token_id": 32000, + "transformers_version": "4.53.3" +} \ No newline at end of file From f40743ef225708a8be0498d142b40bb854f18429 Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 13 Mar 2026 10:31:35 +0100 Subject: [PATCH 03/13] fix --- demos/vlm_npu/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index 96736c80dc..9abcac4c73 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -37,11 +37,22 @@ Run `export_model.py` script to download and quantize the model: > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. 
+ **LLM** ```console python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --config_file_path models/config.json --model_repository_path models --overwrite_models ``` +Copy corrected config to the model directory. +Linux: +```bash +cp generation_config.json models/microsoft/Phi-3.5-vision-instruct +``` +Windows: +```bat +copy generation_config.json models\microsoft\Phi-3.5-vision-instruct +``` + Note that by default, NPU sets limitation on the prompt length (which in VLM also include image tokens) to 1024 tokens. You can modify that limit by using `--max_prompt_len` parameter. > **Note:** You can change the model used in the demo out of any topology [tested](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) with OpenVINO. From 4f1a489fc46098e04f6ada44a55df9bea99a2b60 Mon Sep 17 00:00:00 2001 From: Pawel Date: Fri, 13 Mar 2026 10:50:53 +0100 Subject: [PATCH 04/13] fix --- demos/vlm_npu/README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index 9abcac4c73..f6dbe2ffe7 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -41,16 +41,7 @@ Run `export_model.py` script to download and quantize the model: **LLM** ```console python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --config_file_path models/config.json --model_repository_path models --overwrite_models -``` - -Copy corrected config to the model directory. 
-Linux: -```bash -cp generation_config.json models/microsoft/Phi-3.5-vision-instruct -``` -Windows: -```bat -copy generation_config.json models\microsoft\Phi-3.5-vision-instruct +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/przepeck/unicode_python_fix/demos/vlm_npu/generation_config.json -o models/microsoft/Phi-3.5-vision-instruct/generation_config.json ``` Note that by default, NPU sets limitation on the prompt length (which in VLM also include image tokens) to 1024 tokens. You can modify that limit by using `--max_prompt_len` parameter. From f4c2f169882320294f19354be5d9da68fef5b374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Mon, 16 Mar 2026 13:10:27 +0100 Subject: [PATCH 05/13] change branch to main Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- demos/vlm_npu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index f6dbe2ffe7..6bdae23a95 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -41,7 +41,7 @@ Run `export_model.py` script to download and quantize the model: **LLM** ```console python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --config_file_path models/config.json --model_repository_path models --overwrite_models -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/przepeck/unicode_python_fix/demos/vlm_npu/generation_config.json -o models/microsoft/Phi-3.5-vision-instruct/generation_config.json +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/vlm_npu/generation_config.json -o models/microsoft/Phi-3.5-vision-instruct/generation_config.json ``` Note that by default, NPU sets limitation on the prompt length (which in VLM also include image tokens) to 1024 tokens. You can modify that limit by using `--max_prompt_len` parameter. 
From cd0b3250383cdcff7ad4d24aa7c2fef4a27baaf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Mon, 16 Mar 2026 13:11:41 +0100 Subject: [PATCH 06/13] Reset requirements --- demos/continuous_batching/agentic_ai/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demos/continuous_batching/agentic_ai/requirements.txt b/demos/continuous_batching/agentic_ai/requirements.txt index 09560739b5..049d128853 100644 --- a/demos/continuous_batching/agentic_ai/requirements.txt +++ b/demos/continuous_batching/agentic_ai/requirements.txt @@ -1,4 +1,4 @@ -openai-agents -openai +openai-agents==0.2.11 +openai==1.107.0 python-dateutil -mcp_weather_server \ No newline at end of file +mcp_weather_server From a369559278dc97476eda7001dee46f303bee2916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Mon, 16 Mar 2026 13:12:06 +0100 Subject: [PATCH 07/13] Reset requirements.txt v2 From a7dfc85d61a7a17d48856998438fb45a727a0293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Mon, 16 Mar 2026 13:12:51 +0100 Subject: [PATCH 08/13] Update requirements.txt From 13fea1830b226371f70094135787d813c431b2e8 Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 16 Mar 2026 13:28:04 +0100 Subject: [PATCH 09/13] reset requirements --- demos/continuous_batching/agentic_ai/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/continuous_batching/agentic_ai/requirements.txt b/demos/continuous_batching/agentic_ai/requirements.txt index 049d128853..5147552b66 100644 --- a/demos/continuous_batching/agentic_ai/requirements.txt +++ b/demos/continuous_batching/agentic_ai/requirements.txt @@ -1,4 +1,4 @@ openai-agents==0.2.11 openai==1.107.0 python-dateutil -mcp_weather_server +mcp_weather_server \ No newline at end of file From 6487712d887cb056a60e42c14c687c9dfaf5462f Mon Sep 17 00:00:00 2001 From: Pawel Date: Tue, 17 Mar 2026 11:21:55 +0100 Subject: [PATCH 10/13] public model used 
--- demos/vlm_npu/README.md | 103 +++++++-------------------- demos/vlm_npu/generation_config.json | 7 -- 2 files changed, 26 insertions(+), 84 deletions(-) delete mode 100644 demos/vlm_npu/generation_config.json diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index 6bdae23a95..eedfe53229 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -13,60 +13,17 @@ It is targeted on client machines equipped with NPU accelerator. **OVMS 2025.1 or higher** -**Model preparation**: Python 3.9 or higher with pip and HuggingFace account - **Model Server deployment**: Installed Docker Engine or OVMS binary package according to the [baremetal deployment guide](../../docs/deploying_server_baremetal.md) **(Optional) Client**: git and Python for using OpenAI client package and vLLM benchmark app - -## Model preparation -Here, the original Pytorch LLM model and the tokenizer will be converted to IR format and optionally quantized. -That ensures faster initialization time, better performance and lower memory consumption. -LLM engine parameters will be defined inside the `graph.pbtxt` file. - -Download export script, install it's dependencies and create directory for the models: -```console -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -pip3 install transformers==4.53.3 --force-reinstall -mkdir models -``` - -Run `export_model.py` script to download and quantize the model: - -> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub. 
- - -**LLM** -```console -python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --config_file_path models/config.json --model_repository_path models --overwrite_models -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/vlm_npu/generation_config.json -o models/microsoft/Phi-3.5-vision-instruct/generation_config.json -``` - Note that by default, NPU sets limitation on the prompt length (which in VLM also include image tokens) to 1024 tokens. You can modify that limit by using `--max_prompt_len` parameter. > **Note:** You can change the model used in the demo out of any topology [tested](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) with OpenVINO. -You should have a model folder like below: -``` -tree models -models -├── config.json -└── microsoft - └── Phi-3.5-vision-instruct - ├── config.json - ├── generation_config.json - ├── graph.pbtxt - ├── openvino_detokenizer.bin - ├── openvino_detokenizer.xml - ├── openvino_model.bin - ├── openvino_model.xml - ├── openvino_tokenizer.bin - ├── openvino_tokenizer.xml - ├── special_tokens_map.json - ├── tokenizer_config.json - └── tokenizer.json +Create directory for the model: +```console +mkdir -p models ``` The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [LLM calculator documentation](../../docs/llm/reference.md) to learn more about configuration options. 
@@ -78,8 +35,7 @@ The default configuration should work in most cases but the parameters can be tu Running this command starts the container with NPU enabled: ```bash -docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ --p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json +docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-int8-ov --task text_generation --target_device NPU ``` ::: @@ -92,10 +48,8 @@ Assuming you have unpacked model server package, make sure to: as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. -Depending on how you prepared models in the first step of this demo, they are deployed to either CPU or GPU (it's defined in `config.json`). If you run on GPU make sure to have appropriate drivers installed, so the device is accessible for the model server. - ```bat -ovms --rest_port 8000 --config_path ./models/config.json +ovms --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-int8-ov --task text_generation --target_device NPU ``` ::: @@ -103,22 +57,19 @@ ovms --rest_port 8000 --config_path ./models/config.json Wait for the model to load. 
You can check the status with a simple command: ```console -curl http://localhost:8000/v1/config +curl http://localhost:8000/v3/models ``` ```json { - "microsoft/Phi-3.5-vision-instruct": { - "model_version_status": [ - { - "version": "1", - "state": "AVAILABLE", - "status": { - "error_code": "OK", - "error_message": "OK" - } - } - ] + "object": "list", + "data": [ + { + "id": "OpenVINO/Phi-3.5-vision-instruct-int8-ov", + "object": "model", + "created": 1773742559, + "owned_by": "OVMS" } + ] } ``` @@ -134,12 +85,11 @@ curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/m Referring to local filesystem images in requests requires passing additional parameter `--allowed_local_media_path` (described in [Model Server Parameters](../../docs/parameters.md) section) when starting docker container: ```bash -docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ --p 8000:8000 -v $(pwd)/models:/workspace:ro -v $(pwd):/images:ro openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/config.json --allowed_local_media_path /images +docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-int8-ov --task text_generation --target_device NPU --allowed_local_media_path /images ``` ```bash -curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d "{ \"model\": \"microsoft/Phi-3.5-vision-instruct\", \"messages\":[{\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"Describe what is one the picture.\"},{\"type\": \"image_url\", \"image_url\": {\"url\": \"/images/zebra.jpeg\"}}]}], \"max_completion_tokens\": 100}" +curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d "{ 
\"model\": \"OpenVINO/Phi-3.5-vision-instruct-int8-ov\", \"messages\":[{\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"Describe what is one the picture.\"},{\"type\": \"image_url\", \"image_url\": {\"url\": \"/images/zebra.jpeg\"}}]}], \"max_completion_tokens\": 100}" ``` ```json { @@ -155,7 +105,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js } ], "created": 1741731554, - "model": "microsoft/Phi-3.5-vision-instruct", + "model": "OpenVINO/Phi-3.5-vision-instruct-int8-ov", "object": "chat.completion", "usage": { "prompt_tokens": 19, @@ -172,7 +122,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js import requests import base64 base_url='http://127.0.0.1:8000/v3' -model_name = "microsoft/Phi-3.5-vision-instruct" +model_name = "OpenVINO/Phi-3.5-vision-instruct-int8-ov" def convert_image(Image): with open(Image,'rb' ) as file: @@ -203,20 +153,19 @@ print(response.text) { "finish_reason": "stop", "index": 0, - "logprobs": null, "message": { - "content": "The picture features a zebra standing in a grassy plain. Zebras are known for their distinctive black and white striped patterns, which help them blend in for camouflage purposes. The zebra pictured is standing on a green field with patches of grass, indicating it may be in its natural habitat. Zebras are typically social animals and are often found in savannahs and grasslands.", + "content": "The picture features a single zebra standing in a grassy field with well-defined black and white stripes, distinctive facial markings, and a mane that is black at the base tapering into white at the tips. 
The zebra pays no attention to the camera, and it is likely identified as a horse due to its body size and the visible horns that resemble small antlers, which could indicate a moment of embarrassment or enduring a little pr", "role": "assistant" } } ], - "created": 1741731554, - "model": "microsoft/Phi-3.5-vision-instruct", + "created": 1773738822, + "model": "OpenVINO/Phi-3.5-vision-instruct-int8-ov", "object": "chat.completion", "usage": { - "prompt_tokens": 19, - "completion_tokens": 83, - "total_tokens": 102 + "prompt_tokens": 26, + "completion_tokens": 100, + "total_tokens": 126 } } ``` @@ -237,7 +186,7 @@ pip3 install openai from openai import OpenAI import base64 base_url='http://localhost:8000/v3' -model_name = "microsoft/Phi-3.5-vision-instruct" +model_name = "OpenVINO/Phi-3.5-vision-instruct-int8-ov" client = OpenAI(api_key='unused', base_url=base_url) @@ -266,7 +215,7 @@ for chunk in stream: Output: ``` -The picture features a zebra standing in a grassy area. The zebra is characterized by its distinctive black and white striped pattern, which covers its entire body, including its legs, neck, and head. Zebras have small, rounded ears and a long, flowing tail. The background appears to be a natural grassy habitat, typical of a savanna or plain. +The picture features a single zebra standing in a grassy field with well-defined black and white stripes, distinctive facial markings, and a mane that is black at the base tapering into white at the tips. The zebra pays no attention to the camera, and it is likely identified as a horse due to its body size and the visible horns that resemble small antlers, which could indicate a moment of embarrassment or enduring a little prank. Its eyes are black, and its ears are partially open, showing some interest in its surroundings. Its tail is black with a white stripe and a black tip at its end. The zebra appears to be well-fed and healthy, walking between the lush green grasses and a small patch of yellow flowers. 
Overall, the scene captured is both peaceful and candid, with the zebra immersed in its natural habitat. ``` ::: diff --git a/demos/vlm_npu/generation_config.json b/demos/vlm_npu/generation_config.json deleted file mode 100644 index febad2198f..0000000000 --- a/demos/vlm_npu/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 1, - "eos_token_id": 32000, - "pad_token_id": 32000, - "transformers_version": "4.53.3" -} \ No newline at end of file From 5e184807a88105846b35714cb26568687c4c3887 Mon Sep 17 00:00:00 2001 From: Pawel Date: Tue, 17 Mar 2026 11:37:57 +0100 Subject: [PATCH 11/13] fix --- demos/vlm_npu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index eedfe53229..7483c0a45e 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -231,7 +231,7 @@ cd vllm pip3 install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu cd benchmarks curl -L https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -o ShareGPT_V3_unfiltered_cleaned_split.json # sample dataset -python benchmark_serving.py --backend openai-chat --dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model microsoft/Phi-3.5-vision-instruct --endpoint /v3/chat/completions --num-prompts 10 --trust-remote-code --max-concurrency 1 +python benchmark_serving.py --backend openai-chat --dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model OpenVINO/Phi-3.5-vision-instruct-int8-ov --endpoint /v3/chat/completions --num-prompts 10 --trust-remote-code --max-concurrency 1 ``` From 84412260a3c3348ee67f9f37b06ff009807086f6 Mon Sep 17 00:00:00 2001 From: Pawel Date: Tue, 17 Mar 2026 11:57:10 +0100 Subject: [PATCH 12/13] changing model -> fp16 --- 
demos/vlm_npu/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index 7483c0a45e..cd23f061fd 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -35,7 +35,7 @@ The default configuration should work in most cases but the parameters can be tu Running this command starts the container with NPU enabled: ```bash -docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-int8-ov --task text_generation --target_device NPU +docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-fp16-ov --task text_generation --target_device NPU ``` ::: @@ -49,7 +49,7 @@ Assuming you have unpacked model server package, make sure to: as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. 
```bat -ovms --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-int8-ov --task text_generation --target_device NPU +ovms --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-fp16-ov --task text_generation --target_device NPU ``` ::: @@ -64,7 +64,7 @@ curl http://localhost:8000/v3/models "object": "list", "data": [ { - "id": "OpenVINO/Phi-3.5-vision-instruct-int8-ov", + "id": "OpenVINO/Phi-3.5-vision-instruct-fp16-ov", "object": "model", "created": 1773742559, "owned_by": "OVMS" @@ -85,11 +85,11 @@ curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/m Referring to local filesystem images in requests requires passing additional parameter `--allowed_local_media_path` (described in [Model Server Parameters](../../docs/parameters.md) section) when starting docker container: ```bash -docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-int8-ov --task text_generation --target_device NPU --allowed_local_media_path /images +docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) -p 8000:8000 -v $(pwd)/models:/models:rw -v $(pwd):/images:ro openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-fp16-ov --task text_generation --target_device NPU --allowed_local_media_path /images ``` ```bash -curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/Phi-3.5-vision-instruct-int8-ov\", \"messages\":[{\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"Describe what is one the picture.\"},{\"type\": \"image_url\", \"image_url\": {\"url\": 
\"/images/zebra.jpeg\"}}]}], \"max_completion_tokens\": 100}" +curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/Phi-3.5-vision-instruct-fp16-ov\", \"messages\":[{\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"Describe what is on the picture.\"},{\"type\": \"image_url\", \"image_url\": {\"url\": \"/images/zebra.jpeg\"}}]}], \"max_completion_tokens\": 100}" ``` ```json { @@ -105,7 +105,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js } ], "created": 1741731554, - "model": "OpenVINO/Phi-3.5-vision-instruct-int8-ov", + "model": "OpenVINO/Phi-3.5-vision-instruct-fp16-ov", "object": "chat.completion", "usage": { "prompt_tokens": 19, @@ -122,7 +122,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js import requests import base64 base_url='http://127.0.0.1:8000/v3' -model_name = "OpenVINO/Phi-3.5-vision-instruct-int8-ov" +model_name = "OpenVINO/Phi-3.5-vision-instruct-fp16-ov" def convert_image(Image): with open(Image,'rb' ) as file: @@ -160,7 +160,7 @@ print(response.text) } ], "created": 1773738822, - "model": "OpenVINO/Phi-3.5-vision-instruct-int8-ov", + "model": "OpenVINO/Phi-3.5-vision-instruct-fp16-ov", "object": "chat.completion", "usage": { "prompt_tokens": 26, @@ -186,7 +186,7 @@ pip3 install openai from openai import OpenAI import base64 base_url='http://localhost:8000/v3' -model_name = "OpenVINO/Phi-3.5-vision-instruct-int8-ov" +model_name = "OpenVINO/Phi-3.5-vision-instruct-fp16-ov" client = OpenAI(api_key='unused', base_url=base_url) @@ -231,7 +231,7 @@ cd vllm pip3 install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu cd benchmarks curl -L https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -o ShareGPT_V3_unfiltered_cleaned_split.json # sample dataset -python benchmark_serving.py --backend openai-chat 
--dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model OpenVINO/Phi-3.5-vision-instruct-int8-ov --endpoint /v3/chat/completions --num-prompts 10 --trust-remote-code --max-concurrency 1 +python benchmark_serving.py --backend openai-chat --dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model OpenVINO/Phi-3.5-vision-instruct-fp16-ov --endpoint /v3/chat/completions --num-prompts 10 --trust-remote-code --max-concurrency 1 ``` From 3cf63266e34bd26e7cd14b7cf69479663161223d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Rzepecki?= Date: Tue, 17 Mar 2026 12:16:20 +0100 Subject: [PATCH 13/13] fix --- demos/vlm_npu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/vlm_npu/README.md b/demos/vlm_npu/README.md index cd23f061fd..a9e3c62058 100644 --- a/demos/vlm_npu/README.md +++ b/demos/vlm_npu/README.md @@ -49,7 +49,7 @@ Assuming you have unpacked model server package, make sure to: as mentioned in [deployment guide](../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. ```bat -ovms --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-3.5-vision-instruct-fp16-ov --task text_generation --target_device NPU +ovms --rest_port 8000 --model_repository_path models --source_model OpenVINO/Phi-3.5-vision-instruct-fp16-ov --task text_generation --target_device NPU ``` :::