From e7e25fb417b2615834da038a0ef03835077f2a49 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 10:59:10 +0100 Subject: [PATCH 1/4] WA for Qwen3 MoE concurrency handling --- src/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index eb9c223a05..655a0397c9 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -530,6 +530,10 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe // OVMS Start int Server::start(int argc, char** argv) { + // This is a WA for a concurrency handling issue in iGPU. It is expected to be fixed in 2026.2 + if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { + setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); + } auto paramsOrExit = parseArgs(argc, argv); // Check for error in parsing if (std::holds_alternative>(paramsOrExit)) { From d04022a07ba28eae5939de47c6895a71a602363a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 14:59:55 +0100 Subject: [PATCH 2/4] Fix for Windows --- src/server.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 655a0397c9..2b2b934b26 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -72,6 +72,7 @@ #if (PYTHON_DISABLE == 0) #include "python/pythoninterpretermodule.hpp" #endif +#include <cstdlib> using grpc::ServerBuilder; @@ -505,6 +506,14 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe OvmsExitGuard exitStatusGuard(*this); installSignalHandlers(); int result = OVMS_EX_OK;
+ // This is a WA for a concurrency handling issue in iGPU for qwen3-MOE models. It is expected to be fixed in 2026.2 + if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { +#ifdef _WIN32 + _putenv_s("MOE_USE_MICRO_GEMM_PREFILL", "0"); +#else + setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); +#endif + } try { Status ret = startFromSettings(&serverSettings, &modelsSettings); @@ -530,10 +539,6 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe // OVMS Start int Server::start(int argc, char** argv) { - // This is a WA for a concurrency handling issue in iGPU. It is expected to be fixed in 2026.2 - if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { - setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); - } auto paramsOrExit = parseArgs(argc, argv); // Check for error in parsing if (std::holds_alternative>(paramsOrExit)) { From ad5084d8fd4c5619c1a12bf746ea5df2260e7ebf Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 15:26:44 +0100 Subject: [PATCH 3/4] Fix for Windows --- src/server.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 2b2b934b26..0b06597994 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -72,7 +72,6 @@ #if (PYTHON_DISABLE == 0) #include "python/pythoninterpretermodule.hpp" #endif -#include <cstdlib> using grpc::ServerBuilder; From 97bccdc4b800e85d3d554c1134ffca9d4a9bcccf Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 16:00:31 +0100 Subject: [PATCH 4/4] README updates --- demos/code_local_assistant/README.md | 6 ++---- demos/continuous_batching/README.md | 3 +-- demos/continuous_batching/agentic_ai/README.md | 5 ++--- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index d1f491375e..9479818eaa 100644 --- a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -16,7 +16,6 @@ With the rise of AI PC capabilities, hosting own Visual Studio code assistant is :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov ```bat
mkdir c:\models -set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 19GB+ of VRAM on the GPU. @@ -26,7 +25,6 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov ```bat mkdir c:\models -set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 34GB+ of VRAM on the GPU. 
@@ -66,7 +64,7 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-8B-int4-cw- :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov ```bash mkdir -p models -docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ +docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` @@ -77,7 +75,7 @@ docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u): :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov ```bash mkdir -p models -docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ +docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md index 79576dc503..ab8c7be951 100644 --- a/demos/continuous_batching/README.md +++ b/demos/continuous_batching/README.md @@ -35,7 +35,7 @@ That makes it easy to use and efficient especially on on Intel® Xeon® processo Running this command starts the container with CPU only target device: ```bash mkdir -p ${HOME}/models -docker 
run -it -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov +docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` > **Note:** In case you want to use GPU target device, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command. The parameter `--target_device` should be also updated to `GPU`. @@ -46,7 +46,6 @@ to `docker run` command. 
The parameter `--target_device` should be also updated After ovms is installed according to steps from [baremetal deployment guide](../../docs/deploying_server_baremetal.md), run the following command: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --model_repository_path c:\models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device GPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index e2d831019e..50d54c7698 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -102,7 +102,6 @@ The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels lik :sync: Qwen3-30B-A3B-Instruct-2507 Pull and start OVMS: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache ``` @@ -254,7 +253,7 @@ The current weather in Tokyo is as follows: The sky is mostly covered with cloud Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation ``` @@ -371,7 +370,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri 
--group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true ```