diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index d1f491375e..9479818eaa 100644 --- a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -16,7 +16,6 @@ With the rise of AI PC capabilities, hosting own Visual Studio code assistant is :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov ```bat mkdir c:\models -set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 19GB+ of VRAM on the GPU. @@ -26,7 +25,6 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov ```bat mkdir c:\models -set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 34GB+ of VRAM on the GPU. 
@@ -66,7 +64,7 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-8B-int4-cw- :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov ```bash mkdir -p models -docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ +docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` @@ -77,7 +75,7 @@ docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u): :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov ```bash mkdir -p models -docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ +docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md index 79576dc503..ab8c7be951 100644 --- a/demos/continuous_batching/README.md +++ b/demos/continuous_batching/README.md @@ -35,7 +35,7 @@ That makes it easy to use and efficient especially on on Intel® Xeon® processo Running this command starts the container with CPU only target device: ```bash mkdir -p ${HOME}/models -docker 
run -it -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov +docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` > **Note:** In case you want to use GPU target device, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command. The parameter `--target_device` should be also updated to `GPU`. @@ -46,7 +46,6 @@ to `docker run` command. 
The parameter `--target_device` should be also updated After ovms is installed according to steps from [baremetal deployment guide](../../docs/deploying_server_baremetal.md), run the following command: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --model_repository_path c:\models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device GPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index e2d831019e..50d54c7698 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -102,7 +102,6 @@ The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels lik :sync: Qwen3-30B-A3B-Instruct-2507 Pull and start OVMS: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache ``` @@ -254,7 +253,7 @@ The current weather in Tokyo is as follows: The sky is mostly covered with cloud Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation ``` @@ -371,7 +370,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri 
--group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true ``` diff --git a/src/server.cpp b/src/server.cpp index eb9c223a05..0b06597994 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -505,6 +505,14 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe OvmsExitGuard exitStatusGuard(*this); installSignalHandlers(); int result = OVMS_EX_OK; + // This is a workaround for a concurrency handling issue on iGPU with Qwen3-MoE models; it is expected to be fixed in 2026.2. + if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { +#ifdef _WIN32 + _putenv_s("MOE_USE_MICRO_GEMM_PREFILL", "0"); +#else + setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); +#endif + } try { Status ret = startFromSettings(&serverSettings, &modelsSettings);