6 changes: 2 additions & 4 deletions demos/code_local_assistant/README.md
@@ -16,7 +16,6 @@ With the rise of AI PC capabilities, hosting own Visual Studio code assistant is
:sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov
```bat
mkdir c:\models
-set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context
ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** For deployment, the model requires ~16GB disk space and recommended 19GB+ of VRAM on the GPU.
@@ -26,7 +25,6 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A
:sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov
```bat
mkdir c:\models
-set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context
ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** For deployment, the model requires ~16GB disk space and recommended 34GB+ of VRAM on the GPU.
@@ -66,7 +64,7 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-8B-int4-cw-
:sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov
```bash
mkdir -p models
-docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
+docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
openvino/model_server:weekly \
--model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct
```
@@ -77,7 +75,7 @@ docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):
:sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov
```bash
mkdir -p models
-docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
+docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
openvino/model_server:weekly \
--model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct
```
3 changes: 1 addition & 2 deletions demos/continuous_batching/README.md
@@ -35,7 +35,7 @@ That makes it easy to use and efficient especially on on Intel® Xeon® processo
Running this command starts the container with CPU only target device:
```bash
mkdir -p ${HOME}/models
-docker run -it -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov
+docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov
```
> **Note:** In case you want to use GPU target device, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)`
to `docker run` command. The parameter `--target_device` should be also updated to `GPU`.
@@ -46,7 +46,6 @@ to `docker run` command. The parameter `--target_device` should be also updated
After ovms is installed according to steps from [baremetal deployment guide](../../docs/deploying_server_baremetal.md), run the following command:

```bat
-set MOE_USE_MICRO_GEMM_PREFILL=0
ovms.exe --model_repository_path c:\models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device GPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov
```

5 changes: 2 additions & 3 deletions demos/continuous_batching/agentic_ai/README.md
@@ -102,7 +102,6 @@ The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels lik
:sync: Qwen3-30B-A3B-Instruct-2507
Pull and start OVMS:
```bat
-set MOE_USE_MICRO_GEMM_PREFILL=0
ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache
```

@@ -254,7 +253,7 @@ The current weather in Tokyo is as follows: The sky is mostly covered with cloud
Pull and start OVMS:
```bash
mkdir -p ${HOME}/models
-docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \
+docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \
--rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation
```

@@ -371,7 +370,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik
Pull and start OVMS:
```bash
mkdir -p ${HOME}/models
-docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
+docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
--rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true
```

8 changes: 8 additions & 0 deletions src/server.cpp
@@ -505,6 +505,14 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe
OvmsExitGuard exitStatusGuard(*this);
installSignalHandlers();
int result = OVMS_EX_OK;
+    // Workaround for a concurrency handling issue on iGPU with Qwen3 MoE models; expected to be fixed in 2026.2
+    if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) {
+#ifdef _WIN32
+        _putenv_s("MOE_USE_MICRO_GEMM_PREFILL", "0");
+#else
+        setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0);
+#endif
+    }

try {
Status ret = startFromSettings(&serverSettings, &modelsSettings);
Expand Down