From e7e25fb417b2615834da038a0ef03835077f2a49 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 10:59:10 +0100 Subject: [PATCH 1/4] WA for Qwen3 MoE concurrency handling --- src/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index eb9c223a05..655a0397c9 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -530,6 +530,10 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe // OVMS Start int Server::start(int argc, char** argv) { + // This is a WA for a concurrency handling issue in iGPU. It is expected to be fixed in 2026.2 + if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { + setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); + } auto paramsOrExit = parseArgs(argc, argv); // Check for error in parsing if (std::holds_alternative>(paramsOrExit)) { From d04022a07ba28eae5939de47c6895a71a602363a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 14:59:55 +0100 Subject: [PATCH 2/4] Fix for Windows --- src/server.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 655a0397c9..2b2b934b26 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -72,6 +72,7 @@ #if (PYTHON_DISABLE == 0) #include "python/pythoninterpretermodule.hpp" #endif +#include <cstdlib> using grpc::ServerBuilder; @@ -505,6 +506,14 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe OvmsExitGuard exitStatusGuard(*this); installSignalHandlers(); int result = OVMS_EX_OK;
+ // This is a WA for a concurrency handling issue in iGPU for qwen3-MOE models. It is expected to be fixed in 2026.2 + if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { +#ifdef _WIN32 + _putenv_s("MOE_USE_MICRO_GEMM_PREFILL", "0"); +#else + setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); +#endif + } try { Status ret = startFromSettings(&serverSettings, &modelsSettings); @@ -530,10 +539,6 @@ int Server::startServerFromSettings(ServerSettingsImpl& serverSettings, ModelsSe // OVMS Start int Server::start(int argc, char** argv) { - // This is a WA for a concurrency handling issue in iGPU. It is expected to be fixed in 2026.2 - if (getenv("MOE_USE_MICRO_GEMM_PREFILL") == nullptr) { - setenv("MOE_USE_MICRO_GEMM_PREFILL", "0", 0); - } auto paramsOrExit = parseArgs(argc, argv); // Check for error in parsing if (std::holds_alternative>(paramsOrExit)) { From ad5084d8fd4c5619c1a12bf746ea5df2260e7ebf Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 15:26:44 +0100 Subject: [PATCH 3/4] Fix for Windows --- src/server.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 2b2b934b26..0b06597994 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -72,7 +72,6 @@ #if (PYTHON_DISABLE == 0) #include "python/pythoninterpretermodule.hpp" #endif -#include <cstdlib> using grpc::ServerBuilder; From 97bccdc4b800e85d3d554c1134ffca9d4a9bcccf Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Wed, 18 Mar 2026 16:00:31 +0100 Subject: [PATCH 4/4] README updates --- demos/code_local_assistant/README.md | 6 ++---- demos/continuous_batching/README.md | 3 +-- demos/continuous_batching/agentic_ai/README.md | 5 ++--- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md index d1f491375e..9479818eaa 100644 --- a/demos/code_local_assistant/README.md +++ b/demos/code_local_assistant/README.md @@ -16,7 +16,6 @@ With the rise of AI PC capabilities, hosting own Visual Studio code assistant is :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov ```bat
mkdir c:\models -set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 19GB+ of VRAM on the GPU. @@ -26,7 +25,6 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov ```bat mkdir c:\models -set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct ``` > **Note:** For deployment, the model requires ~16GB disk space and recommended 34GB+ of VRAM on the GPU. 
@@ -66,7 +64,7 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-8B-int4-cw- :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov ```bash mkdir -p models -docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ +docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` @@ -77,7 +75,7 @@ docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u): :sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov ```bash mkdir -p models -docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ +docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \ openvino/model_server:weekly \ --model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct ``` diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md index 79576dc503..ab8c7be951 100644 --- a/demos/continuous_batching/README.md +++ b/demos/continuous_batching/README.md @@ -35,7 +35,7 @@ That makes it easy to use and efficient especially on on Intel® Xeon® processo Running this command starts the container with CPU only target device: ```bash mkdir -p ${HOME}/models -docker 
run -it -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov +docker run -it -p 8000:8000 --rm --user $(id -u):$(id -g) -v ${HOME}/models:/models/:rw openvino/model_server:weekly --model_repository_path /models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device CPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` > **Note:** In case you want to use GPU target device, add extra docker parameters `--device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1)` to `docker run` command. The parameter `--target_device` should be also updated to `GPU`. @@ -46,7 +46,6 @@ to `docker run` command. 
The parameter `--target_device` should be also updated After ovms is installed according to steps from [baremetal deployment guide](../../docs/deploying_server_baremetal.md), run the following command: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --model_repository_path c:\models --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --task text_generation --target_device GPU --tool_parser hermes3 --rest_port 8000 --model_name Qwen3-30B-A3B-Instruct-2507-int4-ov ``` diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index e2d831019e..50d54c7698 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -102,7 +102,6 @@ The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels lik :sync: Qwen3-30B-A3B-Instruct-2507 Pull and start OVMS: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache ``` @@ -254,7 +253,7 @@ The current weather in Tokyo is as follows: The sky is mostly covered with cloud Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation ``` @@ -371,7 +370,7 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik Pull and start OVMS: ```bash mkdir -p ${HOME}/models -docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri 
--group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true ```