From b85d1064da15ba0348c8ddba071a783be50432ad Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Tue, 12 May 2026 16:42:19 +0000 Subject: [PATCH 01/11] Fix for fp16 export in qwen3vl & qwen3vlmoe models Signed-off-by: Dipankar Sarkar --- .../models/qwen3_vl/modeling_qwen3_vl.py | 16 ++++++++++++---- .../qwen3_vl_moe/modeling_qwen3_vl_moe.py | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 06237df6d..1aa8ad555 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -857,10 +857,14 @@ def get_dummy_inputs( vision_inputs = {} lang_inputs = {} - vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32) + vision_inputs["pixel_values"] = torch.zeros( + (inputs_shapes["pixel_values"]), dtype=self.model.config.torch_dtype + ) vision_inputs["image_grid_thw"] = torch.zeros((inputs_shapes["image_grid_thw"]), dtype=torch.int64) lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) - lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32) + lang_inputs["vision_embeds"] = torch.zeros( + (inputs_shapes["vision_embeds"]), dtype=self.model.config.torch_dtype + ) lang_inputs["position_ids"] = ( ( torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) @@ -871,7 +875,9 @@ def get_dummy_inputs( .repeat(4, 1, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) - lang_inputs["deepstack_features"] = torch.zeros((inputs_shapes["deepstack_features"]), dtype=torch.float32) + lang_inputs["deepstack_features"] = torch.zeros( + (inputs_shapes["deepstack_features"]), dtype=self.model.config.torch_dtype + ) # Add data for KV bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE 
@@ -886,7 +892,9 @@ def get_dummy_inputs( lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] for i in range(self.model.config.text_config.num_hidden_layers): for kv in ["key", "value"]: - lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) + lang_inputs["past_key_values"][i].append( + torch.zeros(kv_cache_shape, dtype=self.model.config.torch_dtype) + ) if continuous_batching: lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 81d0da2bc..013b83b89 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -384,6 +384,7 @@ def forward( query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + breakpoint() query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos_cached, sin_cached) past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0 blocking_config = getattr(self, "attn_blocking_config", AttentionBlockingConfig()) @@ -895,10 +896,14 @@ def get_dummy_inputs( vision_inputs = {} lang_inputs = {} - vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32) + vision_inputs["pixel_values"] = torch.zeros( + (inputs_shapes["pixel_values"]), dtype=self.model.config.torch_dtype + ) vision_inputs["image_grid_thw"] = torch.zeros((inputs_shapes["image_grid_thw"]), dtype=torch.int64) lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) - lang_inputs["vision_embeds"] = 
torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32) + lang_inputs["vision_embeds"] = torch.zeros( + (inputs_shapes["vision_embeds"]), dtype=self.model.config.torch_dtype + ) lang_inputs["position_ids"] = ( ( torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) @@ -909,7 +914,9 @@ def get_dummy_inputs( .repeat(4, 1, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) - lang_inputs["deepstack_features"] = torch.zeros((inputs_shapes["deepstack_features"]), dtype=torch.float32) + lang_inputs["deepstack_features"] = torch.zeros( + (inputs_shapes["deepstack_features"]), dtype=self.model.config.torch_dtype + ) # Add data for KV bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE @@ -924,7 +931,9 @@ def get_dummy_inputs( lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] for i in range(self.model.config.text_config.num_hidden_layers): for kv in ["key", "value"]: - lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) + lang_inputs["past_key_values"][i].append( + torch.zeros(kv_cache_shape, dtype=self.model.config.torch_dtype) + ) if continuous_batching: lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) From bb8a9095bd19c37005b928e66bdea072bd922a31 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Tue, 12 May 2026 17:06:12 +0000 Subject: [PATCH 02/11] Cleaning done 1 Signed-off-by: Dipankar Sarkar --- .../transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 013b83b89..57d9be80f 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -384,7 +384,6 @@ def forward( query_states = 
self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - breakpoint() query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos_cached, sin_cached) past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0 blocking_config = getattr(self, "attn_blocking_config", AttentionBlockingConfig()) From 432d0b09a7ca1e24676d4c34c85863bd9d34a220 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Tue, 19 May 2026 07:45:13 +0000 Subject: [PATCH 03/11] Addressing the review changes Signed-off-by: Dipankar Sarkar --- .../transformers/models/qwen3_vl/modeling_qwen3_vl.py | 8 ++++++-- .../models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 1aa8ad555..167c15332 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -313,7 +313,7 @@ def forward( block_mask = row_mask & col_mask # shape: (num_blocks, seq_len, seq_len) # Combine all blocks into one mask - final_mask = torch.ones((seq_len, seq_len), dtype=torch.float32) + final_mask = torch.ones((seq_len, seq_len), dtype=self.config.dtype) final_mask[block_mask.any(dim=0)] = 0 final_mask = torch.where(final_mask == 1.0, torch.finfo(q.dtype).min, final_mask) @@ -1196,5 +1196,9 @@ def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "image_size", "image_size")), + IOInfo( + 
name="pixel_values", + datatype=self.config.torch_dtype, + shape=("batch_size", 3, "image_size", "image_size"), + ), ] diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 57d9be80f..b89cb0eda 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -310,7 +310,7 @@ def forward( block_mask = row_mask & col_mask # shape: (num_blocks, seq_len, seq_len) # Combine all blocks into one mask - final_mask = torch.ones((seq_len, seq_len), dtype=torch.float32) + final_mask = torch.ones((seq_len, seq_len), dtype=self.config.dtype) final_mask[block_mask.any(dim=0)] = 0 final_mask = torch.where(final_mask == 1.0, torch.finfo(q.dtype).min, final_mask) @@ -1233,5 +1233,9 @@ def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "image_size", "image_size")), + IOInfo( + name="pixel_values", + datatype=self.config.torch_dtype, + shape=("batch_size", 3, "image_size", "image_size"), + ), ] From a02455eade5f26aec0d155b8e43e34d555ea5566 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Tue, 19 May 2026 14:43:23 +0000 Subject: [PATCH 04/11] Addressed the review comments Signed-off-by: Dipankar Sarkar --- .../transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index b89cb0eda..fbb2a04cd 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -388,6 +388,7 @@ def 
forward( past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0 blocking_config = getattr(self, "attn_blocking_config", AttentionBlockingConfig()) use_blocking = blocking_config is not None and (blocking_config.mode != BlockingMode.NONE) + breakpoint() if use_blocking: attn_output, attn_weights = generic_blocked_attention_interface( module=self, From c0f95c0045893eb700e59daac2e1e89277111739 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Tue, 19 May 2026 19:14:35 +0000 Subject: [PATCH 05/11] Cleaning done 2 Signed-off-by: Dipankar Sarkar --- .../models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index fbb2a04cd..8d45ab19d 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -388,7 +388,6 @@ def forward( past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0 blocking_config = getattr(self, "attn_blocking_config", AttentionBlockingConfig()) use_blocking = blocking_config is not None and (blocking_config.mode != BlockingMode.NONE) - breakpoint() if use_blocking: attn_output, attn_weights = generic_blocked_attention_interface( module=self, @@ -794,8 +793,8 @@ def forward( x = deepstack_features.reshape(num_features, bs * split_size, C) deepstack_features_expanded = x[:, indices1, :] image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) - # inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) - inputs_embeds = image_input_embeds + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) + # inputs_embeds = image_input_embeds 
image_mask = selected.clone() From 7e42b793560eec6dd6ecb8545632936584995880 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 20 May 2026 03:53:54 +0000 Subject: [PATCH 06/11] Cleaning Done 3 Signed-off-by: Dipankar Sarkar --- .../transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 8d45ab19d..b89cb0eda 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -793,8 +793,8 @@ def forward( x = deepstack_features.reshape(num_features, bs * split_size, C) deepstack_features_expanded = x[:, indices1, :] image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) - inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) - # inputs_embeds = image_input_embeds + # inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) + inputs_embeds = image_input_embeds image_mask = selected.clone() From 75e36b2f44752051672c73c85bbbd76b23ca62d9 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 20 May 2026 08:53:46 +0000 Subject: [PATCH 07/11] Minor Fix and cleaning Signed-off-by: Dipankar Sarkar --- .../transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index b89cb0eda..2d1d586d0 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -793,8 +793,7 @@ def forward( x = deepstack_features.reshape(num_features, bs * 
split_size, C) deepstack_features_expanded = x[:, indices1, :] image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) - # inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) - inputs_embeds = image_input_embeds + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) image_mask = selected.clone() From 87f4538ce4f8c8bf29d00a8fa8c1ae4129816766 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 20 May 2026 14:06:32 +0000 Subject: [PATCH 08/11] Cleaning Done 4 Signed-off-by: Dipankar Sarkar --- .../image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py index fee985bcd..4bd10004c 100644 --- a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py +++ b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py @@ -16,7 +16,7 @@ model_id = "Qwen/Qwen3-VL-30B-A3B-Instruct" config = AutoConfig.from_pretrained(model_id) -# For faster execution user can run with lesser layers, For Testing Purpose Only +# For faster execution user can run with lesser layers, For Testing Purpose Only. 
Please ensure to use the configuration given below as random configurations may fail due to deepstack # config.vision_config.depth = 9 # config.text_config.num_hidden_layers = 1 # config.vision_config.deepstack_visual_indexes = [8] @@ -87,6 +87,7 @@ width=536, mxfp6_matmul=True, mxint8_kv_cache=True, + split_model_io=True, aic_enable_depth_first=True, mos=1, use_onnx_subfunctions=True, From a0af1c6cc3e912cbeda6bdb785699d8128291e15 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 20 May 2026 14:29:25 +0000 Subject: [PATCH 09/11] Removing split model io flag as qgenie is raising it as a concern Signed-off-by: Dipankar Sarkar --- examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py index 4bd10004c..0583e3ebb 100644 --- a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py +++ b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py @@ -87,7 +87,6 @@ width=536, mxfp6_matmul=True, mxint8_kv_cache=True, - split_model_io=True, aic_enable_depth_first=True, mos=1, use_onnx_subfunctions=True, From 627d537b34323ed5c3b311429836885018d8c156 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Fri, 22 May 2026 12:05:52 +0000 Subject: [PATCH 10/11] Adding split model io flag to example script Signed-off-by: Dipankar Sarkar --- examples/image_text_to_text/models/qwen3_vl_moe/dbg.log | 0 .../image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py | 3 ++- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 examples/image_text_to_text/models/qwen3_vl_moe/dbg.log diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/dbg.log b/examples/image_text_to_text/models/qwen3_vl_moe/dbg.log new file mode 100644 index 000000000..e69de29bb diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py 
b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py index 0583e3ebb..a171d1535 100644 --- a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py +++ b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py @@ -79,12 +79,13 @@ ## Vision + Text ## qeff_model.compile( batch_size=batch_size, - prefill_seq_len=128, + prefill_seq_len=1, ctx_len=4096, num_cores=16, num_devices=4, height=354, width=536, + split_model_io=True, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, From 374b6edbce33440d15c9e465a1e015bb43fc0352 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Sat, 23 May 2026 10:54:55 +0000 Subject: [PATCH 11/11] Added split flag to qwen3vl example script Signed-off-by: Dipankar Sarkar --- examples/image_text_to_text/models/qwen3_vl_moe/dbg.log | 0 examples/image_text_to_text/models/qwen3vl/qwen3_vl.py | 1 + 2 files changed, 1 insertion(+) delete mode 100644 examples/image_text_to_text/models/qwen3_vl_moe/dbg.log diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/dbg.log b/examples/image_text_to_text/models/qwen3_vl_moe/dbg.log deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py index b6e78604a..6b86ea874 100644 --- a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py +++ b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py @@ -84,6 +84,7 @@ num_devices=4, height=354, width=536, + split_model_io=True, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True,