diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 06237df6d3..167c153322 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -313,7 +313,7 @@ def forward( block_mask = row_mask & col_mask # shape: (num_blocks, seq_len, seq_len) # Combine all blocks into one mask - final_mask = torch.ones((seq_len, seq_len), dtype=torch.float32) + final_mask = torch.ones((seq_len, seq_len), dtype=self.config.dtype) final_mask[block_mask.any(dim=0)] = 0 final_mask = torch.where(final_mask == 1.0, torch.finfo(q.dtype).min, final_mask) @@ -857,10 +857,14 @@ def get_dummy_inputs( vision_inputs = {} lang_inputs = {} - vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32) + vision_inputs["pixel_values"] = torch.zeros( + (inputs_shapes["pixel_values"]), dtype=self.model.config.torch_dtype + ) vision_inputs["image_grid_thw"] = torch.zeros((inputs_shapes["image_grid_thw"]), dtype=torch.int64) lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) - lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32) + lang_inputs["vision_embeds"] = torch.zeros( + (inputs_shapes["vision_embeds"]), dtype=self.model.config.torch_dtype + ) lang_inputs["position_ids"] = ( ( torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) @@ -871,7 +875,9 @@ def get_dummy_inputs( .repeat(4, 1, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) - lang_inputs["deepstack_features"] = torch.zeros((inputs_shapes["deepstack_features"]), dtype=torch.float32) + lang_inputs["deepstack_features"] = torch.zeros( + (inputs_shapes["deepstack_features"]), dtype=self.model.config.torch_dtype + ) # Add data for KV bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE @@ -886,7 +892,9 @@ def get_dummy_inputs( lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] for i in range(self.model.config.text_config.num_hidden_layers): for kv in ["key", "value"]: - lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) + lang_inputs["past_key_values"][i].append( + torch.zeros(kv_cache_shape, dtype=self.model.config.torch_dtype) + ) if continuous_batching: lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) @@ -1188,5 +1196,9 @@ def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "image_size", "image_size")), + IOInfo( + name="pixel_values", + datatype=self.config.torch_dtype, + shape=("batch_size", 3, "image_size", "image_size"), + ), ] diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 81d0da2bc1..2d1d586d08 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -310,7 +310,7 @@ def forward( block_mask = row_mask & col_mask # shape: (num_blocks, seq_len, seq_len) # Combine all blocks into one mask - final_mask = torch.ones((seq_len, seq_len), dtype=torch.float32) + final_mask = torch.ones((seq_len, seq_len), dtype=self.config.dtype) final_mask[block_mask.any(dim=0)] = 0 final_mask = torch.where(final_mask == 1.0, torch.finfo(q.dtype).min, final_mask) @@ -793,8 +793,7 @@ def forward( x = deepstack_features.reshape(num_features, bs * split_size, C) deepstack_features_expanded = x[:, indices1, :] image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) - # inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) - inputs_embeds = image_input_embeds + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) image_mask = selected.clone() @@ -895,10 +894,14 @@ def get_dummy_inputs( vision_inputs = {} lang_inputs = {} - vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32) + vision_inputs["pixel_values"] = torch.zeros( + (inputs_shapes["pixel_values"]), dtype=self.model.config.torch_dtype + ) vision_inputs["image_grid_thw"] = torch.zeros((inputs_shapes["image_grid_thw"]), dtype=torch.int64) lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) - lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32) + lang_inputs["vision_embeds"] = torch.zeros( + (inputs_shapes["vision_embeds"]), dtype=self.model.config.torch_dtype + ) lang_inputs["position_ids"] = ( ( torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) @@ -909,7 +912,9 @@ def get_dummy_inputs( .repeat(4, 1, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) - lang_inputs["deepstack_features"] = torch.zeros((inputs_shapes["deepstack_features"]), dtype=torch.float32) + lang_inputs["deepstack_features"] = torch.zeros( + (inputs_shapes["deepstack_features"]), dtype=self.model.config.torch_dtype + ) # Add data for KV bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE @@ -924,7 +929,9 @@ def get_dummy_inputs( lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] for i in range(self.model.config.text_config.num_hidden_layers): for kv in ["key", "value"]: - lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) + lang_inputs["past_key_values"][i].append( + torch.zeros(kv_cache_shape, dtype=self.model.config.torch_dtype) + ) if continuous_batching: lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) @@ -1225,5 +1232,9 @@ def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "image_size", "image_size")), + IOInfo( + name="pixel_values", + datatype=self.config.torch_dtype, + shape=("batch_size", 3, "image_size", "image_size"), + ), ] diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py index fee985bcd1..a171d1535f 100644 --- a/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py +++ b/examples/image_text_to_text/models/qwen3_vl_moe/qwen3_vl_moe.py @@ -16,7 +16,7 @@ model_id = "Qwen/Qwen3-VL-30B-A3B-Instruct" config = AutoConfig.from_pretrained(model_id) -# For faster execution user can run with lesser layers, For Testing Purpose Only +# For faster execution user can run with lesser layers, For Testing Purpose Only. Please ensure to use the configuration given below as random configurations may fail due to deepstack # config.vision_config.depth = 9 # config.text_config.num_hidden_layers = 1 # config.vision_config.deepstack_visual_indexes = [8] @@ -79,12 +79,13 @@ ## Vision + Text ## qeff_model.compile( batch_size=batch_size, - prefill_seq_len=128, + prefill_seq_len=1, ctx_len=4096, num_cores=16, num_devices=4, height=354, width=536, + split_model_io=True, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, diff --git a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py index b6e78604ab..6b86ea874a 100644 --- a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py +++ b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py @@ -84,6 +84,7 @@ num_devices=4, height=354, width=536, + split_model_io=True, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True,