Skip to content
24 changes: 18 additions & 6 deletions QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def forward(
block_mask = row_mask & col_mask # shape: (num_blocks, seq_len, seq_len)

# Combine all blocks into one mask
final_mask = torch.ones((seq_len, seq_len), dtype=torch.float32)
final_mask = torch.ones((seq_len, seq_len), dtype=self.config.dtype)
final_mask[block_mask.any(dim=0)] = 0

final_mask = torch.where(final_mask == 1.0, torch.finfo(q.dtype).min, final_mask)
Expand Down Expand Up @@ -857,10 +857,14 @@ def get_dummy_inputs(

vision_inputs = {}
lang_inputs = {}
vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
vision_inputs["pixel_values"] = torch.zeros(
(inputs_shapes["pixel_values"]), dtype=self.model.config.torch_dtype
)
vision_inputs["image_grid_thw"] = torch.zeros((inputs_shapes["image_grid_thw"]), dtype=torch.int64)
lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32)
lang_inputs["vision_embeds"] = torch.zeros(
(inputs_shapes["vision_embeds"]), dtype=self.model.config.torch_dtype
)
lang_inputs["position_ids"] = (
(
torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
Expand All @@ -871,7 +875,9 @@ def get_dummy_inputs(
.repeat(4, 1, 1)
)
lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
lang_inputs["deepstack_features"] = torch.zeros((inputs_shapes["deepstack_features"]), dtype=torch.float32)
lang_inputs["deepstack_features"] = torch.zeros(
(inputs_shapes["deepstack_features"]), dtype=self.model.config.torch_dtype
)
# Add data for KV

bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
Expand All @@ -886,7 +892,9 @@ def get_dummy_inputs(
lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
for i in range(self.model.config.text_config.num_hidden_layers):
for kv in ["key", "value"]:
lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
lang_inputs["past_key_values"][i].append(
torch.zeros(kv_cache_shape, dtype=self.model.config.torch_dtype)
)

if continuous_batching:
lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1)
Expand Down Expand Up @@ -1188,5 +1196,9 @@ def get_inputs_info(self):
return [
IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "image_size", "image_size")),
IOInfo(
name="pixel_values",
datatype=self.config.torch_dtype,
shape=("batch_size", 3, "image_size", "image_size"),
),
]
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def forward(
block_mask = row_mask & col_mask # shape: (num_blocks, seq_len, seq_len)

# Combine all blocks into one mask
final_mask = torch.ones((seq_len, seq_len), dtype=torch.float32)
final_mask = torch.ones((seq_len, seq_len), dtype=self.config.dtype)
final_mask[block_mask.any(dim=0)] = 0

final_mask = torch.where(final_mask == 1.0, torch.finfo(q.dtype).min, final_mask)
Expand Down Expand Up @@ -793,8 +793,7 @@ def forward(
x = deepstack_features.reshape(num_features, bs * split_size, C)
deepstack_features_expanded = x[:, indices1, :]
image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds)
# inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds)
inputs_embeds = image_input_embeds
inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds)

image_mask = selected.clone()

Expand Down Expand Up @@ -895,10 +894,14 @@ def get_dummy_inputs(

vision_inputs = {}
lang_inputs = {}
vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
vision_inputs["pixel_values"] = torch.zeros(
(inputs_shapes["pixel_values"]), dtype=self.model.config.torch_dtype
)
vision_inputs["image_grid_thw"] = torch.zeros((inputs_shapes["image_grid_thw"]), dtype=torch.int64)
lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32)
lang_inputs["vision_embeds"] = torch.zeros(
(inputs_shapes["vision_embeds"]), dtype=self.model.config.torch_dtype
)
lang_inputs["position_ids"] = (
(
torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
Expand All @@ -909,7 +912,9 @@ def get_dummy_inputs(
.repeat(4, 1, 1)
)
lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
lang_inputs["deepstack_features"] = torch.zeros((inputs_shapes["deepstack_features"]), dtype=torch.float32)
lang_inputs["deepstack_features"] = torch.zeros(
(inputs_shapes["deepstack_features"]), dtype=self.model.config.torch_dtype
)
# Add data for KV

bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
Expand All @@ -924,7 +929,9 @@ def get_dummy_inputs(
lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
for i in range(self.model.config.text_config.num_hidden_layers):
for kv in ["key", "value"]:
lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
lang_inputs["past_key_values"][i].append(
torch.zeros(kv_cache_shape, dtype=self.model.config.torch_dtype)
)

if continuous_batching:
lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1)
Expand Down Expand Up @@ -1225,5 +1232,9 @@ def get_inputs_info(self):
return [
IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "image_size", "image_size")),
IOInfo(
name="pixel_values",
datatype=self.config.torch_dtype,
shape=("batch_size", 3, "image_size", "image_size"),
),
]
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
model_id = "Qwen/Qwen3-VL-30B-A3B-Instruct"
config = AutoConfig.from_pretrained(model_id)

# For faster execution user can run with lesser layers, For Testing Purpose Only
# For faster execution, the user can run with fewer layers (for testing purposes only). Please use the configuration given below, as arbitrary configurations may fail due to deepstack.
# config.vision_config.depth = 9
# config.text_config.num_hidden_layers = 1
# config.vision_config.deepstack_visual_indexes = [8]
Expand Down Expand Up @@ -79,12 +79,13 @@
## Vision + Text ##
qeff_model.compile(
batch_size=batch_size,
prefill_seq_len=128,
prefill_seq_len=1,
ctx_len=4096,
num_cores=16,
num_devices=4,
height=354,
width=536,
split_model_io=True,
mxfp6_matmul=True,
mxint8_kv_cache=True,
aic_enable_depth_first=True,
Expand Down
1 change: 1 addition & 0 deletions examples/image_text_to_text/models/qwen3vl/qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
num_devices=4,
height=354,
width=536,
split_model_io=True,
mxfp6_matmul=True,
mxint8_kv_cache=True,
aic_enable_depth_first=True,
Expand Down
Loading