Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions bailingmm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 128

# Accepted video container shapes: a single video as a list of PIL frames,
# a raw array/tensor, a batch given as a list of arrays/tensors, or a batch
# of frame lists. Forward references are used so PIL / numpy / torch stay
# optional at annotation time.
VideoInput = Union[
    List["Image.Image"],
    "np.ndarray",
    "torch.Tensor",
    List["np.ndarray"],
    List["torch.Tensor"],
    List[List["Image.Image"]],
    # Fixed typo: was "np.ndarrray" (three r's), which could never resolve
    # as a forward reference.
    List[List["np.ndarray"]],
    List[List["torch.Tensor"]],
]

def is_decord_available() -> bool:
    """Return True when the optional ``decord`` video backend is importable."""
    from importlib.util import find_spec

    return find_spec("decord") is not None
Expand Down Expand Up @@ -504,3 +515,59 @@ def process_vision_info(
if len(audio_inputs) == 0:
audio_inputs = None
return image_inputs, video_inputs, audio_inputs

def get_closest_ratio(height: float, width: float, aspect_ratios: dict):
    """Find the bucket whose aspect ratio best matches ``height / width``.

    ``aspect_ratios`` maps aspect-ratio strings (e.g. ``"1.33"``) to bucket
    values. Returns ``(bucket_value, ratio_as_float)`` for the key that is
    numerically closest to the input's height/width ratio.
    """
    target = height / width
    best_key = min(aspect_ratios, key=lambda k: abs(float(k) - target))
    return aspect_ratios[best_key], float(best_key)

# Candidate (height, width) buckets around a 512-px budget, keyed by their
# aspect ratio (height / width) serialized as a string. Hoisted to module
# level: it is a constant and used to be rebuilt on every process_ratio call.
_ASPECT_RATIO_512 = {
    "0.25": [256, 1024],
    "0.26": [256, 992],
    "0.27": [256, 960],
    "0.28": [256, 928],
    "0.32": [288, 896],
    "0.33": [288, 864],
    "0.35": [288, 832],
    "0.4": [320, 800],
    "0.42": [320, 768],
    "0.48": [352, 736],
    "0.5": [352, 704],
    "0.52": [352, 672],
    "0.57": [384, 672],
    "0.6": [384, 640],
    "0.68": [416, 608],
    "0.72": [416, 576],
    "0.78": [448, 576],
    "0.82": [448, 544],
    "0.88": [480, 544],
    "0.94": [480, 512],
    "1.0": [512, 512],
    "1.07": [512, 480],
    "1.13": [544, 480],
    "1.21": [544, 448],
    "1.29": [576, 448],
    "1.38": [576, 416],
    "1.46": [608, 416],
    "1.67": [640, 384],
    "1.75": [672, 384],
    "2.0": [704, 352],
    "2.09": [736, 352],
    "2.4": [768, 320],
    "2.5": [800, 320],
    "2.89": [832, 288],
    "3.0": [864, 288],
    "3.11": [896, 288],
    "3.62": [928, 256],
    "3.75": [960, 256],
    "3.88": [992, 256],
    "4.0": [1024, 256],
}


def process_ratio(ori_h, ori_w):
    """Map an original image size onto the closest predefined 512-px bucket.

    Parameters:
        ori_h: original height in pixels.
        ori_w: original width in pixels.

    Returns:
        ``(closest_size, resize_size)`` where ``closest_size`` is the
        ``[target_h, target_w]`` bucket with the nearest aspect ratio, and
        ``resize_size`` is the ``(h, w)`` to scale the original to while
        preserving its aspect ratio so it fully covers the bucket
        (presumably followed by a crop downstream — confirm at the caller).
    """
    aspect = ori_h / ori_w
    # Inline closest-ratio lookup over the hoisted constant table.
    best_key = min(_ASPECT_RATIO_512, key=lambda k: abs(float(k) - aspect))
    closest_size = [int(v) for v in _ASPECT_RATIO_512[best_key]]
    if closest_size[0] / ori_h > closest_size[1] / ori_w:
        # Height is the binding dimension: scale by height, derive width.
        resize_size = closest_size[0], int(ori_w * closest_size[0] / ori_h)
    else:
        # Width is the binding dimension: scale by width, derive height.
        resize_size = int(ori_h * closest_size[1] / ori_w), closest_size[1]
    return closest_size, resize_size
2 changes: 1 addition & 1 deletion image_processing_bailingmm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
resize,
to_channel_dimension_format,
)
from transformers.video_utils import VideoInput
from bailingmm_utils import VideoInput
from transformers.image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
Expand Down
13 changes: 12 additions & 1 deletion modeling_bailing_moe_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1561,6 +1561,8 @@ def forward(
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
second_per_grid_ts: Optional[torch.Tensor] = None,
image_mask=None,
audio_mask=None,
**kwargs,
) -> Union[Tuple, MoeModelOutputWithPast]:

Expand Down Expand Up @@ -1595,7 +1597,12 @@ def forward(

# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
assert input_ids.size(1) == inputs_embeds.size(1), "{} vs {}".format(
input_ids.size,
inputs_embeds.size,
)
batch_size, seq_length = inputs_embeds.shape[:2]
#raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
Copy link

Copilot AI Sep 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commented-out error handling should be removed or replaced with proper logic. If this validation is no longer needed, remove the comment entirely.

Suggested change
#raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")

Copilot uses AI. Check for mistakes.
elif input_ids is not None:
batch_size, seq_length = input_ids.shape[:2]
elif inputs_embeds is not None:
Expand Down Expand Up @@ -1810,6 +1817,8 @@ def forward(
return_dict: Optional[bool] = None,
second_per_grid_ts: Optional[torch.Tensor] = None,
num_logits_to_keep: Optional[int] = 0,
image_mask=None,
audio_mask=None,
**kwargs,
) -> Union[Tuple, MoeCausalLMOutputWithPast]:
r"""
Expand Down Expand Up @@ -1865,6 +1874,8 @@ def forward(
output_router_logits=output_router_logits,
return_dict=return_dict,
second_per_grid_ts=second_per_grid_ts,
image_mask=image_mask,
audio_mask=audio_mask,
**kwargs,
)

Expand Down
Loading