diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py index c11444e6758..ec3d3e833c0 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py @@ -14,7 +14,13 @@ # limitations under the License. """ -from .get_image_preprocessor import get_image_preprocessor -from .image_preprocessor_adaptive import AdaptiveImageProcessor +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.adaptive_processor +# This file will be removed in a future version. + +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + AdaptiveImageProcessor, + get_image_preprocessor, +) __all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"] diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py index 0ff6f7d1ed5..ead34a0ce0b 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py @@ -14,21 +14,10 @@ # limitations under the License. """ -"""get image preprocessor""" +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.adaptive_processor +# This file will be removed in a future version. 
-from fastdeploy.utils import data_processor_logger - -from .image_preprocessor_adaptive import AdaptiveImageProcessor - - -def get_image_preprocessor(args): - """ - get_image_preprocessor from args - """ - - if args.vision_model_name_or_path is None: - return None - - data_processor_logger.info("use AdaptiveImageProcessor") - image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path) - return image_preprocess +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + get_image_preprocessor, +) diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py index cd81274654e..deaa5494c12 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py @@ -14,498 +14,12 @@ # limitations under the License. """ -"""image preprocessor adaptive""" - -from typing import List, Optional, Union - -import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - convert_to_rgb, - normalize, - rescale, - resize, - to_channel_dimension_format, +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.adaptive_processor +# This file will be removed in a future version. 
+ +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + AdaptiveImageProcessor, + make_batched_images, + make_batched_videos, ) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_valid_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.input.image_processors.common import is_scaled_image -from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize -from fastdeploy.utils import data_processor_logger - -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - -IMAGE_FACTOR = 28 -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 -MAX_RATIO = 200 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarrray"]], - List[List["paddle.Tensor"]], -] - - -__all__ = [ - "AdaptiveImageProcessor", -] - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. 
- """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos -def make_batched_videos(videos) -> List[VideoInput]: - """dummy""" - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - -class AdaptiveImageProcessor(BaseImageProcessor): - r""" - Constructs a adaptive image processor that dynamically resizes images based on the original images. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. 
This is a float or list of floats for each channel in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats for each channel - in the image. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spacial patch size of the vision encoder. - temporal_conv_size (`int`, *optional*, defaults to 2): - The temporal conv size in resampler. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - ] - - def __init__( - self, - do_resize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 56 * 56, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_conv_size: int = 2, - merge_size: int = 2, - **kwargs, - ) -> None: - """init""" - super().__init__(**kwargs) - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - 
self.temporal_conv_size = temporal_conv_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} - self.do_convert_rgb = do_convert_rgb - - def set_pixels(self, min_pixels=None, max_pixels=None, msg=""): - """设定pixels""" - if min_pixels is not None: - assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int" - data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}") - self.min_pixels = min_pixels - self.size["min_pixels"] = int(min_pixels) - if max_pixels is not None: - assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int" - data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}") - self.max_pixels = max_pixels - self.size["max_pixels"] = int(max_pixels) - - def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None): - """dummy""" - actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels - actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=actual_min_pixels, - max_pixels=actual_max_pixels, - ) - return (resized_height, resized_width), ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - do_resize: bool = True, - resample: PILImageResampling = None, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = False, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - predetermined_grid_thw=None, - ): - """ - Preprocess an image or batch of images. 
Copy of the `preprocess` method from `CLIPImageProcessor`. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. - If pixel values range from 0 to 1, set `do_rescale=False`. - vision_info (`List[Dict]`, *optional*): - Optional list of dictionaries containing additional information about vision inputs. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - resample (`PILImageResampling`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Mean to use if normalizing the image. - Can be a float or a list of floats corresponding to the number of channels in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Standard deviation to use if normalizing the image. - Can be a float or a list of floats corresponding to the number of channels in the image. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. 
- input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - images = make_list_of_images(images) - - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = height, width - processed_images = [] - - if predetermined_grid_thw is not None: - assert len(predetermined_grid_thw) == len( - images - ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}" - - for img_idx, image in enumerate(images): - if do_resize: - if predetermined_grid_thw is not None: - (resized_height, resized_width) = predetermined_grid_thw[img_idx] - resized_height *= self.patch_size - resized_width *= self.patch_size - else: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错 - # 直接fromarray,不要靠paddleformers里面的 - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - if do_rescale: - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - - processed_images.append(image) - patches = np.array(processed_images) - if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) - - channel = patches.shape[1] # [time, C, H, W] - grid_t = patches.shape[0] - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - patches = patches.reshape( - [ - grid_t, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # [grid_t, grid_h/merge_size, 
grid_w/merge_size, merge_size, merge_size, C, psz, psz] - patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.patch_size * self.patch_size, - ] - ) # [grid_t * grid_h * grid_w, C * psz * psz] - - return flatten_patches, (grid_t, grid_h, grid_w) - - def preprocess( - self, - images: ImageInput, - videos: VideoInput = None, - do_resize: bool = True, - size: Optional[Union[int, List[int]]] = None, - resample: PILImageResampling = None, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = False, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - predetermined_grid_thw=None, - ): - """ - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - videos (`VideoInput`): - Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If - passing in videos with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. 
- do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if images is not None: - images = make_batched_images(images) - if videos is not None: - videos = make_batched_videos(videos) - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - if images is not None: - pixel_values, vision_grid_thws = [], [] - for img_idx, image in enumerate(images): - if predetermined_grid_thw is not None: - predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]] - else: - predetermined_grid_thw_one = None - patches, image_grid_thw = self._preprocess( - image, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - predetermined_grid_thw=predetermined_grid_thw_one, - ) - pixel_values.extend(patches) - vision_grid_thws.append(image_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = { - "pixel_values": pixel_values, - "image_grid_thw": vision_grid_thws, - } - - if videos is not None: - pixel_values, vision_grid_thws = [], [] 
- for images in videos: - patches, video_grid_thw = self._preprocess( - images, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - predetermined_grid_thw=predetermined_grid_thw, - ) - pixel_values.extend(patches) - vision_grid_thws.append(video_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - - data = { - "pixel_values_videos": pixel_values, - "video_grid_thw": vision_grid_thws, - } - - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/image_processors/__init__.py b/fastdeploy/input/image_processors/__init__.py index a9cc79cc9d7..0f5df8e741b 100644 --- a/fastdeploy/input/image_processors/__init__.py +++ b/fastdeploy/input/image_processors/__init__.py @@ -11,3 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + AdaptiveImageProcessor, + get_image_preprocessor, +) +from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401 + ImageProcessor as PaddleOCRImageProcessor, +) +from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401 + ImageProcessor as Qwen3ImageProcessor, +) +from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401 + ImageProcessor as QwenImageProcessor, +) diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py new file mode 100644 index 00000000000..47e677e4917 --- /dev/null +++ b/fastdeploy/input/image_processors/adaptive_processor.py @@ -0,0 +1,527 @@ +""" +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +"""image preprocessor adaptive""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +import PIL +from paddleformers.transformers.feature_extraction_utils import BatchFeature +from paddleformers.transformers.image_processing_utils import BaseImageProcessor +from paddleformers.transformers.image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddleformers.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_valid_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType +from PIL import Image + +from fastdeploy.input.image_processors.common import is_scaled_image +from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize +from fastdeploy.utils import data_processor_logger + +OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] +OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + +IMAGE_FACTOR = 28 +MIN_PIXELS = 4 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 +MAX_RATIO = 200 + + +VideoInput = Union[ + List["PIL.Image.Image"], + "np.ndarray", + "paddle.Tensor", + List["np.ndarray"], + List["paddle.Tensor"], + List[List["PIL.Image.Image"]], + List[List["np.ndarrray"]], + List[List["paddle.Tensor"]], +] + + +__all__ = [ + "AdaptiveImageProcessor", + "get_image_preprocessor", + "make_batched_images", + "make_batched_videos", +] + + +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. 
+ """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched images from {images}") + + +# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: + """dummy""" + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] + + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] + + raise ValueError(f"Could not make batched video from {videos}") + + +class AdaptiveImageProcessor(BaseImageProcessor): + r""" + Constructs a adaptive image processor that dynamically resizes images based on the original images. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. 
This is a float or list of floats for each channel in the image. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats for each channel + in the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spacial patch size of the vision encoder. + temporal_conv_size (`int`, *optional*, defaults to 2): + The temporal conv size in resampler. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + model_input_names = [ + "pixel_values", + "image_grid_thw", + "pixel_values_videos", + "video_grid_thw", + ] + + def __init__( + self, + do_resize: bool = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + min_pixels: int = 56 * 56, + max_pixels: int = 28 * 28 * 1280, + patch_size: int = 14, + temporal_conv_size: int = 2, + merge_size: int = 2, + **kwargs, + ) -> None: + """init""" + super().__init__(**kwargs) + self.do_resize = do_resize + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.min_pixels = min_pixels + self.max_pixels = max_pixels + self.patch_size = patch_size + 
self.temporal_conv_size = temporal_conv_size + self.merge_size = merge_size + self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} + self.do_convert_rgb = do_convert_rgb + + def set_pixels(self, min_pixels=None, max_pixels=None, msg=""): + """设定pixels""" + if min_pixels is not None: + assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int" + data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}") + self.min_pixels = min_pixels + self.size["min_pixels"] = int(min_pixels) + if max_pixels is not None: + assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int" + data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}") + self.max_pixels = max_pixels + self.size["max_pixels"] = int(max_pixels) + + def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None): + """dummy""" + actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels + actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, + min_pixels=actual_min_pixels, + max_pixels=actual_max_pixels, + ) + return (resized_height, resized_width), ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) + + def _preprocess( + self, + images: Union[ImageInput, VideoInput], + do_resize: bool = True, + resample: PILImageResampling = None, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = False, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + predetermined_grid_thw=None, + ): + """ + Preprocess an image or batch of images. 
Copy of the `preprocess` method from `CLIPImageProcessor`. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. + If pixel values range from 0 to 1, set `do_rescale=False`. + vision_info (`List[Dict]`, *optional*): + Optional list of dictionaries containing additional information about vision inputs. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Mean to use if normalizing the image. + Can be a float or a list of floats corresponding to the number of channels in the image. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Standard deviation to use if normalizing the image. + Can be a float or a list of floats corresponding to the number of channels in the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. 
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            data_processor_logger.warning(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = height, width
+        processed_images = []
+
+        if predetermined_grid_thw is not None:
+            assert len(predetermined_grid_thw) == len(
+                images
+            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
+
+        for img_idx, image in enumerate(images):
+            if do_resize:
+                if predetermined_grid_thw is not None:
+                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
+                    resized_height *= self.patch_size
+                    resized_width *= self.patch_size
+                else:
+                    resized_height, resized_width = smart_resize(
+                        height,
+                        width,
+                        factor=self.patch_size * self.merge_size,
+                        min_pixels=self.min_pixels,
+                        max_pixels=self.max_pixels,
+                    )
+                image = image.astype("uint8")  # TODO: cast manually here, otherwise an extra /255 rescale corrupts the result
+                # Build the PIL image via Image.fromarray directly instead of relying on paddleformers' conversion
+                image = Image.fromarray(image)
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    data_format=input_data_format,
+                )
+            if do_rescale:
+                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
+
+            if do_normalize:
+                image = normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    data_format=input_data_format,
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
+
+            processed_images.append(image)
+        patches = np.array(processed_images)
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose([0, 3, 1, 2])
+
+        channel = patches.shape[1]  # [time, C, H, W]
+        grid_t = patches.shape[0]
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        patches = patches.reshape(
+            [
+                grid_t,
+                channel,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            ]
+        )
+        # [grid_t, grid_h/merge_size,
grid_w/merge_size, merge_size, merge_size, C, psz, psz] + patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7]) + + flatten_patches = patches.reshape( + [ + grid_t * grid_h * grid_w, + channel * self.patch_size * self.patch_size, + ] + ) # [grid_t * grid_h * grid_w, C * psz * psz] + + return flatten_patches, (grid_t, grid_h, grid_w) + + def preprocess( + self, + images: ImageInput, + videos: VideoInput = None, + do_resize: bool = True, + size: Optional[Union[int, List[int]]] = None, + resample: PILImageResampling = None, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = False, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + predetermined_grid_thw=None, + ): + """ + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + videos (`VideoInput`): + Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If + passing in videos with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. 
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if images is not None: + images = make_batched_images(images) + if videos is not None: + videos = make_batched_videos(videos) + + if images is not None and not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if images is not None: + pixel_values, vision_grid_thws = [], [] + for img_idx, image in enumerate(images): + if predetermined_grid_thw is not None: + predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]] + else: + predetermined_grid_thw_one = None + patches, image_grid_thw = self._preprocess( + image, + do_resize=do_resize, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + predetermined_grid_thw=predetermined_grid_thw_one, + ) + pixel_values.extend(patches) + vision_grid_thws.append(image_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + data = { + "pixel_values": pixel_values, + "image_grid_thw": vision_grid_thws, + } + + if videos is not None: + pixel_values, vision_grid_thws = [], [] 
+ for images in videos: + patches, video_grid_thw = self._preprocess( + images, + do_resize=do_resize, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + predetermined_grid_thw=predetermined_grid_thw, + ) + pixel_values.extend(patches) + vision_grid_thws.append(video_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + + data = { + "pixel_values_videos": pixel_values, + "video_grid_thw": vision_grid_thws, + } + + return BatchFeature(data=data, tensor_type=return_tensors) + + +def get_image_preprocessor(args): + """ + get_image_preprocessor from args + """ + + if args.vision_model_name_or_path is None: + return None + + data_processor_logger.info("use AdaptiveImageProcessor") + image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path) + return image_preprocess diff --git a/fastdeploy/input/image_processors/paddleocr_processor.py b/fastdeploy/input/image_processors/paddleocr_processor.py new file mode 100644 index 00000000000..a28f03075df --- /dev/null +++ b/fastdeploy/input/image_processors/paddleocr_processor.py @@ -0,0 +1,227 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +"""Image processor class for PaddleOCR-VL.""" + +import json +from pathlib import Path +from typing import Dict, List, Optional, Union + +import numpy as np +from paddleformers.transformers.feature_extraction_utils import BatchFeature +from paddleformers.transformers.image_processing_utils import BaseImageProcessor +from paddleformers.transformers.image_utils import ( + ImageInput, + is_valid_image, + make_list_of_images, + to_numpy_array, +) + +from fastdeploy.input.image_processors.common import ( + smart_resize_paddleocr as smart_resize, +) + +_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] +_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + + +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. + + Args: + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. + """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched images from {images}") + + +def adjust_size(size, patch_size): + num_patches = size // patch_size + if num_patches % 2 != 0: + num_patches -= 1 + return num_patches * patch_size + + +class ImageProcessor(BaseImageProcessor): + model_input_names = [ + "pixel_values", + "image_grid_thw", + "pixel_values_videos", + "video_grid_thw", + ] + + def __init__( + self, + do_resize: bool = True, + resample: int = 3, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = 
True, + min_pixels: int = 28 * 28 * 130, + max_pixels: int = 28 * 28 * 1280, + patch_size: int = 14, + temporal_patch_size: int = 1, + merge_size: int = 2, + **kwargs, + ) -> None: + super().__init__() + self.do_resize = do_resize + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD + self.min_pixels = min_pixels + self.max_pixels = max_pixels + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.merge_size = merge_size + self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used + self.do_convert_rgb = do_convert_rgb + + @classmethod + def from_pretrained(cls, pretrained_model_dir): + pretrained_model_dir = Path(pretrained_model_dir) + image_processor_config_path = pretrained_model_dir / "preprocessor_config.json" + with open(image_processor_config_path, "r", encoding="utf-8") as f: + image_processor_config = json.load(f) + return cls(**image_processor_config) + + def _preprocess( + self, + images, + do_resize: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + ): + images = make_list_of_images(images) + + if do_convert_rgb: + images = [image.convert("RGB") for image in images] + + width, height = images[0].size + resized_height, resized_width = height, width + processed_images = [] + + for image in images: + if do_resize: + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, + min_pixels=self.min_pixels, + max_pixels=self.max_pixels, + ) + + image = image.resize((resized_width, 
resized_height), resample=self.resample) + + image = to_numpy_array(image) + + if do_rescale: + image = (image * rescale_factor).astype(np.float32) + + if do_normalize: + image = image.astype(np.float32) + image -= np.array(image_mean, dtype=np.float32) + image /= np.array(image_std, dtype=np.float32) + + processed_images.append(image) + + patches = np.array(processed_images) + patches = patches.transpose(0, 3, 1, 2) + if patches.shape[0] == 1: + patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) + channel = patches.shape[1] + grid_t = patches.shape[0] // self.temporal_patch_size + grid_h, grid_w = ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) + + patches = patches.reshape( + grid_t, + self.temporal_patch_size, + channel, + grid_h, + self.patch_size, + grid_w, + self.patch_size, + ) + patches = patches.transpose(0, 3, 5, 2, 1, 4, 6) + assert self.temporal_patch_size == 1 + flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size) + return flatten_patches, np.array([grid_t, grid_h, grid_w]) + + def preprocess( + self, + images, + videos=None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors=None, + ): + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std 
is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if videos is not None: + raise NotImplementedError("Videos are not yet supported") + + patches, image_grid_thw = self._preprocess( + images, + do_resize=do_resize, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_convert_rgb=do_convert_rgb, + ) + pixel_values = np.array(patches) + data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/image_processors/qwen3_processor.py b/fastdeploy/input/image_processors/qwen3_processor.py new file mode 100644 index 00000000000..5927a0f9699 --- /dev/null +++ b/fastdeploy/input/image_processors/qwen3_processor.py @@ -0,0 +1,333 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +import PIL +from paddleformers.transformers.feature_extraction_utils import BatchFeature +from paddleformers.transformers.image_processing_utils import BaseImageProcessor +from paddleformers.transformers.image_transforms import ( + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddleformers.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_images, +) +from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType +from PIL import Image + +from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize +from fastdeploy.utils import data_processor_logger + +IMAGE_MEAN = [0.5, 0.5, 0.5] +IMAGE_STD = [0.5, 0.5, 0.5] + +MIN_PIXELS = 65536 +MAX_PIXELS = 16777216 + + +VideoInput = Union[ + List["PIL.Image.Image"], + "np.ndarray", + "paddle.Tensor", + List["np.ndarray"], + List["paddle.Tensor"], + List[List["PIL.Image.Image"]], + List[List["np.ndarray"]], + List[List["paddle.Tensor"]], +] + + +class ImageProcessor(BaseImageProcessor): + """ + Adaptive image processor for dynamic image resizing and preprocessing. + + This processor handles image resizing, rescaling, normalization and format conversion. + It dynamically adjusts image dimensions based on original size and specified constraints. 
+ """ + + def __init__( + self, + patch_size: int = 16, + merge_size: int = 2, + temporal_patch_size: int = 2, + min_pixels: int = MIN_PIXELS, + max_pixels: int = MAX_PIXELS, + image_mean: Union[float, List[float]] = IMAGE_MEAN, + image_std: Union[float, List[float]] = IMAGE_STD, + rescale_factor: float = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> None: + """ + Initialize image processor with configuration parameters. + + Args: + patch_size (int): Spatial patch size for vision encoder + merge_size (int): Merge size between vision and LLM encoders + temporal_patch_size (int): Temporal patch size for video processing + min_pixels (int): Minimum allowed pixels in resized image + max_pixels (int): Maximum allowed pixels in resized image + image_mean (float/list): Mean values for normalization per channel + image_std (float/list): Std values for normalization per channel + rescale_factor (float): Scaling factor for pixel values (default 1/255) + do_rescale (bool): Whether to rescale images + do_normalize (bool): Whether to normalize images + resample: Resampling method for image resizing + **kwargs: Additional base class arguments + """ + super().__init__(**kwargs) + self.patch_size = patch_size + self.merge_size = merge_size + self.temporal_patch_size = temporal_patch_size + + self.min_pixels = min_pixels + self.max_pixels = max_pixels + + self.image_mean = image_mean + self.image_std = image_std + self.rescale_factor = rescale_factor + self.do_rescale = do_rescale + self.do_normalize = do_normalize + + self.resample = resample + + def _preprocess( + self, + images: Union[ImageInput, VideoInput], + min_pixels: int, + max_pixels: int, + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + rescale_factor: float, + do_rescale: bool, + do_normalize: bool, + resample: PILImageResampling, + data_format: 
Optional[ChannelDimension], + input_data_format: Optional[Union[str, ChannelDimension]], + ): + """ + Internal method for image preprocessing pipeline. + + Args: + images: Input image or batch of images + min_pixels: Minimum allowed pixels in output + max_pixels: Maximum allowed pixels in output + image_mean: Normalization mean values + image_std: Normalization std values + rescale_factor: Pixel value scaling factor + do_rescale: Whether to rescale pixel values + do_normalize: Whether to normalize pixel values + resample: Resampling method + data_format: Output channel format + input_data_format: Input channel format + + Returns: + tuple: (flatten_patches, grid_dimensions) + - flatten_patches: Flattened image patches + - grid_dimensions: Grid dimensions [t, h, w] + """ + images = make_list_of_images(images) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + data_processor_logger.warning( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # Get original dimensions and calculate optimal resize dimensions + height, width = get_image_size(images[0], channel_dim=input_data_format) + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, # Combine patch and merge factors + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + + processed_images = [] + for image in images: + if height != resized_height or width != resized_width: + # Convert to uint8 before resizing to avoid double scaling + image = image.astype("uint8") + # Convert to PIL Image and resize + image = Image.fromarray(image) + image = resize( + image, + size=(resized_height, resized_width), + resample=resample, + data_format=input_data_format, + ) + + if do_rescale and do_normalize: + # Adjust mean and std for combined rescale+normalize + image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) + image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) + do_rescale = False # Skip separate rescale step + + # mutual exclusion and upper branch + if do_rescale: + image = image.astype(np.float32) + image = rescale(image, scale=rescale_factor, data_format=input_data_format) + + if do_normalize: + image = image.astype(np.float32) + image = normalize( + image=image, + mean=image_mean, + std=image_std, + data_format=input_data_format, + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] + processed_images.append(image) + + # Convert processed images to numpy array + patches = np.array(processed_images) + + # Pad temporal dimension if needed + if patches.shape[0] % self.temporal_patch_size != 0: + repeats = np.repeat( + patches[-1][np.newaxis], + self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), + axis=0, + ) + patches = np.concatenate([patches, repeats], axis=0) + + # Convert to channels-first format if needed + 
if data_format == ChannelDimension.LAST: + patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] + + grid_t, channel = patches.shape[:2] + grid_t = grid_t // self.temporal_patch_size + + grid_h, grid_w = ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) + # Reshape into hierarchical patch structure + patches = patches.reshape( + [ + grid_t, + self.temporal_patch_size, + channel, + grid_h // self.merge_size, + self.merge_size, + self.patch_size, + grid_w // self.merge_size, + self.merge_size, + self.patch_size, + ] + ) + # Reorder dimensions for better memory access pattern + # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] + patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) + + flatten_patches = patches.reshape( + [ + grid_t * grid_h * grid_w, + channel * self.temporal_patch_size * self.patch_size * self.patch_size, + ] + ) + + return flatten_patches, np.array([grid_t, grid_h, grid_w]) + + def preprocess( + self, + images: Union[ImageInput, VideoInput], + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + rescale_factor: Optional[float] = None, + do_rescale: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: Optional[PILImageResampling] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, + ): + """ + Main preprocessing method for images/videos. 
+ + Args: + images: Input image/video data + min_pixels: Override for minimum pixels + max_pixels: Override for maximum pixels + image_mean: Override for normalization mean + image_std: Override for normalization std + rescale_factor: Override for rescaling factor + do_rescale: Override for rescaling flag + do_normalize: Override for normalization flag + resample: Override for resampling method + return_tensors: Desired output tensor format + data_format: Output channel dimension format + input_data_format: Input channel dimension format + + Returns: + BatchFeature: Processed features containing: + - pixel_values: Preprocessed pixel data + - grid_thw: Grid dimensions [temporal, height, width] + + Raises: + ValueError: For invalid image types or dimensions + """ + min_pixels = min_pixels if min_pixels is not None else self.min_pixels + max_pixels = max_pixels if max_pixels is not None else self.max_pixels + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if images is not None and not valid_images(images): + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + pixel_values, grid_thw = self._preprocess( + images, + min_pixels=min_pixels, + max_pixels=max_pixels, + image_mean=image_mean, + image_std=image_std, + rescale_factor=rescale_factor, + do_rescale=do_rescale, + do_normalize=do_normalize, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + data = {"pixel_values": pixel_values, "grid_thw": grid_thw} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/image_processors/qwen_processor.py b/fastdeploy/input/image_processors/qwen_processor.py new file mode 100644 index 00000000000..7c3df2b69bf --- /dev/null +++ b/fastdeploy/input/image_processors/qwen_processor.py @@ -0,0 +1,332 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +import PIL +from paddleformers.transformers.feature_extraction_utils import BatchFeature +from paddleformers.transformers.image_processing_utils import BaseImageProcessor +from paddleformers.transformers.image_transforms import ( + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddleformers.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_images, +) +from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType +from PIL import Image + +from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize +from fastdeploy.utils import data_processor_logger + +OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] +OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + +MIN_PIXELS = 4 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 + + +VideoInput = Union[ + List["PIL.Image.Image"], + "np.ndarray", + "paddle.Tensor", + List["np.ndarray"], + List["paddle.Tensor"], + List[List["PIL.Image.Image"]], + List[List["np.ndarray"]], + List[List["paddle.Tensor"]], +] + + +class ImageProcessor(BaseImageProcessor): + """ + Adaptive image processor for dynamic image resizing and preprocessing. + + This processor handles image resizing, rescaling, normalization and format conversion. + It dynamically adjusts image dimensions based on original size and specified constraints. 
+ """ + + def __init__( + self, + patch_size: int = 14, + merge_size: int = 2, + temporal_patch_size: int = 2, + min_pixels: int = MIN_PIXELS, + max_pixels: int = MAX_PIXELS, + image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN, + image_std: Union[float, List[float]] = OPENAI_CLIP_STD, + rescale_factor: float = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> None: + """ + Initialize image processor with configuration parameters. + + Args: + patch_size (int): Spatial patch size for vision encoder + merge_size (int): Merge size between vision and LLM encoders + temporal_patch_size (int): Temporal patch size for video processing + min_pixels (int): Minimum allowed pixels in resized image + max_pixels (int): Maximum allowed pixels in resized image + image_mean (float/list): Mean values for normalization per channel + image_std (float/list): Std values for normalization per channel + rescale_factor (float): Scaling factor for pixel values (default 1/255) + do_rescale (bool): Whether to rescale images + do_normalize (bool): Whether to normalize images + resample: Resampling method for image resizing + **kwargs: Additional base class arguments + """ + super().__init__(**kwargs) + self.patch_size = patch_size + self.merge_size = merge_size + self.temporal_patch_size = temporal_patch_size + + self.min_pixels = min_pixels + self.max_pixels = max_pixels + + self.image_mean = image_mean + self.image_std = image_std + self.rescale_factor = rescale_factor + self.do_rescale = do_rescale + self.do_normalize = do_normalize + + self.resample = resample + + def _preprocess( + self, + images: Union[ImageInput, VideoInput], + min_pixels: int, + max_pixels: int, + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + rescale_factor: float, + do_rescale: bool, + do_normalize: bool, + resample: PILImageResampling, + data_format: 
Optional[ChannelDimension], + input_data_format: Optional[Union[str, ChannelDimension]], + ): + """ + Internal method for image preprocessing pipeline. + + Args: + images: Input image or batch of images + min_pixels: Minimum allowed pixels in output + max_pixels: Maximum allowed pixels in output + image_mean: Normalization mean values + image_std: Normalization std values + rescale_factor: Pixel value scaling factor + do_rescale: Whether to rescale pixel values + do_normalize: Whether to normalize pixel values + resample: Resampling method + data_format: Output channel format + input_data_format: Input channel format + + Returns: + tuple: (flatten_patches, grid_dimensions) + - flatten_patches: Flattened image patches + - grid_dimensions: Grid dimensions [t, h, w] + """ + images = make_list_of_images(images) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + data_processor_logger.warning( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # Get original dimensions and calculate optimal resize dimensions + height, width = get_image_size(images[0], channel_dim=input_data_format) + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, # Combine patch and merge factors + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + + processed_images = [] + for image in images: + if height != resized_height or width != resized_width: + # Convert to uint8 before resizing to avoid double scaling + image = image.astype("uint8") + # Convert to PIL Image and resize + image = Image.fromarray(image) + image = resize( + image, + size=(resized_height, resized_width), + resample=resample, + data_format=input_data_format, + ) + + if do_rescale and do_normalize: + # Adjust mean and std for combined rescale+normalize + image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) + image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) + do_rescale = False # Skip separate rescale step + + if do_rescale: + image = image.astype(np.float32) + image = rescale(image, scale=rescale_factor, data_format=input_data_format) + + if do_normalize: + image = image.astype(np.float32) + image = normalize( + image=image, + mean=image_mean, + std=image_std, + data_format=input_data_format, + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] + processed_images.append(image) + + # Convert processed images to numpy array + patches = np.array(processed_images) + + # Pad temporal dimension if needed + if patches.shape[0] % self.temporal_patch_size != 0: + repeats = np.repeat( + patches[-1][np.newaxis], + self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), + axis=0, + ) + patches = np.concatenate([patches, repeats], axis=0) + + # Convert to channels-first format if needed + if data_format == 
ChannelDimension.LAST: + patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] + + grid_t, channel = patches.shape[:2] + grid_t = grid_t // self.temporal_patch_size + + grid_h, grid_w = ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) + # Reshape into hierarchical patch structure + patches = patches.reshape( + [ + grid_t, + self.temporal_patch_size, + channel, + grid_h // self.merge_size, + self.merge_size, + self.patch_size, + grid_w // self.merge_size, + self.merge_size, + self.patch_size, + ] + ) + # Reorder dimensions for better memory access pattern + # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] + patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) + + flatten_patches = patches.reshape( + [ + grid_t * grid_h * grid_w, + channel * self.temporal_patch_size * self.patch_size * self.patch_size, + ] + ) + + return flatten_patches, np.array([grid_t, grid_h, grid_w]) + + def preprocess( + self, + images: Union[ImageInput, VideoInput], + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + rescale_factor: Optional[float] = None, + do_rescale: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: Optional[PILImageResampling] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, + ): + """ + Main preprocessing method for images/videos. 
+ + Args: + images: Input image/video data + min_pixels: Override for minimum pixels + max_pixels: Override for maximum pixels + image_mean: Override for normalization mean + image_std: Override for normalization std + rescale_factor: Override for rescaling factor + do_rescale: Override for rescaling flag + do_normalize: Override for normalization flag + resample: Override for resampling method + return_tensors: Desired output tensor format + data_format: Output channel dimension format + input_data_format: Input channel dimension format + + Returns: + BatchFeature: Processed features containing: + - pixel_values: Preprocessed pixel data + - grid_thw: Grid dimensions [temporal, height, width] + + Raises: + ValueError: For invalid image types or dimensions + """ + min_pixels = min_pixels if min_pixels is not None else self.min_pixels + max_pixels = max_pixels if max_pixels is not None else self.max_pixels + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if images is not None and not valid_images(images): + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + pixel_values, grid_thw = self._preprocess( + images, + min_pixels=min_pixels, + max_pixels=max_pixels, + image_mean=image_mean, + image_std=image_std, + rescale_factor=rescale_factor, + do_rescale=do_rescale, + do_normalize=do_normalize, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + data = {"pixel_values": pixel_values, "grid_thw": grid_thw} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py new file mode 100644 index 00000000000..2b9211250b2 --- /dev/null +++ b/fastdeploy/input/multimodal_processor.py @@ -0,0 +1,472 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +"""Unified multimodal processor for all VL model types. + +Consolidates the four separate VL processor wrappers (QwenVLProcessor, +Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a +single class that dispatches per ``model_type``. 
+""" + +import pickle +from collections.abc import Mapping +from typing import Any, Dict, Optional + +import numpy as np + +from fastdeploy.input.base_processor import BaseTextProcessor +from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids +from fastdeploy.utils import data_processor_logger + +QWEN_VL = "qwen_vl" +QWEN3_VL = "qwen3_vl" +PADDLEOCR_VL = "paddleocr_vl" +ERNIE4_5_VL = "ernie4_5_vl" + +_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL} + +_QWEN_EXPECTED_KWARGS = { + "video_max_frames": int, + "video_min_frames": int, +} + +_ERNIE_EXPECTED_KWARGS = { + "spatial_conv_size": int, + "temporal_conv_size": int, + "image_min_pixels": int, + "image_max_pixels": int, + "video_min_pixels": int, + "video_max_pixels": int, + "video_target_frames": int, + "video_frames_sample": str, + "video_max_frames": int, + "video_min_frames": int, + "video_fps": int, +} + +_TYPES_ACCEPT_URL_SUFFIX = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL} + +_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1} + +_SAMPLING_EPS = 1e-5 + + +class MultiModalProcessor(BaseTextProcessor): + """Unified multimodal processor for all supported VL model types. + + Dispatches image-processor creation, config initialisation, and + encoding logic based on ``model_type``. + """ + + def __init__( + self, + model_name_or_path: str, + model_type: str, + config=None, + limit_mm_per_prompt: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + reasoning_parser_obj=None, + tool_parser_obj=None, + enable_processor_cache: bool = False, + ): + if model_type not in _SUPPORTED_MODEL_TYPES: + raise ValueError( + f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}." 
+ ) + self.model_type = model_type + self.config = config + self.enable_processor_cache = enable_processor_cache + + tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto" + + super().__init__( + model_name_or_path, + tokenizer_type=tokenizer_type, + reasoning_parser_obj=reasoning_parser_obj, + tool_parser_obj=tool_parser_obj, + ) + + data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") + + processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) + self._init_mm_processor(processor_kwargs) + self._init_mm_config() + self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt) + + def _load_tokenizer(self): + """Load the appropriate tokenizer based on model_type.""" + if self.tokenizer_type == "ernie4_5": + from paddleformers.transformers import AutoTokenizer as PFAutoTokenizer + + tokenizer = PFAutoTokenizer.from_pretrained(self.model_name_or_path) + else: + from paddleformers.transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True) + return tokenizer + + def _init_mm_processor(self, processor_kwargs: dict): + """Create the model-type-specific internal DataProcessor.""" + if self.model_type == QWEN_VL: + from fastdeploy.input.qwen_vl_processor.process import DataProcessor + + tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2) + self.processor = DataProcessor( + model_path=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + tokens_per_second=tokens_per_second, + tokenizer=self.tokenizer, + **processor_kwargs, + ) + elif self.model_type == QWEN3_VL: + from fastdeploy.input.qwen3_vl_processor.process import DataProcessor + + self.processor = DataProcessor( + model_path=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + tokenizer=self.tokenizer, + **processor_kwargs, + ) + elif self.model_type == PADDLEOCR_VL: + from 
fastdeploy.input.paddleocr_vl_processor.process import DataProcessor + + tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2) + self.processor = DataProcessor( + model_path=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + tokens_per_second=tokens_per_second, + tokenizer=self.tokenizer, + **processor_kwargs, + ) + elif self.model_type == ERNIE4_5_VL: + from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor + + self.processor = DataProcessor( + tokenizer_name=self.model_name_or_path, + image_preprocessor_name=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + **processor_kwargs, + ) + self.processor.eval() + + def _init_mm_config(self): + """Set model-type-specific multimodal configuration attributes.""" + if self.model_type in (QWEN_VL, QWEN3_VL): + self.image_patch_id = self.processor.image_token_id + elif self.model_type == PADDLEOCR_VL: + self.image_patch_id = self.processor.image_patch_id + elif self.model_type == ERNIE4_5_VL: + self.image_patch_id = self.processor.image_patch_id + self.spatial_conv_size = self.processor.spatial_conv_size + + def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict: + """Parse and validate multimodal processor kwargs.""" + if not kwargs: + return {} + + try: + if not isinstance(kwargs, dict): + raise ValueError("mm-processor-kwargs must be a dictionary") + + data_processor_logger.info(f"Processing kwargs: {kwargs}") + + if self.model_type == ERNIE4_5_VL: + expected_types = _ERNIE_EXPECTED_KWARGS + else: + expected_types = _QWEN_EXPECTED_KWARGS + + for key, value in kwargs.items(): + if key in expected_types and not isinstance(value, expected_types[key]): + raise ValueError( + f"Invalid type for {key}: expected " + f"{expected_types[key].__name__}, got {type(value).__name__}" + ) + return kwargs + + except Exception as e: + data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") + 
return {} + + def _parse_limits(self, limits: Optional[dict]) -> dict: + """Parse multimodal input limits, merging with defaults.""" + if not limits: + return dict(_DEFAULT_MM_LIMITS) + + try: + if not isinstance(limits, dict): + raise ValueError("limit-mm-per-prompt must be a dictionary") + data_processor_logger.info(f"_parse_limits:{limits}") + return {**_DEFAULT_MM_LIMITS, **limits} + except Exception as e: + data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits") + return dict(_DEFAULT_MM_LIMITS) + + def _check_mm_limits(self, item): + """Validate multimodal inputs against configured limits.""" + if isinstance(item, dict): + mm_data = item + else: + mm_data = {"image": [], "video": []} + accept_url_suffix = self.model_type in _TYPES_ACCEPT_URL_SUFFIX + + for message in item: + if isinstance(message.get("content"), list): + for part in message["content"]: + part_type = part.get("type") + if accept_url_suffix: + if part_type in ("image_url", "image"): + mm_data["image"].append(part) + elif part_type in ("video_url", "video"): + mm_data["video"].append(part) + else: + if part_type == "image": + mm_data["image"].append(part) + elif part_type == "video": + mm_data["video"].append(part) + + for modality, data in mm_data.items(): + if modality in self.limit_mm_per_prompt: + limit = self.limit_mm_per_prompt[modality] + if len(data) > limit: + raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") + + def _get_processor_cache(self, socket, mm_hashes: list) -> list: + """Retrieve cached processor results for the given hashes.""" + req = pickle.dumps(mm_hashes) + socket.send_multipart([b"", req]) + _, resp = socket.recv_multipart() + mm_items = pickle.loads(resp) + data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") + return mm_items + + def _update_processor_cache(self, socket, mm_hashes: list, mm_items): + """Update the processor cache with new results.""" + req = 
pickle.dumps((mm_hashes, mm_items)) + socket.send_multipart([b"", req]) + data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]: + """Return per-modality max token counts, if available.""" + if self.model_type == ERNIE4_5_VL: + return self.processor.get_mm_max_tokens_per_item(seq_len) + return None + + def process_request_dict(self, request, max_model_len=None): + """Process a request dictionary into model inputs. + + Unified template-method flow for all VL model types. Per-model + differences are handled by small conditional branches rather than + duplicating the entire pipeline. + """ + request = self._apply_default_parameters(request) + + if not request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids + + self._process_stop_tokens(request) + + if self.model_type != PADDLEOCR_VL: + self._process_bad_words(request) + + if self.model_type == ERNIE4_5_VL: + logits_processors_args = self._prepare_think_stop_sentence( + request.get("logits_processors_args") or {}, max_model_len + ) + request["logits_processors_args"] = logits_processors_args + + outputs = self._tokenize_request(request) + + self._process_post_tokens(request, outputs) + + if self.model_type in (QWEN_VL, QWEN3_VL): + request["enable_thinking"] = False + + outputs = self.pack_outputs(outputs) + + if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"): + pass # preserve existing prompt_token_ids + else: + request["prompt_token_ids"] = outputs["input_ids"].tolist() + request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) + request["multimodal_inputs"] = outputs + + if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + + if self.model_type == ERNIE4_5_VL: + logits_processors_args = self._update_thinking_prompt_state( + 
request["prompt_token_ids"], request.get("logits_processors_args") or {} + ) + request["logits_processors_args"] = logits_processors_args + + max_tokens = max_model_len - len(request["prompt_token_ids"]) + if request.get("max_tokens") is None: + request["max_tokens"] = max(1, max_tokens) + else: + request["max_tokens"] = min(max_tokens, request["max_tokens"]) + + if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None: + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) + + if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL): + if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: + request["top_p"] = _SAMPLING_EPS + request["top_k"] = 1 + + if self.model_type != QWEN3_VL and self.reasoning_parser: + self._apply_reasoning_parser(request) + + if self.model_type == ERNIE4_5_VL: + if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False: + request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"]) + + data_processor_logger.info(f"Processed request {request}") + return request + + def _process_stop_tokens(self, request): + """Handle stop token processing based on model type.""" + if self.model_type == QWEN3_VL: + stop_sequences = request.get("stop", []) + if stop_sequences: + stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) + request["stop_token_ids"] = stop_seqs + request["stop_seqs_len"] = stop_seqs_len + else: + process_stop_token_ids(request, self.update_stop_seq) + + def _process_bad_words(self, request): + """Process bad_words into token ids.""" + bad_words = request.get("bad_words") + bad_words_token_ids = request.get("bad_words_token_ids") + if bad_words: + bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) + request["bad_words_token_ids"] = bad_words_token_ids + + def _tokenize_request(self, request): + """Core tokenization dispatch: prompt_token_ids > prompt > messages.""" + 
default_thinking = True if self.model_type == ERNIE4_5_VL else False + + if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL): + messages = request.get("messages") + if messages: + self._check_mm_limits(messages) + request.setdefault("enable_thinking", default_thinking) + return self.processor.prompt_token_ids2outputs(request) + + elif request.get("prompt"): + multimodal_data = request.get("multimodal_data") or {} + self._check_mm_limits(multimodal_data) + images = multimodal_data.get("image", None) + videos = multimodal_data.get("video", None) + if self.model_type == ERNIE4_5_VL: + request["prompt_tokens"] = request.get("prompt") + request.setdefault("enable_thinking", default_thinking) + return self.processor.text2ids(request["prompt"], images, videos) + + elif request.get("messages"): + messages = request["messages"] + self._check_mm_limits(messages) + chat_template_kwargs = request.get("chat_template_kwargs") + if chat_template_kwargs: + if isinstance(chat_template_kwargs, dict): + for k, v in chat_template_kwargs.items(): + if k not in request or request[k] is None: + request[k] = v + else: + raise ValueError("Invalid input: chat_template_kwargs must be a dict") + request.setdefault("enable_thinking", default_thinking) + return self.processor.request2ids(request) + + else: + raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") + + def _process_post_tokens(self, request, outputs): + """Handle post-tokenization token appending.""" + if self.model_type == PADDLEOCR_VL: + metadata = request.get("metadata") + if metadata and metadata.get("generated_token_ids"): + self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"]) + else: + if request.get("completion_token_ids"): + self.append_completion_tokens(outputs, request["completion_token_ids"]) + + def _apply_reasoning_parser(self, request): + """Apply reasoning parser and update model status dict.""" + model_status = 
self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" + + def append_completion_tokens(self, multimodal_inputs, completion_token_ids): + """Append completion tokens to existing multimodal outputs.""" + if self.model_type == ERNIE4_5_VL: + self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids) + else: + self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids) + + def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids): + """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl.""" + num_tokens = len(completion_token_ids) + multimodal_inputs["input_ids"].extend(completion_token_ids) + multimodal_inputs["token_type_ids"].extend([0] * num_tokens) + + pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens) + multimodal_inputs["position_ids"].append(pos_ids) + multimodal_inputs["cur_position"] += num_tokens + + def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids): + """Append completion tokens for ernie4_5_vl.""" + num_tokens = len(completion_token_ids) + multimodal_inputs["input_ids"].extend(completion_token_ids) + multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) + + start = multimodal_inputs["cur_position"] + for i in range(num_tokens): + multimodal_inputs["position_ids"].append([start + i] * 3) + multimodal_inputs["cur_position"] += num_tokens + + def pack_outputs(self, outputs): + """Convert intermediate processing outputs to final format.""" + if not outputs["images"]: + outputs["images"] = None + 
outputs["grid_thw"] = None + outputs["image_type_ids"] = None + else: + outputs["images"] = np.vstack(outputs["images"]) + outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) + outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) + + outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) + outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) + outputs["mm_num_token_func"] = self.processor.mm_num_tokens + + if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL): + outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64) + outputs["image_patch_id"] = self.processor.image_token_id + outputs["video_patch_id"] = self.processor.video_token_id + outputs["position_ids"] = outputs["position_ids"].transpose(1, 0) + else: + outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64) + outputs["image_patch_id"] = self.image_patch_id + + return outputs diff --git a/fastdeploy/input/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/paddleocr_vl_processor/image_processor.py index a6e318e1ed7..ef86d77b714 100644 --- a/fastdeploy/input/paddleocr_vl_processor/image_processor.py +++ b/fastdeploy/input/paddleocr_vl_processor/image_processor.py @@ -14,216 +14,12 @@ # limitations under the License. 
""" -"""Image processor class for Keye.""" - -# TODO: Support videos - -import json -from pathlib import Path -from typing import Dict, List, Optional, Union - -import numpy as np -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_utils import ( - ImageInput, - is_valid_image, - make_list_of_images, - to_numpy_array, -) - -from fastdeploy.input.image_processors.common import ( - smart_resize_paddleocr as smart_resize, +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.paddleocr_processor +# This file will be removed in a future version. + +from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401 + ImageProcessor, + make_batched_images, + smart_resize, ) - -_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. 
- """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -def adjust_size(size, patch_size): - num_patches = size // patch_size - if num_patches % 2 != 0: - num_patches -= 1 - return num_patches * patch_size - - -class ImageProcessor(BaseImageProcessor): - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - ] - - def __init__( - self, - do_resize: bool = True, - resample: int = 3, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 28 * 28 * 130, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_patch_size: int = 1, - merge_size: int = 2, - **kwargs, - ) -> None: - super().__init__() - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used - self.do_convert_rgb = do_convert_rgb - - @classmethod - def from_pretrained(cls, pretrained_model_dir): - pretrained_model_dir = Path(pretrained_model_dir) - image_processor_config_path = pretrained_model_dir / 
"preprocessor_config.json" - with open(image_processor_config_path, "r", encoding="utf-8") as f: - image_processor_config = json.load(f) - return cls(**image_processor_config) - - def _preprocess( - self, - images, - do_resize: Optional[bool] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - ): - images = make_list_of_images(images) - - if do_convert_rgb: - images = [image.convert("RGB") for image in images] - - width, height = images[0].size - resized_height, resized_width = height, width - processed_images = [] - - for image in images: - if do_resize: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - - image = image.resize((resized_width, resized_height), resample=self.resample) - - image = to_numpy_array(image) - - if do_rescale: - image = (image * rescale_factor).astype(np.float32) - - if do_normalize: - image = image.astype(np.float32) - image -= np.array(image_mean, dtype=np.float32) - image /= np.array(image_std, dtype=np.float32) - - processed_images.append(image) - - patches = np.array(processed_images) - patches = patches.transpose(0, 3, 1, 2) - if patches.shape[0] == 1: - patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) - channel = patches.shape[1] - grid_t = patches.shape[0] // self.temporal_patch_size - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - - patches = patches.reshape( - grid_t, - self.temporal_patch_size, - channel, - grid_h, - self.patch_size, - grid_w, - self.patch_size, - ) - patches = patches.transpose(0, 3, 5, 2, 1, 4, 6) - assert self.temporal_patch_size == 1 - flatten_patches = patches.reshape(grid_t * grid_h * 
grid_w, channel, self.patch_size, self.patch_size) - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images, - videos=None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - return_tensors=None, - ): - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if videos is not None: - raise NotImplementedError("Videos are not yet supported") - - patches, image_grid_thw = self._preprocess( - images, - do_resize=do_resize, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_convert_rgb=do_convert_rgb, - ) - pixel_values = np.array(patches) - data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 8568d1ff32d..0261177eeaa 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -106,74 +106,34 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) else: - if ErnieArchitectures.contains_ernie_arch(architecture): - if not 
envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor, - ) - else: - from fastdeploy.input.v1.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor, - ) + from fastdeploy.input.multimodal_processor import ( + ERNIE4_5_VL, + PADDLEOCR_VL, + QWEN3_VL, + QWEN_VL, + MultiModalProcessor, + ) - self.processor = Ernie4_5_VLProcessor( - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - tool_parser_obj=tool_parser_obj, - enable_processor_cache=self.enable_processor_cache, - ) + if ErnieArchitectures.contains_ernie_arch(architecture): + model_type = ERNIE4_5_VL elif "PaddleOCRVL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, - ) - else: - from fastdeploy.input.v1.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, - ) - - self.processor = PaddleOCRVLProcessor( - config=self.model_config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - ) + model_type = PADDLEOCR_VL elif "Qwen2_5_VL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.qwen_vl_processor import QwenVLProcessor - else: - from fastdeploy.input.v1.qwen_vl_processor import ( - QwenVLProcessor, - ) - - self.processor = QwenVLProcessor( - config=self.model_config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - enable_processor_cache=self.enable_processor_cache, - ) + model_type = QWEN_VL elif "Qwen3VL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor - 
else: - from fastdeploy.input.v1.qwen3_vl_processor import ( - Qwen3VLProcessor, - ) - - self.processor = Qwen3VLProcessor( - config=self.model_config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - enable_processor_cache=self.enable_processor_cache, - ) + model_type = QWEN3_VL else: raise ValueError(f"Unsupported model processor architecture: {architecture}. ") + self.processor = MultiModalProcessor( + model_name_or_path=self.model_name_or_path, + model_type=model_type, + config=self.model_config, + limit_mm_per_prompt=self.limit_mm_per_prompt, + mm_processor_kwargs=self.mm_processor_kwargs, + reasoning_parser_obj=reasoning_parser_obj, + tool_parser_obj=tool_parser_obj, + enable_processor_cache=self.enable_processor_cache, + ) + return self.processor diff --git a/fastdeploy/input/qwen3_vl_processor/image_processor.py b/fastdeploy/input/qwen3_vl_processor/image_processor.py index 5927a0f9699..2b0afe4c047 100644 --- a/fastdeploy/input/qwen3_vl_processor/image_processor.py +++ b/fastdeploy/input/qwen3_vl_processor/image_processor.py @@ -14,320 +14,10 @@ # limitations under the License. """ -from typing import List, Optional, Union +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.qwen3_processor +# This file will be removed in a future version. 
-import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, +from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401 + ImageProcessor, ) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize -from fastdeploy.utils import data_processor_logger - -IMAGE_MEAN = [0.5, 0.5, 0.5] -IMAGE_STD = [0.5, 0.5, 0.5] - -MIN_PIXELS = 65536 -MAX_PIXELS = 16777216 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarray"]], - List[List["paddle.Tensor"]], -] - - -class ImageProcessor(BaseImageProcessor): - """ - Adaptive image processor for dynamic image resizing and preprocessing. - - This processor handles image resizing, rescaling, normalization and format conversion. - It dynamically adjusts image dimensions based on original size and specified constraints. 
- """ - - def __init__( - self, - patch_size: int = 16, - merge_size: int = 2, - temporal_patch_size: int = 2, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - image_mean: Union[float, List[float]] = IMAGE_MEAN, - image_std: Union[float, List[float]] = IMAGE_STD, - rescale_factor: float = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - **kwargs, - ) -> None: - """ - Initialize image processor with configuration parameters. - - Args: - patch_size (int): Spatial patch size for vision encoder - merge_size (int): Merge size between vision and LLM encoders - temporal_patch_size (int): Temporal patch size for video processing - min_pixels (int): Minimum allowed pixels in resized image - max_pixels (int): Maximum allowed pixels in resized image - image_mean (float/list): Mean values for normalization per channel - image_std (float/list): Std values for normalization per channel - rescale_factor (float): Scaling factor for pixel values (default 1/255) - do_rescale (bool): Whether to rescale images - do_normalize (bool): Whether to normalize images - resample: Resampling method for image resizing - **kwargs: Additional base class arguments - """ - super().__init__(**kwargs) - self.patch_size = patch_size - self.merge_size = merge_size - self.temporal_patch_size = temporal_patch_size - - self.min_pixels = min_pixels - self.max_pixels = max_pixels - - self.image_mean = image_mean - self.image_std = image_std - self.rescale_factor = rescale_factor - self.do_rescale = do_rescale - self.do_normalize = do_normalize - - self.resample = resample - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: int, - max_pixels: int, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], - rescale_factor: float, - do_rescale: bool, - do_normalize: bool, - resample: PILImageResampling, - data_format: 
Optional[ChannelDimension], - input_data_format: Optional[Union[str, ChannelDimension]], - ): - """ - Internal method for image preprocessing pipeline. - - Args: - images: Input image or batch of images - min_pixels: Minimum allowed pixels in output - max_pixels: Maximum allowed pixels in output - image_mean: Normalization mean values - image_std: Normalization std values - rescale_factor: Pixel value scaling factor - do_rescale: Whether to rescale pixel values - do_normalize: Whether to normalize pixel values - resample: Resampling method - data_format: Output channel format - input_data_format: Input channel format - - Returns: - tuple: (flatten_patches, grid_dimensions) - - flatten_patches: Flattened image patches - - grid_dimensions: Grid dimensions [t, h, w] - """ - images = make_list_of_images(images) - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - # Get original dimensions and calculate optimal resize dimensions - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, # Combine patch and merge factors - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - - processed_images = [] - for image in images: - if height != resized_height or width != resized_width: - # Convert to uint8 before resizing to avoid double scaling - image = image.astype("uint8") - # Convert to PIL Image and resize - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - - if do_rescale and do_normalize: - # Adjust mean and std for combined rescale+normalize - image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) - image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) - do_rescale = False # Skip separate rescale step - - # mutual exclusion and upper branch - if do_rescale: - image = image.astype(np.float32) - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = image.astype(np.float32) - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - processed_images.append(image) - - # Convert processed images to numpy array - patches = np.array(processed_images) - - # Pad temporal dimension if needed - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], - self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), - axis=0, - ) - patches = np.concatenate([patches, repeats], axis=0) - - # Convert to channels-first format if needed - 
if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] - - grid_t, channel = patches.shape[:2] - grid_t = grid_t // self.temporal_patch_size - - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - # Reshape into hierarchical patch structure - patches = patches.reshape( - [ - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # Reorder dimensions for better memory access pattern - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] - patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - ) - - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - rescale_factor: Optional[float] = None, - do_rescale: Optional[bool] = None, - do_normalize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, - ): - """ - Main preprocessing method for images/videos. 
- - Args: - images: Input image/video data - min_pixels: Override for minimum pixels - max_pixels: Override for maximum pixels - image_mean: Override for normalization mean - image_std: Override for normalization std - rescale_factor: Override for rescaling factor - do_rescale: Override for rescaling flag - do_normalize: Override for normalization flag - resample: Override for resampling method - return_tensors: Desired output tensor format - data_format: Output channel dimension format - input_data_format: Input channel dimension format - - Returns: - BatchFeature: Processed features containing: - - pixel_values: Preprocessed pixel data - - grid_thw: Grid dimensions [temporal, height, width] - - Raises: - ValueError: For invalid image types or dimensions - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - pixel_values, grid_thw = self._preprocess( - images, - min_pixels=min_pixels, - max_pixels=max_pixels, - image_mean=image_mean, - image_std=image_std, - rescale_factor=rescale_factor, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - ) - data = {"pixel_values": pixel_values, "grid_thw": grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/qwen_vl_processor/image_processor.py b/fastdeploy/input/qwen_vl_processor/image_processor.py index 7c3df2b69bf..3a5a77ea6d8 100644 --- a/fastdeploy/input/qwen_vl_processor/image_processor.py +++ b/fastdeploy/input/qwen_vl_processor/image_processor.py @@ -14,319 +14,10 @@ # limitations under the License. """ -from typing import List, Optional, Union +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.qwen_processor +# This file will be removed in a future version. 
-import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, +from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401 + ImageProcessor, ) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize -from fastdeploy.utils import data_processor_logger - -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarray"]], - List[List["paddle.Tensor"]], -] - - -class ImageProcessor(BaseImageProcessor): - """ - Adaptive image processor for dynamic image resizing and preprocessing. - - This processor handles image resizing, rescaling, normalization and format conversion. - It dynamically adjusts image dimensions based on original size and specified constraints. 
- """ - - def __init__( - self, - patch_size: int = 14, - merge_size: int = 2, - temporal_patch_size: int = 2, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN, - image_std: Union[float, List[float]] = OPENAI_CLIP_STD, - rescale_factor: float = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - **kwargs, - ) -> None: - """ - Initialize image processor with configuration parameters. - - Args: - patch_size (int): Spatial patch size for vision encoder - merge_size (int): Merge size between vision and LLM encoders - temporal_patch_size (int): Temporal patch size for video processing - min_pixels (int): Minimum allowed pixels in resized image - max_pixels (int): Maximum allowed pixels in resized image - image_mean (float/list): Mean values for normalization per channel - image_std (float/list): Std values for normalization per channel - rescale_factor (float): Scaling factor for pixel values (default 1/255) - do_rescale (bool): Whether to rescale images - do_normalize (bool): Whether to normalize images - resample: Resampling method for image resizing - **kwargs: Additional base class arguments - """ - super().__init__(**kwargs) - self.patch_size = patch_size - self.merge_size = merge_size - self.temporal_patch_size = temporal_patch_size - - self.min_pixels = min_pixels - self.max_pixels = max_pixels - - self.image_mean = image_mean - self.image_std = image_std - self.rescale_factor = rescale_factor - self.do_rescale = do_rescale - self.do_normalize = do_normalize - - self.resample = resample - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: int, - max_pixels: int, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], - rescale_factor: float, - do_rescale: bool, - do_normalize: bool, - resample: PILImageResampling, - data_format: 
Optional[ChannelDimension], - input_data_format: Optional[Union[str, ChannelDimension]], - ): - """ - Internal method for image preprocessing pipeline. - - Args: - images: Input image or batch of images - min_pixels: Minimum allowed pixels in output - max_pixels: Maximum allowed pixels in output - image_mean: Normalization mean values - image_std: Normalization std values - rescale_factor: Pixel value scaling factor - do_rescale: Whether to rescale pixel values - do_normalize: Whether to normalize pixel values - resample: Resampling method - data_format: Output channel format - input_data_format: Input channel format - - Returns: - tuple: (flatten_patches, grid_dimensions) - - flatten_patches: Flattened image patches - - grid_dimensions: Grid dimensions [t, h, w] - """ - images = make_list_of_images(images) - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - # Get original dimensions and calculate optimal resize dimensions - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, # Combine patch and merge factors - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - - processed_images = [] - for image in images: - if height != resized_height or width != resized_width: - # Convert to uint8 before resizing to avoid double scaling - image = image.astype("uint8") - # Convert to PIL Image and resize - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - - if do_rescale and do_normalize: - # Adjust mean and std for combined rescale+normalize - image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) - image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) - do_rescale = False # Skip separate rescale step - - if do_rescale: - image = image.astype(np.float32) - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = image.astype(np.float32) - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - processed_images.append(image) - - # Convert processed images to numpy array - patches = np.array(processed_images) - - # Pad temporal dimension if needed - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], - self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), - axis=0, - ) - patches = np.concatenate([patches, repeats], axis=0) - - # Convert to channels-first format if needed - if data_format == 
ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] - - grid_t, channel = patches.shape[:2] - grid_t = grid_t // self.temporal_patch_size - - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - # Reshape into hierarchical patch structure - patches = patches.reshape( - [ - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # Reorder dimensions for better memory access pattern - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] - patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - ) - - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - rescale_factor: Optional[float] = None, - do_rescale: Optional[bool] = None, - do_normalize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, - ): - """ - Main preprocessing method for images/videos. 
- - Args: - images: Input image/video data - min_pixels: Override for minimum pixels - max_pixels: Override for maximum pixels - image_mean: Override for normalization mean - image_std: Override for normalization std - rescale_factor: Override for rescaling factor - do_rescale: Override for rescaling flag - do_normalize: Override for normalization flag - resample: Override for resampling method - return_tensors: Desired output tensor format - data_format: Output channel dimension format - input_data_format: Input channel dimension format - - Returns: - BatchFeature: Processed features containing: - - pixel_values: Preprocessed pixel data - - grid_thw: Grid dimensions [temporal, height, width] - - Raises: - ValueError: For invalid image types or dimensions - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - pixel_values, grid_thw = self._preprocess( - images, - min_pixels=min_pixels, - max_pixels=max_pixels, - image_mean=image_mean, - image_std=image_std, - rescale_factor=rescale_factor, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - ) - data = {"pixel_values": pixel_values, "grid_thw": grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/tests/input/test_image_preprocessor_adaptive.py b/tests/input/test_image_preprocessor_adaptive.py index cc9ed857554..d01ce6e179e 100644 --- a/tests/input/test_image_preprocessor_adaptive.py +++ b/tests/input/test_image_preprocessor_adaptive.py @@ -340,9 +340,7 @@ def test_preprocess_scaled_image_warning(self): # Create a scaled image (values between 0-1) img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 # Use patch to capture warning - with patch( - "fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger" - ) as mock_logger: + with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger: # Directly call _preprocess, pass scaled image self.processor._preprocess( [img_array], # Pass scaled numpy array @@ -356,9 +354,7 @@ def test_preprocess_invalid_images_check(self): """Test invalid image check in preprocess (line 464)""" # Test invalid image type - need to ensure valid_images returns False # Use patch to make valid_images return False, but make_batched_images succeeds - with patch( - "fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images" - ) as mock_valid: + with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid: mock_valid.return_value = False valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False with 
self.assertRaises(ValueError) as context: