CompleteMe/inference_folder.py at main · adobe-research/CompleteMe · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import json
import os
from os.path import join as osp
import numpy as np
import torch
from diffusers.models import AutoencoderKL
from PIL import Image, ImageChops
from tqdm import tqdm
import argparse
import cv2

from completeme.models import UNet2DConditionModel
from completeme.models.custom_attention_processor import (
    DecoupledCrossAttnProcessor2_0,
    set_unet_2d_condition_attn_processor,
)
from completeme.pipelines.pipeline_refs2image import Refs2ImagePipeline


def remove_module_prefix(state_dict):
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith("module."):
            new_state_dict[k[7:]] = v  # Remove "module." prefix
        else:
            new_state_dict[k] = v
    return new_state_dict


def load_images_from_dict(data_dict):
    images = {}
    image_list = []

    def recursive_load(path, key, container, image_list):
        if isinstance(path, dict):
            container[key] = {}
            for k, v in path.items():
                recursive_load(v, k, container[key], image_list)
        else:
            # Load and store the image
            image = Image.open(path)
            container[key] = image
            image_list.append(image)

    recursive_load(data_dict, "root", images, image_list)
    return images["root"], image_list


def postprocess_image(original_image, mask_image, result_image, blurring=True):
    # Convert the images to numpy arrays
    original_array = np.array(original_image)
    mask_array = np.array(mask_image)
    result_array = np.array(result_image)

    mask = np.where(mask_array > 128, 1, 0).astype(np.uint8)
    if blurring:
        mask_3 = np.stack([mask,mask,mask],-1).astype(np.uint8) * 255

        mask_alpha = mask_3.copy()
        for i in range(10):
            mask_alpha = cv2.GaussianBlur(mask_alpha, (3, 3), 0)

        mask_3_bin = mask_alpha / 255
        # Apply the mask to blend the images
        blended_array = original_array * (1 - mask_3_bin) + result_array * mask_3_bin
    else:
        mask_3 = np.stack([mask, mask, mask], -1).astype(np.uint8)
        # Apply the mask to blend the images
        blended_array = original_array * (1 - mask_3) + result_array * mask_3


    # Convert the blended array back to an image
    blended_image = Image.fromarray(blended_array.astype(np.uint8))

    return blended_image


# Argument parser
parser = argparse.ArgumentParser(description='Batch inference from folder for image, mask, and reference.')

# Arguments
parser.add_argument('--image_folder', type=str, required=True, help='Path to the folder with input images.')
parser.add_argument('--mask_folder', type=str, required=True, help='Path to the folder with mask images.')
parser.add_argument('--reference_folder', type=str, required=True, help='Path to the folder with reference images.')
parser.add_argument('--model', type=str, required=True, help='Path to the model.')
parser.add_argument('--width', type=int, required=True, help='Width to resize the images.')
parser.add_argument('--height', type=int, required=True, help='Height to resize the images.')
parser.add_argument('--output_dir', type=str, default='.', help='Output directory.')
parser.add_argument('--reference_mask_folder', type=str, default='NO', help='Path to the folder with reference mask images.')
parser.add_argument('--prompt_folder', type=str, default='NO', help='Path to the folder with prompt files.')
parser.add_argument('--guidance_scale', type=float, default=0.0, help='Guidance scale for the model.')
parser.add_argument('--blurring', type=bool, default=True, help='Apply blurring to the mask for blending.')

# Parse arguments
args = parser.parse_args()


### Define configurations ###
device = "cuda"
torch_dtype = torch.float16
seed = 42
model_dir = args.model
use_decoupled_cross_attn = True
decoupled_cross_attn_path = osp(model_dir, "decoupled_attn.pth")
guidance_scale = args.guidance_scale

### Load model ###
pipe = Refs2ImagePipeline.from_pretrained(model_dir)

if use_decoupled_cross_attn:
    print("Loading decoupled cross attention...")

    from safetensors.torch import load_file

    unet = pipe.unet
    state_dict = unet.state_dict()
    set_unet_2d_condition_attn_processor(
        unet,
        set_cross_attn_proc_func=lambda n, hs, cd, ori: DecoupledCrossAttnProcessor2_0(
            hidden_size=hs, cross_attention_dim=cd, max_image_length=6
        ),
    )
    dc_state_dict = load_file(decoupled_cross_attn_path, device="cpu")
    state_dict.update(dc_state_dict)
    state_dict = remove_module_prefix(state_dict)
    unet.load_state_dict(state_dict)
    pipe.unet = unet

pipe = pipe.to(device, dtype=torch_dtype)

generator = torch.Generator(device=device)
generator.manual_seed(seed)

# Create output directory if it doesn't exist
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# Get image, mask, and reference files from the folders
image_files = sorted(os.listdir(args.image_folder))
mask_files = sorted(os.listdir(args.mask_folder))
reference_files = sorted(os.listdir(args.reference_folder))
if args.reference_mask_folder != 'NO':
    reference_mask_files = sorted(os.listdir(args.reference_mask_folder))
if args.prompt_folder != 'NO':
    prompt_files = sorted(os.listdir(args.prompt_folder))

# Ensure that the number of images, masks, and references are the same
assert len(image_files) == len(mask_files) == len(reference_files), "Mismatch in number of images, masks, and references."

### Process each image, mask, and reference in the folders ###
for i in tqdm(range(len(image_files)), desc="Processing images", unit="image"):
    image_path = osp(args.image_folder, image_files[i])
    mask_path = osp(args.mask_folder, mask_files[i])
    reference_path = osp(args.reference_folder, reference_files[i])
    prompt_path = osp(args.prompt_folder, prompt_files[i])

    with open(prompt_path, 'r', encoding='utf-8') as f:
        prompt = f.read().strip()


    # Get the original filename (without extension) for saving output
    original_name = os.path.splitext(image_files[i])[0]

    # Load the original image to get its original size
    original_image = Image.open(image_path)
    original_size = original_image.size  # Store the original size (width, height)

    ### Define input data ###
    height = args.height
    width = args.width

    if args.reference_mask_folder != 'NO':
        reference_mask_path = osp(args.reference_mask_folder, reference_mask_files[i])
        input_dict = {
            "image": image_path,
            "mask_image": mask_path,
            "appearance": {
                "whole body clothes": reference_path,
            },
            "mask_dict": {
                "whole body clothes": reference_mask_path,
            },
        }
    else:
        input_dict = {
            "image": image_path,
            "mask_image": mask_path,
            "appearance": {
                "whole body clothes": reference_path,
            },
            "mask_dict": {},
        }
    image_dict, image_list = load_images_from_dict(input_dict)

    ### Inference ###
    images = pipe(
        **image_dict,
        prompt=prompt,
        generator=generator,
        height=height,
        width=width,
        num_inference_steps=50,
        guidance_scale=guidance_scale,
        use_decoupled_cross_attn=use_decoupled_cross_attn,
    ).images

    # Resize output images back to original size and save them using the original filename
    for j, image in enumerate(images):
        resized_image = image.resize(original_size)  # Resize to original image size
        output_path = osp(args.output_dir, f"{original_name}_output_{j}.jpg")
        resized_image.save(output_path)

    # Blend original, mask, and result images, and resize the blended image back to original size
    blended_image = postprocess_image(image_dict["image"].convert("RGB").resize((width, height)),
                                      image_dict["mask_image"].convert("L").resize((width, height)),
                                      images[0], blurring=args.blurring)
    blended_image = blended_image.resize(original_size)  # Resize to original image size
    blended_image_output_path = osp(args.output_dir, f"{original_name}_blend.jpg")
    blended_image.save(blended_image_output_path)

    # Create composite image for visualization (no need to resize back to original size)
    composite_image = image_dict["image"].convert("RGBA").resize((width, height))
    composite_mask = image_dict["mask_image"].convert("L").resize((width, height))
    composite_mask = ImageChops.invert(composite_mask)
    composite = Image.new("RGBA", (width, height), (128, 128, 128, 255))
    composite = Image.composite(composite_image, composite, composite_mask)
    # save the composite image
    composite_image = composite.resize(original_size)  # Resize to original image size
    composite_image = composite_image.convert("RGB")
    composite_image_output_path = osp(args.output_dir, f"{original_name}_composite.jpg")
    composite_image.save(composite_image_output_path)

    # Concatenate images (originals and results) for comparison
    all_images = [image.resize((width, height)).convert("RGB") for image in image_list] + [composite.convert("RGB")] + images
    total_width = sum(im.size[0] for im in all_images)
    max_height = max(im.size[1] for im in all_images)

    concat_image = Image.new("RGB", (total_width, max_height))

    x_offset = 0
    for im in all_images:
        concat_image.paste(im, (x_offset, 0))
        x_offset += im.size[0]

    # Save the visualization without resizing to the original image size
    concat_output_path = osp(args.output_dir, f"{original_name}_visualize.jpg")
    concat_image.save(concat_output_path)