From 0c09523d5e6b35e446528988538cbe80754ec1cd Mon Sep 17 00:00:00 2001 From: zwplus <158684334@qq.com> Date: Sat, 7 Mar 2026 21:47:31 +0800 Subject: [PATCH 1/8] Fix wan-animate temporal alignment without dropping frames --- diffsynth/pipelines/wan_video.py | 38 ++++++++++---------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/diffsynth/pipelines/wan_video.py b/diffsynth/pipelines/wan_video.py index bbc479e29..f347a8d24 100644 --- a/diffsynth/pipelines/wan_video.py +++ b/diffsynth/pipelines/wan_video.py @@ -66,7 +66,6 @@ def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16): WanVideoUnit_FunCameraControl(), WanVideoUnit_SpeedControl(), WanVideoUnit_VACE(), - WanVideoUnit_AnimateVideoSplit(), WanVideoUnit_AnimatePoseLatents(), WanVideoUnit_AnimateFacePixelValues(), WanVideoUnit_AnimateInpaint(), @@ -351,12 +350,15 @@ def process(self, pipe: WanVideoPipeline, height, width, num_frames): class WanVideoUnit_NoiseInitializer(PipelineUnit): def __init__(self): super().__init__( - input_params=("height", "width", "num_frames", "seed", "rand_device", "vace_reference_image"), + input_params=("height", "width", "num_frames", "seed", "rand_device", "vace_reference_image", "input_image", "animate_pose_video"), output_params=("noise",) ) - def process(self, pipe: WanVideoPipeline, height, width, num_frames, seed, rand_device, vace_reference_image): + def process(self, pipe: WanVideoPipeline, height, width, num_frames, seed, rand_device, vace_reference_image, input_image, animate_pose_video): length = (num_frames - 1) // 4 + 1 + # For wan-animate, input_image is a single reference frame; align time dimension. + if input_image is not None and animate_pose_video is not None: + length += 1 if vace_reference_image is not None: f = len(vace_reference_image) if isinstance(vace_reference_image, list) else 1 length += f @@ -371,12 +373,12 @@ def process(self, pipe: WanVideoPipeline, height, width, num_frames, seed, rand_ class WanVideoUnit_InputVideoEmbedder(PipelineUnit): def __init__(self): super().__init__( - input_params=("input_video", "noise", "tiled", "tile_size", "tile_stride", "vace_reference_image"), + input_params=("input_video", "noise", "tiled", "tile_size", "tile_stride", "vace_reference_image", "input_image", "animate_pose_video"), output_params=("latents", "input_latents"), onload_model_names=("vae",) ) - def process(self, pipe: WanVideoPipeline, input_video, noise, tiled, tile_size, tile_stride, vace_reference_image): + def process(self, pipe: WanVideoPipeline, input_video, noise, tiled, tile_size, tile_stride, vace_reference_image, input_image, animate_pose_video): if input_video is None: return {"latents": noise} pipe.load_models_to_device(self.onload_model_names) @@ -388,6 +390,11 @@ def process(self, pipe: WanVideoPipeline, input_video, noise, tiled, tile_size, vace_reference_image = pipe.preprocess_video(vace_reference_image) vace_reference_latents = pipe.vae.encode(vace_reference_image, device=pipe.device).to(dtype=pipe.torch_dtype, device=pipe.device) input_latents = torch.concat([vace_reference_latents, input_latents], dim=2) + # For wan-animate, prepend the single reference frame latent + if input_image is not None and animate_pose_video is not None: + input_image = pipe.preprocess_video([input_image]) + input_image_latents = pipe.vae.encode(input_image, device=pipe.device).to(dtype=pipe.torch_dtype, device=pipe.device) + input_latents = torch.concat([input_image_latents, input_latents], dim=2) if pipe.scheduler.training: return 
{"latents": noise, "input_latents": input_latents} else: @@ -903,27 +910,6 @@ def process(self, pipe: WanVideoPipeline, latents, motion_latents, drop_motion_f return {"latents": latents} -class WanVideoUnit_AnimateVideoSplit(PipelineUnit): - def __init__(self): - super().__init__( - input_params=("input_video", "animate_pose_video", "animate_face_video", "animate_inpaint_video", "animate_mask_video"), - output_params=("animate_pose_video", "animate_face_video", "animate_inpaint_video", "animate_mask_video") - ) - - def process(self, pipe: WanVideoPipeline, input_video, animate_pose_video, animate_face_video, animate_inpaint_video, animate_mask_video): - if input_video is None: - return {} - if animate_pose_video is not None: - animate_pose_video = animate_pose_video[:len(input_video) - 4] - if animate_face_video is not None: - animate_face_video = animate_face_video[:len(input_video) - 4] - if animate_inpaint_video is not None: - animate_inpaint_video = animate_inpaint_video[:len(input_video) - 4] - if animate_mask_video is not None: - animate_mask_video = animate_mask_video[:len(input_video) - 4] - return {"animate_pose_video": animate_pose_video, "animate_face_video": animate_face_video, "animate_inpaint_video": animate_inpaint_video, "animate_mask_video": animate_mask_video} - - class WanVideoUnit_AnimatePoseLatents(PipelineUnit): def __init__(self): super().__init__( From 9e636c88908b16ee883c0108d708b9b3ecaeee4a Mon Sep 17 00:00:00 2001 From: zwplus <158684334@qq.com> Date: Sun, 8 Mar 2026 01:56:20 +0800 Subject: [PATCH 2/8] Set wan-animate examples to 77 frames --- .../model_inference/Wan2.2-Animate-14B.py | 21 ++++++++++--------- .../Wan2.2-Animate-14B.py | 21 ++++++++++--------- .../validate_full/Wan2.2-Animate-14B.py | 10 ++++----- .../validate_lora/Wan2.2-Animate-14B.py | 10 ++++----- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py index d435b688f..e80b0638e 100644 --- a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py @@ -26,15 +26,15 @@ # Animate input_image = Image.open("data/examples/wan/animate/animate_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:77] video = pipe( - prompt="视频中的人在做动作", + prompt="视频中的人在做动?, seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, - num_frames=81, height=720, width=1280, + num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5) @@ -44,19 +44,20 @@ lora_state_dict = load_state_dict("models/Wan-AI/Wan2.2-Animate-14B/relighting_lora.ckpt", torch_dtype=torch.bfloat16, device="cuda")["state_dict"] pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:81-4] 
-animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:81-4] -animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:77] +animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:77] +animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:77] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, animate_inpaint_video=animate_inpaint_video, animate_mask_video=animate_mask_video, -    num_frames=81, height=720, width=1280, +    num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_2_Wan2.2-Animate-14B.mp4", fps=15, quality=5) + diff --git a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py index 180482c14..be1431a7b 100644 --- a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py @@ -37,15 +37,15 @@ # Animate input_image = Image.open("data/examples/wan/animate/animate_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:77] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=81, height=720, width=1280, +    num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5) @@ -56,19 +56,20 @@ lora_state_dict = {i: lora_state_dict[i].to(torch.bfloat16) for i in lora_state_dict} pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:81-4] -animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:81-4] -animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:77] +animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:77] +animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:77] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video,
animate_inpaint_video=animate_inpaint_video, animate_mask_video=animate_mask_video, -    num_frames=81, height=720, width=1280, +    num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_2_Wan2.2-Animate-14B.mp4", fps=15, quality=5) + diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py index 0cdce0656..4a2806b9e 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py @@ -19,15 +19,15 @@ pipe.animate_adapter.load_state_dict(state_dict, strict=False) input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:77] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=81, height=480, width=832, +    num_frames=77, height=480, width=832, num_inference_steps=20, cfg_scale=1, ) -save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) \ No newline at end of file +save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py index 79326cd08..71c7b37b3 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py @@ -18,15 +18,15 @@ pipe.load_lora(pipe.dit, "models/train/Wan2.2-Animate-14B_lora/epoch-4.safetensors", alpha=1) input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:77] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=81, height=480, width=832, +    num_frames=77, height=480, width=832, num_inference_steps=20, cfg_scale=1, ) -save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) \ No newline at end of file +save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) From 48a6826e53c0f4f415f5382e9054b36d4d3dd486 Mon Sep 17 00:00:00 2001 From: zwplus <158684334@qq.com> Date: Sun, 8 Mar 2026 01:57:24 +0800 Subject: [PATCH 3/8] Revert "Set wan-animate examples to 77 frames" This reverts commit
9e636c88908b16ee883c0108d708b9b3ecaeee4a. --- .../model_inference/Wan2.2-Animate-14B.py | 21 +++++++++---------- .../Wan2.2-Animate-14B.py | 21 +++++++++---------- .../validate_full/Wan2.2-Animate-14B.py | 10 ++++----- .../validate_lora/Wan2.2-Animate-14B.py | 10 ++++----- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py index e80b0638e..d435b688f 100644 --- a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py @@ -26,15 +26,15 @@ # Animate input_image = Image.open("data/examples/wan/animate/animate_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:77] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:77] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:81-4] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:81-4] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=77, height=720, width=1280, +    num_frames=81, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5) @@ -44,20 +44,19 @@ lora_state_dict = load_state_dict("models/Wan-AI/Wan2.2-Animate-14B/relighting_lora.ckpt", torch_dtype=torch.bfloat16, device="cuda")["state_dict"] pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:77] -animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:77] -animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:77] -animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:77] +animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] +animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:81-4] +animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:81-4] +animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:81-4] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, animate_inpaint_video=animate_inpaint_video, animate_mask_video=animate_mask_video, -    num_frames=77, height=720, width=1280, +    num_frames=81, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_2_Wan2.2-Animate-14B.mp4", fps=15, quality=5) - diff --git a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py index be1431a7b..180482c14 100644 --- a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py @@ -37,15 +37,15 @@ # Animate input_image = Image.open("data/examples/wan/animate/animate_input_image.png") -animate_pose_video =
VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:77] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:77] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:81-4] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:81-4] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=77, height=720, width=1280, +    num_frames=81, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5) @@ -56,20 +56,19 @@ lora_state_dict = {i: lora_state_dict[i].to(torch.bfloat16) for i in lora_state_dict} pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:77] -animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:77] -animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:77] -animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:77] +animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] +animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:81-4] +animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:81-4] +animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:81-4] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, animate_inpaint_video=animate_inpaint_video, animate_mask_video=animate_mask_video, -    num_frames=77, height=720, width=1280, +    num_frames=81, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_2_Wan2.2-Animate-14B.mp4", fps=15, quality=5) - diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py index 4a2806b9e..0cdce0656 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py @@ -19,15 +19,15 @@ pipe.animate_adapter.load_state_dict(state_dict, strict=False) input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:77] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:77] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:81-4] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=77, height=480, width=832, +    num_frames=81, height=480, width=832,
num_inference_steps=20, cfg_scale=1, ) -save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) +save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) \ No newline at end of file diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py index 71c7b37b3..79326cd08 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py @@ -18,15 +18,15 @@ pipe.load_lora(pipe.dit, "models/train/Wan2.2-Animate-14B_lora/epoch-4.safetensors", alpha=1) input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:77] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:77] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:81-4] video = pipe( -    prompt="视频中的人在做动作", +    prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, -    num_frames=77, height=480, width=832, +    num_frames=81, height=480, width=832, num_inference_steps=20, cfg_scale=1, ) -save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) +save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) \ No newline at end of file From 42c6903b326a40159c7b618e25f097dea69e0fb7 Mon Sep 17 00:00:00 2001 From: zwplus <158684334@qq.com> Date: Sun, 8 Mar 2026 02:05:21 +0800 Subject: [PATCH 4/8] Use 77 frames in wan-animate examples --- .../model_inference/Wan2.2-Animate-14B.py | 19 ++++++++++--------- .../Wan2.2-Animate-14B.py | 19 ++++++++++--------- .../validate_full/Wan2.2-Animate-14B.py | 11 ++++++----- .../validate_lora/Wan2.2-Animate-14B.py | 11 ++++++----- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py index d435b688f..05038bbe6 100644 --- a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.core import load_state_dict from diffsynth.utils.data import save_video, VideoData @@ -26,15 +26,15 @@ # Animate input_image = Image.open("data/examples/wan/animate/animate_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:77] video = pipe( prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, - num_frames=81, height=720, width=1280, + num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5) @@ -44,10 +44,10 @@
lora_state_dict = load_state_dict("models/Wan-AI/Wan2.2-Animate-14B/relighting_lora.ckpt", torch_dtype=torch.bfloat16, device="cuda")["state_dict"] pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:81-4] -animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:81-4] -animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:77] +animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:77] +animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:77] video = pipe( prompt="视频中的人在做动作", seed=0, tiled=True, @@ -56,7 +56,8 @@ animate_face_video=animate_face_video, animate_inpaint_video=animate_inpaint_video, animate_mask_video=animate_mask_video, - num_frames=81, height=720, width=1280, + num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_2_Wan2.2-Animate-14B.mp4", fps=15, quality=5) + diff --git a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py index 180482c14..fa0c9eb49 100644 --- a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.core import load_state_dict from diffsynth.utils.data import save_video, VideoData @@ -37,15 +37,15 @@ # Animate input_image = Image.open("data/examples/wan/animate/animate_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4").raw_data()[:77] video = pipe( prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, - num_frames=81, height=720, width=1280, + num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5) @@ -56,10 +56,10 @@ lora_state_dict = {i: lora_state_dict[i].to(torch.bfloat16) for i in lora_state_dict} pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") -animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:81-4] -animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:81-4] -animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:81-4] +animate_pose_video = 
VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/replace_face_video.mp4").raw_data()[:77] +animate_inpaint_video = VideoData("data/examples/wan/animate/replace_inpaint_video.mp4").raw_data()[:77] +animate_mask_video = VideoData("data/examples/wan/animate/replace_mask_video.mp4").raw_data()[:77] video = pipe( prompt="视频中的人在做动作", seed=0, tiled=True, @@ -68,7 +68,8 @@ animate_face_video=animate_face_video, animate_inpaint_video=animate_inpaint_video, animate_mask_video=animate_mask_video, - num_frames=81, height=720, width=1280, + num_frames=77, height=720, width=1280, num_inference_steps=20, cfg_scale=1, ) save_video(video, "video_2_Wan2.2-Animate-14B.mp4", fps=15, quality=5) + diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py index 0cdce0656..4d877f09a 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.utils.data import save_video, VideoData from diffsynth.core import load_state_dict @@ -19,15 +19,16 @@ pipe.animate_adapter.load_state_dict(state_dict, strict=False) input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:77] +animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:77] video = pipe( prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, - num_frames=81, height=480, width=832, + num_frames=77, height=480, width=832, num_inference_steps=20, cfg_scale=1, ) -save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) \ No newline at end of file +save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) + diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py index 79326cd08..a10d39a96 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.utils.data import save_video, VideoData from diffsynth.core import load_state_dict @@ -18,15 +18,16 @@ pipe.load_lora(pipe.dit, "models/train/Wan2.2-Animate-14B_lora/epoch-4.safetensors", alpha=1) input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] -animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] -animate_face_video = VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:81-4] +animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:77] +animate_face_video = 
VideoData("data/examples/wan/animate/animate_face_video.mp4", height=512, width=512).raw_data()[:77] video = pipe( prompt="视频中的人在做动作", seed=0, tiled=True, input_image=input_image, animate_pose_video=animate_pose_video, animate_face_video=animate_face_video, - num_frames=81, height=480, width=832, + num_frames=77, height=480, width=832, num_inference_steps=20, cfg_scale=1, ) -save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) \ No newline at end of file +save_video(video, "video_Wan2.2-Animate-14B.mp4", fps=15, quality=5) + From 1304e6f3ab416ffc318e721a2f05e26c0316c86a Mon Sep 17 00:00:00 2001 From: zhangwei <58248255+zwplus@users.noreply.github.com> Date: Sun, 8 Mar 2026 02:30:11 +0800 Subject: [PATCH 5/8] Update examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py index fa0c9eb49..fb6176ba2 100644 --- a/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference_low_vram/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.core import load_state_dict from diffsynth.utils.data import save_video, VideoData From 91ae54f8431d8c6c1d3381b9fa515b9934e2df26 Mon Sep 17 00:00:00 2001 From: zhangwei <58248255+zwplus@users.noreply.github.com> Date: Sun, 8 Mar 2026 02:31:06 +0800 Subject: [PATCH 6/8] Update examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py index 4d877f09a..2d965d45f 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.utils.data import save_video, VideoData from diffsynth.core import load_state_dict From d5b7ad79377512ba6e9aa5496bcf897539b2462e Mon Sep 17 00:00:00 2001 From: zhangwei <58248255+zwplus@users.noreply.github.com> Date: Sun, 8 Mar 2026 02:31:16 +0800 Subject: [PATCH 7/8] Update examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py index a10d39a96..162ca0014 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.utils.data import save_video, VideoData from diffsynth.core import load_state_dict From 99b188739853cd63d5c4780d582ef77bd3710dd9 Mon Sep 17 00:00:00 2001 
From: zwplus <158684334@qq.com> Date: Sun, 8 Mar 2026 02:34:32 +0800 Subject: [PATCH 8/8] Remove UTF-8 BOM from wan-animate examples --- examples/wanvideo/model_inference/Wan2.2-Animate-14B.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py index 05038bbe6..0dedeb7f3 100644 --- a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py @@ -1,4 +1,4 @@ -import torch +import torch from PIL import Image from diffsynth.core import load_state_dict from diffsynth.utils.data import save_video, VideoData
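
Note on the frame arithmetic behind this series: Wan's causal VAE maps num_frames pixel frames to (num_frames - 1) // 4 + 1 latent frames (this is the length formula in WanVideoUnit_NoiseInitializer, PATCH 1/8). The old examples generated 81 frames and trimmed the control videos to 81 - 4 = 77 frames; after PATCH 1/8 the examples pass num_frames=77 directly and the pipeline reserves one extra latent frame for the single reference image (the `length += 1` branch), so the total latent length works out the same. A minimal sketch of the accounting, assuming only the division rule above; the helper name is illustrative, not part of the pipeline:

    def latent_length(num_frames: int) -> int:
        # Wan VAE temporal compression: the first pixel frame yields one
        # latent frame, then every 4 further pixel frames add one more.
        return (num_frames - 1) // 4 + 1

    # Before this series: 81-frame noise, control videos trimmed to 81 - 4 = 77.
    assert latent_length(81) == 21

    # After: 77 generated frames plus one prepended reference-image latent
    # (the `length += 1` branch in WanVideoUnit_NoiseInitializer), same total.
    assert latent_length(77) + 1 == 21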