Skip to content

Commit f1b70c2

Browse files
authored
supports wan2.2 (#127)
* supports wan2.2 * revert cpu offload * fix loading configs * update README
1 parent 45ab89b commit f1b70c2

25 files changed

Lines changed: 919 additions & 205 deletions

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ and offloading strategies, enabling loading of larger diffusion models (e.g., Fl
2121

2222
- **Cross-Platform Support:** Runnable on Windows, macOS (Apple Silicon), and Linux, ensuring a smooth experience across different operating systems.
2323

24+
## News
25+
26+
- **[v0.4.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.0)** | **August 1, 2025**:
27+
- 🔥 Supports the [Wan2.2](https://modelscope.cn/collections/tongyiwanxiang-22--shipinshengcheng-2bb5b1adef2840) video generation model
28+
- ⚠️ [**Breaking Change**] Improved pipeline initialization via the `from_pretrained` method
29+
2430
## Quick Start
2531
### Requirements
2632

diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,12 @@ def initialize(self, init_latents, timesteps, sigmas, mask=None):
99
self.mask = mask
1010

1111
def step(self, latents, model_outputs, i):
12-
if self.mask is not None:
13-
model_outputs = model_outputs * self.mask + self.init_latents * (1 - self.mask)
14-
1512
dt = self.sigmas[i + 1] - self.sigmas[i]
1613
latents = latents.to(dtype=torch.float32)
1714
latents = latents + model_outputs * dt
1815
latents = latents.to(dtype=model_outputs.dtype)
16+
if self.mask is not None:
17+
latents = latents * self.mask + self.init_latents * (1 - self.mask)
1918
return latents
2019

2120
def add_noise(self, latents, noise, sigma):

diffsynth_engine/conf/models/wan/dit/14b-i2v.json renamed to diffsynth_engine/conf/models/wan/dit/wan2.1-flf2v-14b.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
{
2-
"has_image_input": true,
2+
"has_clip_feature": true,
3+
"has_vae_feature": true,
4+
"flf_pos_emb": true,
35
"patch_size": [1, 2, 2],
46
"in_dim": 36,
57
"dim": 5120,
@@ -9,5 +11,6 @@
911
"out_dim": 16,
1012
"num_heads": 40,
1113
"num_layers": 40,
12-
"eps": 1e-6
14+
"eps": 1e-6,
15+
"shift": 16.0
1316
}

diffsynth_engine/conf/models/wan/dit/14b-flf2v.json renamed to diffsynth_engine/conf/models/wan/dit/wan2.1-i2v-14b.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"has_image_input": true,
3-
"flf_pos_emb": true,
2+
"has_clip_feature": true,
3+
"has_vae_feature": true,
44
"patch_size": [1, 2, 2],
55
"in_dim": 36,
66
"dim": 5120,

diffsynth_engine/conf/models/wan/dit/1.3b-t2v.json renamed to diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-1.3b.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
{
2-
"has_image_input": false,
32
"patch_size": [1, 2, 2],
43
"in_dim": 16,
54
"dim": 1536,

diffsynth_engine/conf/models/wan/dit/14b-t2v.json renamed to diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-14b.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
{
2-
"has_image_input": false,
32
"patch_size": [1, 2, 2],
43
"in_dim": 16,
54
"dim": 5120,
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"has_vae_feature": true,
3+
"patch_size": [1, 2, 2],
4+
"in_dim": 36,
5+
"dim": 5120,
6+
"ffn_dim": 13824,
7+
"freq_dim": 256,
8+
"text_dim": 4096,
9+
"out_dim": 16,
10+
"num_heads": 40,
11+
"num_layers": 40,
12+
"eps": 1e-6,
13+
"boundary": 0.900,
14+
"cfg_scale": [3.5, 3.5],
15+
"num_inference_steps": 40
16+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"patch_size": [1, 2, 2],
3+
"in_dim": 16,
4+
"dim": 5120,
5+
"ffn_dim": 13824,
6+
"freq_dim": 256,
7+
"text_dim": 4096,
8+
"out_dim": 16,
9+
"num_heads": 40,
10+
"num_layers": 40,
11+
"eps": 1e-6,
12+
"boundary": 0.875,
13+
"shift": 12.0,
14+
"cfg_scale": [3.0, 4.0],
15+
"num_inference_steps": 40
16+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"fuse_image_latents": true,
3+
"patch_size": [1, 2, 2],
4+
"in_dim": 48,
5+
"dim": 3072,
6+
"ffn_dim": 14336,
7+
"freq_dim": 256,
8+
"text_dim": 4096,
9+
"out_dim": 48,
10+
"num_heads": 24,
11+
"num_layers": 30,
12+
"eps": 1e-6,
13+
"fps": 24
14+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
{
2+
"in_channels": 3,
3+
"out_channels": 3,
4+
"encoder_dim": 96,
5+
"decoder_dim": 96,
6+
"z_dim": 16,
7+
"dim_mult": [1, 2, 4, 4],
8+
"num_res_blocks": 2,
9+
"temperal_downsample": [false, true, true],
10+
"dropout": 0.0,
11+
"patch_size": 1,
12+
"mean": [
13+
-0.7571,
14+
-0.7089,
15+
-0.9113,
16+
0.1075,
17+
-0.1745,
18+
0.9653,
19+
-0.1517,
20+
1.5508,
21+
0.4134,
22+
-0.0715,
23+
0.5517,
24+
-0.3632,
25+
-0.1922,
26+
-0.9497,
27+
0.2503,
28+
-0.2921
29+
],
30+
"std": [
31+
2.8184,
32+
1.4541,
33+
2.3275,
34+
2.6558,
35+
1.2196,
36+
1.7708,
37+
2.6052,
38+
2.0743,
39+
3.2687,
40+
2.1526,
41+
2.8652,
42+
1.5579,
43+
1.6382,
44+
1.1253,
45+
2.8251,
46+
1.9160
47+
]
48+
}

0 commit comments

Comments
 (0)