Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 41 additions & 22 deletions invokeai/app/invocations/qwen_image_denoise.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,29 +353,44 @@ def _run_diffusion(self, context: InvocationContext):
# Pack latents into 2x2 patches: (B, C, H, W) -> (B, H/2*W/2, C*4)
latents = self._pack_latents(latents, 1, out_channels, latent_height, latent_width)

# Pack reference image latents and concatenate along the sequence dimension.
# The edit transformer always expects [noisy_patches ; ref_patches] in its sequence.
if ref_latents is not None:
_, ref_ch, rh, rw = ref_latents.shape
if rh != latent_height or rw != latent_width:
ref_latents = torch.nn.functional.interpolate(
ref_latents, size=(latent_height, latent_width), mode="bilinear"
# Determine whether the model uses reference latent conditioning (zero_cond_t).
# Edit models (zero_cond_t=True) expect [noisy_patches ; ref_patches] in the sequence.
# Txt2img models (zero_cond_t=False) only take noisy patches.
has_zero_cond_t = getattr(transformer_info.model, "zero_cond_t", False) or getattr(
transformer_info.model.config, "zero_cond_t", False
)
use_ref_latents = has_zero_cond_t

ref_latents_packed = None
if use_ref_latents:
if ref_latents is not None:
_, ref_ch, rh, rw = ref_latents.shape
if rh != latent_height or rw != latent_width:
ref_latents = torch.nn.functional.interpolate(
ref_latents, size=(latent_height, latent_width), mode="bilinear"
)
else:
# No reference image provided — use zeros so the model still gets the
# expected sequence layout.
ref_latents = torch.zeros(
1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
)
ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)

# img_shapes tells the transformer the spatial layout of patches.
if use_ref_latents:
img_shapes = [
[
(1, latent_height // 2, latent_width // 2),
(1, latent_height // 2, latent_width // 2),
]
]
else:
# No reference image provided — use zeros so the model still gets the
# expected sequence layout.
ref_latents = torch.zeros(
1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
)
ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)

# img_shapes tells the transformer the spatial layout of noisy and reference patches.
img_shapes = [
[
(1, latent_height // 2, latent_width // 2),
(1, latent_height // 2, latent_width // 2),
img_shapes = [
[
(1, latent_height // 2, latent_width // 2),
]
]
]

# Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
inpaint_mask = self._prep_inpaint_mask(context, noise) # noise has the right 4D shape
Expand Down Expand Up @@ -428,8 +443,12 @@ def _run_diffusion(self, context: InvocationContext):
# The pipeline passes timestep / 1000 to the transformer
timestep = t.expand(latents.shape[0]).to(inference_dtype)

# Concatenate noisy and reference patches along the sequence dim
model_input = torch.cat([latents, ref_latents_packed], dim=1)
# For edit models: concatenate noisy and reference patches along the sequence dim
# For txt2img models: just use noisy patches
if ref_latents_packed is not None:
model_input = torch.cat([latents, ref_latents_packed], dim=1)
else:
model_input = latents

noise_pred_cond = transformer(
hidden_states=model_input,
Expand Down
49 changes: 35 additions & 14 deletions invokeai/app/invocations/qwen_image_text_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,45 @@
QwenImageConditioningInfo,
)

# The Qwen Image Edit pipeline uses a specific system prompt and drops the first
# N tokens (the system prompt prefix) from the embeddings. These constants are
# taken directly from the diffusers QwenImagePipeline.
_SYSTEM_PROMPT = (
# Prompt templates and drop indices for the two Qwen Image model modes.
# These are taken directly from the diffusers pipelines.

# Image editing mode (QwenImageEditPipeline)
_EDIT_SYSTEM_PROMPT = (
"Describe the key features of the input image (color, shape, size, texture, objects, background), "
"then explain how the user's text instruction should alter or modify the image. "
"Generate a new image that meets the user's requirements while maintaining consistency "
"with the original input where appropriate."
)
_EDIT_DROP_IDX = 64

# Text-to-image mode (QwenImagePipeline)
_GENERATE_SYSTEM_PROMPT = (
"Describe the image by detailing the color, shape, size, texture, quantity, "
"text, spatial relationships of the objects and background:"
)
_GENERATE_DROP_IDX = 34

_IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
_DROP_IDX = 64


def _build_prompt(user_prompt: str, num_images: int) -> str:
"""Build the full prompt with one vision placeholder per reference image."""
image_tokens = _IMAGE_PLACEHOLDER * max(num_images, 1)
return (
f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n"
f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n"
"<|im_start|>assistant\n"
)
"""Build the full prompt with the appropriate template based on whether reference images are provided."""
if num_images > 0:
# Edit mode: include vision placeholders for reference images
image_tokens = _IMAGE_PLACEHOLDER * num_images
return (
f"<|im_start|>system\n{_EDIT_SYSTEM_PROMPT}<|im_end|>\n"
f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n"
"<|im_start|>assistant\n"
)
else:
# Generate mode: text-only prompt
return (
f"<|im_start|>system\n{_GENERATE_SYSTEM_PROMPT}<|im_end|>\n"
f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
"<|im_start|>assistant\n"
)


@invocation(
Expand Down Expand Up @@ -188,15 +206,18 @@ def _encode(
hidden_states = outputs.hidden_states[-1]

# Extract valid (non-padding) tokens using the attention mask,
# then drop the first _DROP_IDX tokens (system prompt prefix).
# then drop the system prompt prefix tokens.
# The drop index differs between edit mode (64) and generate mode (34).
drop_idx = _EDIT_DROP_IDX if images else _GENERATE_DROP_IDX

attn_mask = model_inputs.attention_mask
bool_mask = attn_mask.bool()
valid_lengths = bool_mask.sum(dim=1)
selected = hidden_states[bool_mask]
split_hidden = torch.split(selected, valid_lengths.tolist(), dim=0)

# Drop system prefix tokens and build padded output
trimmed = [h[_DROP_IDX:] for h in split_hidden]
trimmed = [h[drop_idx:] for h in split_hidden]
attn_mask_list = [torch.ones(h.size(0), dtype=torch.long, device=device) for h in trimmed]
max_seq_len = max(h.size(0) for h in trimmed)

Expand Down
10 changes: 8 additions & 2 deletions invokeai/app/services/model_records/model_records_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
ModelSourceType,
ModelType,
ModelVariantType,
QwenImageVariantType,
Qwen3VariantType,
QwenImageVariantType,
SchedulerPredictionType,
ZImageVariantType,
)
Expand Down Expand Up @@ -95,7 +95,13 @@ class ModelRecordChanges(BaseModelExcludeNull):
# Checkpoint-specific changes
# TODO(MM2): Should we expose these? Feels footgun-y...
variant: Optional[
ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType
ModelVariantType
| ClipVariantType
| FluxVariantType
| Flux2VariantType
| ZImageVariantType
| QwenImageVariantType
| Qwen3VariantType
] = Field(description="The variant of the model.", default=None)
prediction_type: Optional[SchedulerPredictionType] = Field(
description="The prediction type of the model.", default=None
Expand Down
26 changes: 22 additions & 4 deletions invokeai/backend/model_manager/configs/lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,14 +775,24 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None:
state_dict,
{"lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", "dora_scale"},
)
# Must NOT have diffusion_model.layers (Z-Image) or double_blocks/single_blocks (Flux)
# Must NOT have diffusion_model.layers (Z-Image) or Flux-style keys.
# Flux LoRAs can contain transformer.transformer_blocks.* keys, which also satisfy the Qwen key
# check, so the Flux-only single_transformer_blocks.* keys (with or without the "transformer."
# prefix) are used to rule them out.
has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."})
has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."})
has_flux_keys = state_dict_has_any_keys_starting_with(
state_dict,
{
"double_blocks.",
"single_blocks.",
"single_transformer_blocks.",
"transformer.single_transformer_blocks.",
},
)

if has_qwen_ie_keys and has_lora_suffix and not has_z_image_keys and not has_flux_keys:
return

raise NotAMatchError("model does not match Qwen Image Edit LoRA heuristics")
raise NotAMatchError("model does not match Qwen Image LoRA heuristics")

@classmethod
def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType:
Expand All @@ -791,7 +801,15 @@ def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType:
state_dict, {"transformer_blocks.", "transformer.transformer_blocks."}
)
has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."})
has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."})
has_flux_keys = state_dict_has_any_keys_starting_with(
state_dict,
{
"double_blocks.",
"single_blocks.",
"single_transformer_blocks.",
"transformer.single_transformer_blocks.",
},
)

if has_qwen_ie_keys and not has_z_image_keys and not has_flux_keys:
return BaseModelType.QwenImage
Expand Down
4 changes: 2 additions & 2 deletions invokeai/backend/model_manager/configs/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1208,7 +1208,7 @@ class Main_Diffusers_QwenImage_Config(Diffusers_Config_Base, Main_Config_Base, C
"""Model config for Qwen Image diffusers models (both txt2img and edit)."""

base: Literal[BaseModelType.QwenImage] = Field(BaseModelType.QwenImage)
variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate)
variant: QwenImageVariantType | None = Field(default=None)

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
Expand Down Expand Up @@ -1269,7 +1269,7 @@ class Main_GGUF_QwenImage_Config(Checkpoint_Config_Base, Main_Config_Base, Confi

base: Literal[BaseModelType.QwenImage] = Field(default=BaseModelType.QwenImage)
format: Literal[ModelFormat.GGUFQuantized] = Field(default=ModelFormat.GGUFQuantized)
variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate)
variant: QwenImageVariantType | None = Field(default=None)

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
BaseModelType,
ModelFormat,
ModelType,
QwenImageVariantType,
SubModelType,
)
from invokeai.backend.quantization.gguf.ggml_tensor import GGMLTensor
Expand Down Expand Up @@ -160,10 +161,13 @@ def _load_from_singlefile(self, config: AnyModelConfig) -> AnyModel:
"axes_dims_rope": (16, 56, 56),
}

# zero_cond_t was added in diffusers 0.37+; skip it on older versions
# zero_cond_t is only used by edit-variant models. It enables dual modulation
# for noisy vs reference patches. Setting it on txt2img models produces garbage.
# Also requires diffusers 0.37+ (the parameter doesn't exist in older versions).
import inspect

if "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters:
is_edit = getattr(config, "variant", None) == QwenImageVariantType.Edit
if is_edit and "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters:
model_config["zero_cond_t"] = True

with accelerate.init_empty_weights():
Expand Down
Loading
Loading