diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py
index cd3ff917596..4b9fb207680 100644
--- a/invokeai/app/invocations/qwen_image_denoise.py
+++ b/invokeai/app/invocations/qwen_image_denoise.py
@@ -353,29 +353,44 @@ def _run_diffusion(self, context: InvocationContext):
         # Pack latents into 2x2 patches: (B, C, H, W) -> (B, H/2*W/2, C*4)
         latents = self._pack_latents(latents, 1, out_channels, latent_height, latent_width)

-        # Pack reference image latents and concatenate along the sequence dimension.
-        # The edit transformer always expects [noisy_patches ; ref_patches] in its sequence.
-        if ref_latents is not None:
-            _, ref_ch, rh, rw = ref_latents.shape
-            if rh != latent_height or rw != latent_width:
-                ref_latents = torch.nn.functional.interpolate(
-                    ref_latents, size=(latent_height, latent_width), mode="bilinear"
+        # Determine whether the model uses reference latent conditioning (zero_cond_t).
+        # Edit models (zero_cond_t=True) expect [noisy_patches ; ref_patches] in the sequence.
+        # Txt2img models (zero_cond_t=False) only take noisy patches.
+        has_zero_cond_t = getattr(transformer_info.model, "zero_cond_t", False) or getattr(
+            transformer_info.model.config, "zero_cond_t", False
+        )
+        use_ref_latents = has_zero_cond_t
+
+        ref_latents_packed = None
+        if use_ref_latents:
+            if ref_latents is not None:
+                _, ref_ch, rh, rw = ref_latents.shape
+                if rh != latent_height or rw != latent_width:
+                    ref_latents = torch.nn.functional.interpolate(
+                        ref_latents, size=(latent_height, latent_width), mode="bilinear"
+                    )
+            else:
+                # No reference image provided — use zeros so the model still gets the
+                # expected sequence layout.
+                ref_latents = torch.zeros(
+                    1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
                 )
+            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
+
+        # img_shapes tells the transformer the spatial layout of patches.
+        if use_ref_latents:
+            img_shapes = [
+                [
+                    (1, latent_height // 2, latent_width // 2),
+                    (1, latent_height // 2, latent_width // 2),
+                ]
+            ]
         else:
-            # No reference image provided — use zeros so the model still gets the
-            # expected sequence layout.
-            ref_latents = torch.zeros(
-                1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
-            )
-        ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
-
-        # img_shapes tells the transformer the spatial layout of noisy and reference patches.
-        img_shapes = [
-            [
-                (1, latent_height // 2, latent_width // 2),
-                (1, latent_height // 2, latent_width // 2),
+            img_shapes = [
+                [
+                    (1, latent_height // 2, latent_width // 2),
+                ]
             ]
-        ]

         # Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
         inpaint_mask = self._prep_inpaint_mask(context, noise)  # noise has the right 4D shape
@@ -428,8 +443,12 @@ def _run_diffusion(self, context: InvocationContext):
             # The pipeline passes timestep / 1000 to the transformer
             timestep = t.expand(latents.shape[0]).to(inference_dtype)

-            # Concatenate noisy and reference patches along the sequence dim
-            model_input = torch.cat([latents, ref_latents_packed], dim=1)
+            # For edit models: concatenate noisy and reference patches along the sequence dim
+            # For txt2img models: just use noisy patches
+            if ref_latents_packed is not None:
+                model_input = torch.cat([latents, ref_latents_packed], dim=1)
+            else:
+                model_input = latents

             noise_pred_cond = transformer(
                 hidden_states=model_input,
diff --git a/invokeai/app/invocations/qwen_image_text_encoder.py b/invokeai/app/invocations/qwen_image_text_encoder.py
index 641e8c4d388..9e3f5723ba5 100644
--- a/invokeai/app/invocations/qwen_image_text_encoder.py
+++ b/invokeai/app/invocations/qwen_image_text_encoder.py
@@ -20,27 +20,45 @@
     QwenImageConditioningInfo,
 )

-# The Qwen Image Edit pipeline uses a specific system prompt and drops the first
-# N tokens (the system prompt prefix) from the embeddings. These constants are
-# taken directly from the diffusers QwenImagePipeline.
-_SYSTEM_PROMPT = (
+# Prompt templates and drop indices for the two Qwen Image model modes.
+# These are taken directly from the diffusers pipelines.
+
+# Image editing mode (QwenImagePipeline)
+_EDIT_SYSTEM_PROMPT = (
     "Describe the key features of the input image (color, shape, size, texture, objects, background), "
     "then explain how the user's text instruction should alter or modify the image. "
     "Generate a new image that meets the user's requirements while maintaining consistency "
     "with the original input where appropriate."
 )
+_EDIT_DROP_IDX = 64
+
+# Text-to-image mode (QwenImagePipeline)
+_GENERATE_SYSTEM_PROMPT = (
+    "Describe the image by detailing the color, shape, size, texture, quantity, "
+    "text, spatial relationships of the objects and background:"
+)
+_GENERATE_DROP_IDX = 34
+
 _IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
-_DROP_IDX = 64


 def _build_prompt(user_prompt: str, num_images: int) -> str:
-    """Build the full prompt with one vision placeholder per reference image."""
-    image_tokens = _IMAGE_PLACEHOLDER * max(num_images, 1)
-    return (
-        f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n"
-        f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n"
-        "<|im_start|>assistant\n"
-    )
+    """Build the full prompt with the appropriate template based on whether reference images are provided."""
+    if num_images > 0:
+        # Edit mode: include vision placeholders for reference images
+        image_tokens = _IMAGE_PLACEHOLDER * num_images
+        return (
+            f"<|im_start|>system\n{_EDIT_SYSTEM_PROMPT}<|im_end|>\n"
+            f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+    else:
+        # Generate mode: text-only prompt
+        return (
+            f"<|im_start|>system\n{_GENERATE_SYSTEM_PROMPT}<|im_end|>\n"
+            f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )


 @invocation(
@@ -188,7 +206,10 @@ def _encode(
         hidden_states = outputs.hidden_states[-1]

         # Extract valid (non-padding) tokens using the attention mask,
-        # then drop the first _DROP_IDX tokens (system prompt prefix).
+        # then drop the system prompt prefix tokens.
+        # The drop index differs between edit mode (64) and generate mode (34).
+        drop_idx = _EDIT_DROP_IDX if images else _GENERATE_DROP_IDX
+
         attn_mask = model_inputs.attention_mask
         bool_mask = attn_mask.bool()
         valid_lengths = bool_mask.sum(dim=1)
@@ -196,7 +217,7 @@ def _encode(
         split_hidden = torch.split(selected, valid_lengths.tolist(), dim=0)

         # Drop system prefix tokens and build padded output
-        trimmed = [h[_DROP_IDX:] for h in split_hidden]
+        trimmed = [h[drop_idx:] for h in split_hidden]
         attn_mask_list = [torch.ones(h.size(0), dtype=torch.long, device=device) for h in trimmed]

         max_seq_len = max(h.size(0) for h in trimmed)
diff --git a/invokeai/app/services/model_records/model_records_base.py b/invokeai/app/services/model_records/model_records_base.py
index ea5b9ef7546..dcdc0ce5956 100644
--- a/invokeai/app/services/model_records/model_records_base.py
+++ b/invokeai/app/services/model_records/model_records_base.py
@@ -25,8 +25,8 @@
     ModelSourceType,
     ModelType,
     ModelVariantType,
-    QwenImageVariantType,
     Qwen3VariantType,
+    QwenImageVariantType,
     SchedulerPredictionType,
     ZImageVariantType,
 )
@@ -95,7 +95,13 @@ class ModelRecordChanges(BaseModelExcludeNull):
     # Checkpoint-specific changes
     # TODO(MM2): Should we expose these? Feels footgun-y...
     variant: Optional[
-        ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType
+        ModelVariantType
+        | ClipVariantType
+        | FluxVariantType
+        | Flux2VariantType
+        | ZImageVariantType
+        | QwenImageVariantType
+        | Qwen3VariantType
     ] = Field(description="The variant of the model.", default=None)
     prediction_type: Optional[SchedulerPredictionType] = Field(
         description="The prediction type of the model.", default=None
diff --git a/invokeai/backend/model_manager/configs/lora.py b/invokeai/backend/model_manager/configs/lora.py
index a5b9f40631d..f2e6f3b34fa 100644
--- a/invokeai/backend/model_manager/configs/lora.py
+++ b/invokeai/backend/model_manager/configs/lora.py
@@ -775,14 +775,24 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None:
             state_dict,
             {"lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", "dora_scale"},
         )
-        # Must NOT have diffusion_model.layers (Z-Image) or double_blocks/single_blocks (Flux)
+        # Must NOT have diffusion_model.layers (Z-Image) or Flux-style keys.
+        # Flux LoRAs can have transformer.single_transformer_blocks or transformer.transformer_blocks
+        # (with the "transformer." prefix and "single_" variant) which would falsely match our check.
         has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."})
-        has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."})
+        has_flux_keys = state_dict_has_any_keys_starting_with(
+            state_dict,
+            {
+                "double_blocks.",
+                "single_blocks.",
+                "single_transformer_blocks.",
+                "transformer.single_transformer_blocks.",
+            },
+        )

         if has_qwen_ie_keys and has_lora_suffix and not has_z_image_keys and not has_flux_keys:
             return

-        raise NotAMatchError("model does not match Qwen Image Edit LoRA heuristics")
+        raise NotAMatchError("model does not match Qwen Image LoRA heuristics")

     @classmethod
     def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType:
@@ -791,7 +801,15 @@ def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType:
             state_dict, {"transformer_blocks.", "transformer.transformer_blocks."}
         )
         has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."})
-        has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."})
+        has_flux_keys = state_dict_has_any_keys_starting_with(
+            state_dict,
+            {
+                "double_blocks.",
+                "single_blocks.",
+                "single_transformer_blocks.",
+                "transformer.single_transformer_blocks.",
+            },
+        )

         if has_qwen_ie_keys and not has_z_image_keys and not has_flux_keys:
             return BaseModelType.QwenImage
diff --git a/invokeai/backend/model_manager/configs/main.py b/invokeai/backend/model_manager/configs/main.py
index 484a95f4bb8..6ec0611fdf3 100644
--- a/invokeai/backend/model_manager/configs/main.py
+++ b/invokeai/backend/model_manager/configs/main.py
@@ -1208,7 +1208,7 @@ class Main_Diffusers_QwenImage_Config(Diffusers_Config_Base, Main_Config_Base, C
     """Model config for Qwen Image diffusers models (both txt2img and edit)."""

     base: Literal[BaseModelType.QwenImage] = Field(BaseModelType.QwenImage)
-    variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate)
+    variant: QwenImageVariantType | None = Field(default=None)

     @classmethod
     def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
@@ -1269,7 +1269,7 @@ class Main_GGUF_QwenImage_Config(Checkpoint_Config_Base, Main_Config_Base, Confi
     base: Literal[BaseModelType.QwenImage] = Field(default=BaseModelType.QwenImage)
     format: Literal[ModelFormat.GGUFQuantized] = Field(default=ModelFormat.GGUFQuantized)
-    variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate)
+    variant: QwenImageVariantType | None = Field(default=None)

     @classmethod
     def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
diff --git a/invokeai/backend/model_manager/load/model_loaders/qwen_image.py b/invokeai/backend/model_manager/load/model_loaders/qwen_image.py
index 15fcedba166..a025e727945 100644
--- a/invokeai/backend/model_manager/load/model_loaders/qwen_image.py
+++ b/invokeai/backend/model_manager/load/model_loaders/qwen_image.py
@@ -15,6 +15,7 @@
     BaseModelType,
     ModelFormat,
     ModelType,
+    QwenImageVariantType,
     SubModelType,
 )
 from invokeai.backend.quantization.gguf.ggml_tensor import GGMLTensor
@@ -160,10 +161,13 @@ def _load_from_singlefile(self, config: AnyModelConfig) -> AnyModel:
             "axes_dims_rope": (16, 56, 56),
         }

-        # zero_cond_t was added in diffusers 0.37+; skip it on older versions
+        # zero_cond_t is only used by edit-variant models. It enables dual modulation
+        # for noisy vs reference patches. Setting it on txt2img models produces garbage.
+        # Also requires diffusers 0.37+ (the parameter doesn't exist in older versions).
         import inspect

-        if "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters:
+        is_edit = getattr(config, "variant", None) == QwenImageVariantType.Edit
+        if is_edit and "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters:
             model_config["zero_cond_t"] = True

         with accelerate.init_empty_weights():
diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py
index de5f1e1b8b6..ef7b25431a0 100644
--- a/invokeai/backend/model_manager/starter_models.py
+++ b/invokeai/backend/model_manager/starter_models.py
@@ -650,7 +650,7 @@ class StarterModelBundle(BaseModel):
 # endregion

 # region Qwen Image Edit
-qwen_image = StarterModel(
+qwen_image_edit = StarterModel(
     name="Qwen Image Edit 2511",
     base=BaseModelType.QwenImage,
     source="Qwen/Qwen-Image-Edit-2511",
     type=ModelType.Main,
 )

-qwen_image_gguf_q4_k_m = StarterModel(
+qwen_image_edit_gguf_q4_k_m = StarterModel(
     name="Qwen Image Edit 2511 (Q4_K_M)",
     base=BaseModelType.QwenImage,
-    source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q4_K_M.gguf",
+    source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q4_K_M.gguf",
     description="Qwen Image Edit 2511 - Q4_K_M quantized transformer. Good quality/size balance. (~13GB)",
     type=ModelType.Main,
     format=ModelFormat.GGUFQuantized,
 )

-qwen_image_gguf_q2_k = StarterModel(
+qwen_image_edit_gguf_q2_k = StarterModel(
     name="Qwen Image Edit 2511 (Q2_K)",
     base=BaseModelType.QwenImage,
-    source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q2_K.gguf",
+    source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q2_K.gguf",
     description="Qwen Image Edit 2511 - Q2_K heavily quantized transformer. Smallest size, lower quality. (~7.5GB)",
(~7.5GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q6_k = StarterModel( +qwen_image_edit_gguf_q6_k = StarterModel( name="Qwen Image Edit 2511 (Q6_K)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q6_K.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q6_K.gguf", description="Qwen Image Edit 2511 - Q6_K quantized transformer. Near-lossless quality. (~17GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q8_0 = StarterModel( +qwen_image_edit_gguf_q8_0 = StarterModel( name="Qwen Image Edit 2511 (Q8_0)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q8_0.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q8_0.gguf", description="Qwen Image Edit 2511 - Q8_0 quantized transformer. Highest quality quantization. (~22GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_lightning_4step = StarterModel( +qwen_image_edit_lightning_4step = StarterModel( name="Qwen Image Edit Lightning (4-step, bf16)", base=BaseModelType.QwenImage, source="https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning/resolve/main/Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors", @@ -703,7 +703,7 @@ class StarterModelBundle(BaseModel): type=ModelType.LoRA, ) -qwen_image_lightning_8step = StarterModel( +qwen_image_edit_lightning_8step = StarterModel( name="Qwen Image Edit Lightning (8-step, bf16)", base=BaseModelType.QwenImage, source="https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning/resolve/main/Qwen-Image-Edit-2511-Lightning-8steps-V1.0-bf16.safetensors", @@ -711,6 +711,69 @@ class StarterModelBundle(BaseModel): "Settings: Steps=8, CFG=1, Shift Override=3.", type=ModelType.LoRA, ) + +# Qwen Image (txt2img) +qwen_image = StarterModel( + name="Qwen Image 2512", + base=BaseModelType.QwenImage, + source="Qwen/Qwen-Image-2512", + description="Qwen Image 2512 full diffusers model. High-quality text-to-image generation. (~40GB)", + type=ModelType.Main, +) + +qwen_image_gguf_q4_k_m = StarterModel( + name="Qwen Image 2512 (Q4_K_M)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q4_K_M.gguf", + description="Qwen Image 2512 - Q4_K_M quantized transformer. Good quality/size balance. (~13GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q2_k = StarterModel( + name="Qwen Image 2512 (Q2_K)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q2_K.gguf", + description="Qwen Image 2512 - Q2_K heavily quantized transformer. Smallest size, lower quality. (~7.5GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q6_k = StarterModel( + name="Qwen Image 2512 (Q6_K)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q6_K.gguf", + description="Qwen Image 2512 - Q6_K quantized transformer. Near-lossless quality. 
(~17GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q8_0 = StarterModel( + name="Qwen Image 2512 (Q8_0)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q8_0.gguf", + description="Qwen Image 2512 - Q8_0 quantized transformer. Highest quality quantization. (~22GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_lightning_4step = StarterModel( + name="Qwen Image Lightning (4-step, V2.0, bf16)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V2.0-bf16.safetensors", + description="Lightning distillation LoRA for Qwen Image — enables generation in just 4 steps. " + "Settings: Steps=4, CFG=1, Shift Override=3.", + type=ModelType.LoRA, +) + +qwen_image_lightning_8step = StarterModel( + name="Qwen Image Lightning (8-step, V2.0, bf16)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V2.0-bf16.safetensors", + description="Lightning distillation LoRA for Qwen Image — enables generation in 8 steps with better quality. " + "Settings: Steps=8, CFG=1, Shift Override=3.", + type=ModelType.LoRA, +) # endregion # region SigLIP @@ -1012,6 +1075,13 @@ class StarterModelBundle(BaseModel): flux2_klein_qwen3_4b_encoder, flux2_klein_qwen3_8b_encoder, cogview4, + qwen_image_edit, + qwen_image_edit_gguf_q2_k, + qwen_image_edit_gguf_q4_k_m, + qwen_image_edit_gguf_q6_k, + qwen_image_edit_gguf_q8_0, + qwen_image_edit_lightning_4step, + qwen_image_edit_lightning_8step, qwen_image, qwen_image_gguf_q2_k, qwen_image_gguf_q4_k_m, @@ -1097,9 +1167,13 @@ class StarterModelBundle(BaseModel): ] qwen_image_bundle: list[StarterModel] = [ + qwen_image_edit, + qwen_image_edit_gguf_q4_k_m, + qwen_image_edit_gguf_q8_0, + qwen_image_edit_lightning_4step, + qwen_image_edit_lightning_8step, qwen_image, qwen_image_gguf_q4_k_m, - qwen_image_gguf_q8_0, qwen_image_lightning_4step, qwen_image_lightning_8step, ] diff --git a/invokeai/backend/model_manager/taxonomy.py b/invokeai/backend/model_manager/taxonomy.py index 9250310a29a..587c0b0625f 100644 --- a/invokeai/backend/model_manager/taxonomy.py +++ b/invokeai/backend/model_manager/taxonomy.py @@ -225,8 +225,28 @@ class FluxLoRAFormat(str, Enum): AnyVariant: TypeAlias = Union[ - ModelVariantType, ClipVariantType, FluxVariantType, Flux2VariantType, ZImageVariantType, QwenImageVariantType, Qwen3VariantType + ModelVariantType, + ClipVariantType, + FluxVariantType, + Flux2VariantType, + ZImageVariantType, + QwenImageVariantType, + Qwen3VariantType, ] variant_type_adapter = TypeAdapter[ - ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType -](ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType) + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType +]( + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType +) diff --git a/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts b/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts index 3cd28b5f2a0..2027ff41741 100644 --- 
+++ b/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts
@@ -80,11 +80,7 @@ export const selectDefaultControlAdapter = createSelector(

 export const getDefaultRefImageConfig = (
   getState: AppGetState
-):
-  | IPAdapterConfig
-  | FluxKontextReferenceImageConfig
-  | Flux2ReferenceImageConfig
-  | QwenImageReferenceImageConfig => {
+): IPAdapterConfig | FluxKontextReferenceImageConfig | Flux2ReferenceImageConfig | QwenImageReferenceImageConfig => {
   const state = getState();

   const mainModelConfig = selectMainModelConfig(state);
diff --git a/invokeai/frontend/web/src/features/metadata/parsing.tsx b/invokeai/frontend/web/src/features/metadata/parsing.tsx
index 7d1d511a3c2..4f179d6b017 100644
--- a/invokeai/frontend/web/src/features/metadata/parsing.tsx
+++ b/invokeai/frontend/web/src/features/metadata/parsing.tsx
@@ -13,6 +13,9 @@ import {
   kleinVaeModelSelected,
   negativePromptChanged,
   positivePromptChanged,
+  qwenImageComponentSourceSelected,
+  qwenImageQuantizationChanged,
+  qwenImageShiftChanged,
   refinerModelChanged,
   selectBase,
   setCfgRescaleMultiplier,
@@ -677,6 +680,83 @@ const ZImageSeedVarianceRandomizePercent: SingleMetadataHandler = {
 };
 //#endregion ZImageSeedVarianceRandomizePercent

+//#region QwenImageComponentSource
+const QwenImageComponentSource: SingleMetadataHandler = {
+  [SingleMetadataKey]: true,
+  type: 'QwenImageComponentSource',
+  parse: (metadata, _store) => {
+    try {
+      const raw = getProperty(metadata, 'qwen_image_component_source');
+      if (raw === null || raw === undefined) {
+        return Promise.resolve(null);
+      }
+      return Promise.resolve(zModelIdentifierField.parse(raw));
+    } catch {
+      return Promise.resolve(null);
+    }
+  },
+  recall: (value, store) => {
+    store.dispatch(qwenImageComponentSourceSelected(value));
+  },
+  i18nKey: 'modelManager.qwenImageComponentSource',
+  LabelComponent: MetadataLabel,
+  ValueComponent: ({ value }: SingleMetadataValueProps) => (
+
+  ),
+};
+//#endregion QwenImageComponentSource
+
+//#region QwenImageQuantization
+const QwenImageQuantization: SingleMetadataHandler<'none' | 'int8' | 'nf4'> = {
+  [SingleMetadataKey]: true,
+  type: 'QwenImageQuantization',
+  parse: (metadata, _store) => {
+    try {
+      const raw = getProperty(metadata, 'qwen_image_quantization');
+      const parsed = z.enum(['none', 'int8', 'nf4']).parse(raw);
+      return Promise.resolve(parsed);
+    } catch {
+      return Promise.resolve('none' as const);
+    }
+  },
+  recall: (value, store) => {
+    store.dispatch(qwenImageQuantizationChanged(value));
+  },
+  i18nKey: 'modelManager.qwenImageQuantization',
+  LabelComponent: MetadataLabel,
+  ValueComponent: ({ value }: SingleMetadataValueProps<'none' | 'int8' | 'nf4'>) => (
+
+  ),
+};
+//#endregion QwenImageQuantization
+
+//#region QwenImageShift
+const QwenImageShift: SingleMetadataHandler = {
+  [SingleMetadataKey]: true,
+  type: 'QwenImageShift',
+  parse: (metadata, _store) => {
+    try {
+      const raw = getProperty(metadata, 'qwen_image_shift');
+      if (raw === null || raw === undefined) {
+        return Promise.resolve(null);
+      }
+      const parsed = z.number().parse(raw);
+      return Promise.resolve(parsed);
+    } catch {
+      return Promise.resolve(null);
+    }
+  },
+  recall: (value, store) => {
+    store.dispatch(qwenImageShiftChanged(value));
+  },
+  i18nKey: 'modelManager.qwenImageShift',
+  LabelComponent: MetadataLabel,
+  ValueComponent: ({ value }: SingleMetadataValueProps) => (
+
+  ),
+};
+//#endregion QwenImageShift
+
 //#region RefinerModel
 const RefinerModel: SingleMetadataHandler = {
   [SingleMetadataKey]: true,
@@ -1233,6 +1313,9 @@ export const ImageMetadataHandlers = {
   ZImageSeedVarianceEnabled,
   ZImageSeedVarianceStrength,
   ZImageSeedVarianceRandomizePercent,
+  QwenImageComponentSource,
+  QwenImageQuantization,
+  QwenImageShift,
   LoRAs,
   CanvasLayers,
   RefImages,
diff --git a/invokeai/frontend/web/src/features/nodes/types/common.ts b/invokeai/frontend/web/src/features/nodes/types/common.ts
index ca1d42c5a44..10afd6e44bb 100644
--- a/invokeai/frontend/web/src/features/nodes/types/common.ts
+++ b/invokeai/frontend/web/src/features/nodes/types/common.ts
@@ -153,7 +153,7 @@ export const zModelVariantType = z.enum(['normal', 'inpaint', 'depth']);
 export const zFluxVariantType = z.enum(['dev', 'dev_fill', 'schnell']);
 export const zFlux2VariantType = z.enum(['klein_4b', 'klein_9b', 'klein_9b_base']);
 export const zZImageVariantType = z.enum(['turbo', 'zbase']);
-export const zQwenImageVariantType = z.enum(['generate', 'edit']);
+const zQwenImageVariantType = z.enum(['generate', 'edit']);
 export const zQwen3VariantType = z.enum(['qwen3_4b', 'qwen3_8b']);
 export const zAnyModelVariant = z.union([
   zModelVariantType,
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
index e7c04744d4e..1ea20a377e6 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
@@ -15,11 +15,7 @@ import { addQwenImageLoRAs } from 'features/nodes/util/graph/generation/addQwenI
 import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage';
 import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker';
 import { Graph } from 'features/nodes/util/graph/generation/Graph';
-import {
-  getOriginalAndScaledSizesForTextToImage,
-  selectCanvasOutputFields,
-  selectPresetModifiedPrompts,
-} from 'features/nodes/util/graph/graphBuilderUtils';
+import { selectCanvasOutputFields, selectPresetModifiedPrompts } from 'features/nodes/util/graph/graphBuilderUtils';
 import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types';
 import { selectActiveTab } from 'features/ui/store/uiSelectors';
 import type { Invocation } from 'services/api/types';
@@ -103,14 +99,18 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise
-  const validRefImageConfigs = selectRefImagesSlice(state).entities.filter(
-    (entity) =>
-      entity.isEnabled &&
-      isQwenImageReferenceImageConfig(entity.config) &&
-      entity.config.image !== null &&
-      getGlobalReferenceImageWarnings(entity, model).length === 0
-  );
+  // Only collect reference images for edit-variant models.
+  // For txt2img (generate) models, reference images are not used even if they exist in state.
+  const isEditModel = 'variant' in model && model.variant === 'edit';
+  const validRefImageConfigs = isEditModel
+    ? selectRefImagesSlice(state).entities.filter(
+        (entity) =>
+          entity.isEnabled &&
+          isQwenImageReferenceImageConfig(entity.config) &&
+          entity.config.image !== null &&
+          getGlobalReferenceImageWarnings(entity, model).length === 0
+      )
+    : [];

   if (validRefImageConfigs.length > 0) {
     const refImgCollect = g.addNode({
@@ -135,14 +135,12 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise {
     if (!modelSupportsRefImages) {
       return false;
     }
-    if (modelConfig?.base === 'qwen-image' && 'variant' in modelConfig && modelConfig.variant !== 'edit') {
-      return false;
+    if (modelConfig?.base === 'qwen-image') {
+      const variant = 'variant' in modelConfig ? modelConfig.variant : null;
+      if (variant !== 'edit') {
+        return false;
+      }
     }
     return true;
   }, [modelSupportsRefImages, modelConfig]);
diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts
index a23217c3a81..2a8a3d243b7 100644
--- a/invokeai/frontend/web/src/services/api/schema.ts
+++ b/invokeai/frontend/web/src/services/api/schema.ts
@@ -18500,8 +18500,7 @@ export type components = {
      * @constant
      */
     base: "qwen-image";
-    /** @default generate */
-    variant: components["schemas"]["QwenImageVariantType"];
+    variant: components["schemas"]["QwenImageVariantType"] | null;
   };
   /** Main_Diffusers_SD1_Config */
   Main_Diffusers_SD1_Config: {
@@ -19234,8 +19233,7 @@ export type components = {
      * @constant
      */
     format: "gguf_quantized";
-    /** @default generate */
-    variant: components["schemas"]["QwenImageVariantType"];
+    variant: components["schemas"]["QwenImageVariantType"] | null;
   };
   /**
    * Main_GGUF_ZImage_Config
diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts
index cfeb672d95e..b447f9debbe 100644
--- a/invokeai/frontend/web/src/services/api/types.ts
+++ b/invokeai/frontend/web/src/services/api/types.ts
@@ -330,10 +330,6 @@ export const isQwenImageDiffusersMainModelConfig = (config: AnyModelConfig): con
   return config.type === 'main' && config.base === 'qwen-image' && config.format === 'diffusers';
 };

-export const isQwenImageEditMainModelConfig = (config: AnyModelConfig): config is MainModelConfig => {
-  return config.type === 'main' && config.base === 'qwen-image' && 'variant' in config && config.variant === 'edit';
-};
-
 export const isTIModelConfig = (config: AnyModelConfig): config is MainModelConfig => {
   return config.type === 'embedding';
 };
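
Note on the denoise hunks above: the 2x2 patch packing that self._pack_latents performs (the "(B, C, H, W) -> (B, H/2*W/2, C*4)" reshape named in the comment) can be sketched roughly as below. This is an illustrative sketch based on that shape comment, not the exact InvokeAI helper; the function names are assumptions.

# Illustrative sketch of the 2x2 latent packing described by the comment
# "(B, C, H, W) -> (B, H/2*W/2, C*4)". The real implementation is
# self._pack_latents in qwen_image_denoise.py.
import torch

def pack_latents(latents: torch.Tensor) -> torch.Tensor:
    b, c, h, w = latents.shape
    x = latents.view(b, c, h // 2, 2, w // 2, 2)
    x = x.permute(0, 2, 4, 1, 3, 5)  # (B, H/2, W/2, C, 2, 2)
    return x.reshape(b, (h // 2) * (w // 2), c * 4)

def unpack_latents(packed: torch.Tensor, c: int, h: int, w: int) -> torch.Tensor:
    b = packed.shape[0]
    x = packed.view(b, h // 2, w // 2, c, 2, 2)
    x = x.permute(0, 3, 1, 4, 2, 5)  # (B, C, H/2, 2, W/2, 2)
    return x.reshape(b, c, h, w)

For edit models the packed reference latents are then concatenated after the noisy patches along dim=1, which doubles the sequence length and is why img_shapes lists two identical (1, latent_height // 2, latent_width // 2) entries; txt2img models pass only the noisy patches and a single entry.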
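
On the text-encoder change: _encode selects each sample's valid tokens and drops the tokenized system-prompt prefix, using 64 tokens for the edit template and 34 for the generate template (the constants introduced above). A condensed sketch of that trim-and-pad step, with illustrative tensor names rather than the invocation's exact variables:

# Condensed sketch of the prefix trimming in qwen_image_text_encoder.py.
import torch
import torch.nn.functional as F

def trim_system_prefix(hidden_states: torch.Tensor, attention_mask: torch.Tensor, drop_idx: int):
    # hidden_states: (B, T, D) last hidden layer; attention_mask: (B, T) of 0/1
    bool_mask = attention_mask.bool()
    valid_lengths = bool_mask.sum(dim=1)
    selected = hidden_states[bool_mask]  # (sum(valid_lengths), D)
    split_hidden = torch.split(selected, valid_lengths.tolist(), dim=0)
    trimmed = [h[drop_idx:] for h in split_hidden]  # drop the system-prompt tokens
    max_seq_len = max(h.size(0) for h in trimmed)
    padded = torch.stack([F.pad(h, (0, 0, 0, max_seq_len - h.size(0))) for h in trimmed])
    mask = torch.stack(
        [F.pad(torch.ones(h.size(0), dtype=torch.long), (0, max_seq_len - h.size(0))) for h in trimmed]
    )
    return padded, mask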
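
On the LoRA heuristics in configs/lora.py: the change only broadens the set of key prefixes that mark a state dict as Flux-style, so diffusers-format Flux LoRAs (which use a "transformer." prefix and the "single_transformer_blocks" block name) no longer pass the Qwen Image match. Assuming the prefix helper behaves roughly like the sketch below (an assumption; the real helper lives in the model_manager config utilities):

# Assumed behavior of the prefix check used by the LoRA heuristics.
def state_dict_has_any_keys_starting_with(state_dict: dict, prefixes: set[str]) -> bool:
    return any(key.startswith(tuple(prefixes)) for key in state_dict)

# A diffusers-format Flux LoRA key is now excluded from the Qwen Image match,
# while a key that only uses "transformer.transformer_blocks." still passes.
flux_like = {"transformer.single_transformer_blocks.0.attn.to_q.lora_A.weight": 0}
qwen_like = {"transformer.transformer_blocks.0.attn.to_q.lora_A.weight": 0}
assert state_dict_has_any_keys_starting_with(flux_like, {"transformer.single_transformer_blocks."})
assert not state_dict_has_any_keys_starting_with(qwen_like, {"transformer.single_transformer_blocks."})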
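
Finally, the loader change gates zero_cond_t on both the model variant and the installed diffusers version. A minimal sketch of that gate, assuming a diffusers release that exports QwenImageTransformer2DModel at the top level (zero_cond_t itself only exists in 0.37+); the wrapper function here is hypothetical:

# Sketch of the variant- and version-gated flag from the loader change.
import inspect

from diffusers import QwenImageTransformer2DModel

def maybe_enable_zero_cond_t(model_config: dict, is_edit: bool) -> dict:
    accepts_flag = "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters
    if is_edit and accepts_flag:
        model_config["zero_cond_t"] = True
    return model_config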