
Commit e897fa0

lstein and claude committed
feat: complete Qwen Image Edit pipeline with LoRA, GGUF, quantization, and UI support
Major additions:

- LoRA support: loader invocation, config detection, conversion utils, prefix constants, and LayerPatcher integration in denoise with sidecar patching for GGUF models
- Lightning LoRA: starter models (4-step and 8-step bf16), shift override parameter for the distilled sigma schedule
- GGUF fixes: correct base class (ModelLoader), zero_cond_t=True, correct in_channels (no /4 division)
- Denoise: use FlowMatchEulerDiscreteScheduler directly, proper CFG gating (skip negative when cfg<=1), reference latent pixel-space resize
- I2L: resize reference image to generation dimensions before VAE encoding
- Graph builder: wire LoRAs via collection loader, VAE-encode reference image as latents for spatial conditioning, pass shift/quantization params
- Frontend: shift override (checkbox+slider), LoRA graph wiring, scheduler hidden for Qwen Image Edit, model switching cleanup
- Starter model bundle for Qwen Image Edit
- LoRA config registered in discriminated union (factory.py)
- Downgrade transformers requirement back to >=4.56.0

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 94958c9 commit e897fa0

19 files changed

Lines changed: 803 additions & 68 deletions

File tree

invokeai/app/invocations/qwen_image_edit_denoise.py

Lines changed: 122 additions & 47 deletions
```diff
@@ -1,4 +1,5 @@
-from typing import Callable, Optional
+from contextlib import ExitStack
+from typing import Callable, Iterator, Optional, Tuple
 
 import torch
 import torchvision.transforms as tv_transforms
@@ -22,7 +23,12 @@
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.sampling_utils import clip_timestep_schedule_fractional
-from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat
+from invokeai.backend.patches.layer_patcher import LayerPatcher
+from invokeai.backend.patches.lora_conversions.qwen_image_edit_lora_constants import (
+    QWEN_IMAGE_EDIT_LORA_TRANSFORMER_PREFIX,
+)
+from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
 from invokeai.backend.rectified_flow.rectified_flow_inpaint_extension import RectifiedFlowInpaintExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import QwenImageEditConditioningInfo
@@ -70,6 +76,12 @@ class QwenImageEditDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
     height: int = InputField(default=1024, multiple_of=16, description="Height of the generated image.")
     steps: int = InputField(default=40, gt=0, description=FieldDescriptions.steps)
     seed: int = InputField(default=0, description="Randomness seed for reproducibility.")
+    shift: Optional[float] = InputField(
+        default=None,
+        description="Override the sigma schedule shift. "
+        "When set, uses a fixed shift (e.g. 3.0 for Lightning LoRAs) instead of the default dynamic shifting. "
+        "Leave unset for the base model's default schedule.",
+    )
 
     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> LatentsOutput:
@@ -143,39 +155,47 @@ def _prepare_cfg_scale(self, num_timesteps: int) -> list[float]:
             raise ValueError(f"Invalid CFG scale type: {type(self.cfg_scale)}")
         return cfg_scale
 
-    def _compute_sigmas(self, image_seq_len: int, num_steps: int) -> list[float]:
+    def _compute_sigmas(self, image_seq_len: int, num_steps: int, shift_override: float | None = None) -> list[float]:
         """Compute sigmas matching the diffusers FlowMatchEulerDiscreteScheduler.
 
-        Reproduces the full pipeline: linspace → exponential time_shift → stretch_shift_to_terminal → append 0.
+        When shift_override is None, reproduces the full base-model pipeline:
+        linspace → dynamic exponential time_shift → stretch_shift_to_terminal → append 0.
+
+        When shift_override is set (e.g. 3.0 for Lightning LoRAs), uses a fixed mu = log(shift)
+        with no shift_terminal stretching.
         """
         import math
 
         import numpy as np
 
-        # Scheduler config values (from scheduler_config.json)
-        base_shift = 0.5
-        max_shift = 0.9
-        base_image_seq_len = 256
-        max_image_seq_len = 8192
-        shift_terminal = 0.02
-
         # 1. Initial sigmas: N values from 1.0 to 1/N (same as diffusers pipeline)
         sigmas = np.linspace(1.0, 1.0 / num_steps, num_steps).astype(np.float64)
 
-        # 2. Calculate mu (linear interpolation, matching diffusers calculate_shift)
-        m = (max_shift - base_shift) / (max_image_seq_len - base_image_seq_len)
-        b = base_shift - m * base_image_seq_len
-        mu = image_seq_len * m + b
+        if shift_override is not None:
+            # Fixed shift (e.g. Lightning LoRA): mu = log(shift), no terminal stretching
+            mu = math.log(shift_override)
+        else:
+            # Dynamic shift from scheduler config
+            base_shift = 0.5
+            max_shift = 0.9
+            base_image_seq_len = 256
+            max_image_seq_len = 8192
+
+            m = (max_shift - base_shift) / (max_image_seq_len - base_image_seq_len)
+            b = base_shift - m * base_image_seq_len
+            mu = image_seq_len * m + b
 
-        # 3. Exponential time shift
+        # 2. Exponential time shift
        sigmas = np.array([math.exp(mu) / (math.exp(mu) + (1.0 / s - 1.0)) for s in sigmas])
 
-        # 4. Stretch shift to terminal
-        one_minus = 1.0 - sigmas
-        scale_factor = one_minus[-1] / (1.0 - shift_terminal)
-        sigmas = 1.0 - (one_minus / scale_factor)
+        # 3. Stretch shift to terminal (only for base model schedule)
+        if shift_override is None:
+            shift_terminal = 0.02
+            one_minus = 1.0 - sigmas
+            scale_factor = one_minus[-1] / (1.0 - shift_terminal)
+            sigmas = 1.0 - (one_minus / scale_factor)
 
-        # 5. Append terminal 0
+        # 4. Append terminal 0
         sigmas = np.append(sigmas, 0.0)
 
         return sigmas.tolist()
```
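To make the schedule math above concrete, here is a small standalone sketch (illustrative, not part of the commit) evaluating both branches with the constants from the code. With the fixed Lightning shift of 3.0 the 4-step schedule lands on round numbers, and for a 1024×1024 generation (image_seq_len = 4096) the dynamic mu works out to roughly log 2, i.e. an effective shift of about 2:

```python
# Standalone check of _compute_sigmas' two branches (illustrative sketch).
import math

import numpy as np

def exponential_time_shift(num_steps: int, mu: float) -> list[float]:
    # linspace from 1.0 to 1/N, then sigma' = e^mu / (e^mu + 1/sigma - 1)
    sigmas = np.linspace(1.0, 1.0 / num_steps, num_steps)
    return [math.exp(mu) / (math.exp(mu) + (1.0 / s - 1.0)) for s in sigmas]

# Fixed shift 3.0 (Lightning), 4 steps: steps concentrate at high noise.
print(exponential_time_shift(4, math.log(3.0)))  # [1.0, 0.9, 0.75, 0.5]

# Dynamic shift for 1024x1024: image_seq_len = (128 * 128) / 2**2 = 4096.
m = (0.9 - 0.5) / (8192 - 256)
mu = 4096 * m + (0.5 - m * 256)
print(round(mu, 4), round(math.exp(mu), 3))  # ~0.6935 -> effective shift ~2.0
```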
```diff
@@ -219,7 +239,10 @@ def _run_diffusion(self, context: InvocationContext):
 
         neg_prompt_embeds = None
         neg_prompt_mask = None
-        do_classifier_free_guidance = self.negative_conditioning is not None
+        # Match the diffusers pipeline: only enable CFG when cfg_scale > 1 AND negative conditioning is provided.
+        # With cfg_scale <= 1, the negative prediction is unused, so skip it entirely.
+        cfg_scale_value = self.cfg_scale if isinstance(self.cfg_scale, float) else self.cfg_scale[0]
+        do_classifier_free_guidance = self.negative_conditioning is not None and cfg_scale_value > 1.0
         if do_classifier_free_guidance:
             neg_prompt_embeds, neg_prompt_mask = self._load_text_conditioning(
                 context=context,
```
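The gating matters because of how the CFG combine behaves at cfg_scale = 1.0. Assuming the usual flow-matching formulation neg + cfg · (cond − neg) (the combine itself is outside this hunk), the negative prediction cancels exactly, so computing it is wasted model-forward time. A quick illustration:

```python
# Illustrative: at cfg == 1.0 the CFG combine collapses to the conditional
# prediction, so the negative forward pass would double cost for no effect.
import torch

noise_pred_cond = torch.randn(1, 4)
noise_pred_neg = torch.randn(1, 4)
combined = noise_pred_neg + 1.0 * (noise_pred_cond - noise_pred_neg)
assert torch.allclose(combined, noise_pred_cond, atol=1e-6)
```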
```diff
@@ -238,11 +261,40 @@ def _run_diffusion(self, context: InvocationContext):
         latent_height = self.height // LATENT_SCALE_FACTOR
         latent_width = self.width // LATENT_SCALE_FACTOR
         image_seq_len = (latent_height * latent_width) // (patch_size**2)
-        # Compute the shifted sigma schedule (N+1 values, last is 0.0).
-        # The sigmas serve both as the Euler step sizes AND the timestep conditioning to the model.
-        sigmas = self._compute_sigmas(image_seq_len, self.steps)
-        sigmas = clip_timestep_schedule_fractional(sigmas, self.denoising_start, self.denoising_end)
-        total_steps = len(sigmas) - 1
+
+        # Use the actual FlowMatchEulerDiscreteScheduler to compute sigmas/timesteps,
+        # exactly matching the diffusers pipeline.
+        from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
+
+        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            str(context.models.get_absolute_path(context.models.get_config(self.transformer.transformer)) / "scheduler"),
+            local_files_only=True,
+        )
+
+        import math
+        import numpy as np
+
+        if self.shift is not None:
+            # Lightning LoRA: fixed shift
+            mu = math.log(self.shift)
+        else:
+            # Default dynamic shifting from scheduler config
+            from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit import calculate_shift
+
+            mu = calculate_shift(
+                image_seq_len,
+                scheduler.config.get("base_image_seq_len", 256),
+                scheduler.config.get("max_image_seq_len", 4096),
+                scheduler.config.get("base_shift", 0.5),
+                scheduler.config.get("max_shift", 1.15),
+            )
+
+        init_sigmas = np.linspace(1.0, 1.0 / self.steps, self.steps).tolist()
+        scheduler.set_timesteps(sigmas=init_sigmas, mu=mu, device=device)
+
+        timesteps_sched = scheduler.timesteps
+        sigmas_sched = scheduler.sigmas
+        total_steps = len(timesteps_sched)
 
         cfg_scale = self._prepare_cfg_scale(total_steps)
 
```
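The same setup runs standalone against a stock diffusers scheduler, which can help when debugging schedules outside InvokeAI. A sketch assuming a recent diffusers release whose FlowMatchEulerDiscreteScheduler.set_timesteps accepts sigmas and mu (the node above loads the real config from the model's scheduler folder instead of constructing one):

```python
# Minimal standalone schedule setup (sketch; assumes mu-based dynamic shifting
# is available on FlowMatchEulerDiscreteScheduler, as used in the node above).
import math

import numpy as np
from diffusers import FlowMatchEulerDiscreteScheduler

steps = 8
scheduler = FlowMatchEulerDiscreteScheduler(use_dynamic_shifting=True)
init_sigmas = np.linspace(1.0, 1.0 / steps, steps).tolist()
scheduler.set_timesteps(sigmas=init_sigmas, mu=math.log(3.0))  # fixed Lightning-style shift

print(scheduler.timesteps)  # N conditioning timesteps (sigma * 1000)
print(scheduler.sigmas)     # N+1 sigmas, terminal 0.0 appended
```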
```diff
@@ -276,14 +328,14 @@ def _run_diffusion(self, context: InvocationContext):
 
         # Prepare input latent image
         if init_latents is not None:
-            s_0 = sigmas[0]
+            s_0 = sigmas_sched[0].item()
             latents = s_0 * noise + (1.0 - s_0) * init_latents
         else:
             if self.denoising_start > 1e-5:
                 raise ValueError("denoising_start should be 0 when initial latents are not provided.")
             latents = noise
 
-        if len(sigmas) <= 1:
+        if total_steps <= 0:
             return latents
 
         # Pack latents into 2x2 patches: (B, C, H, W) -> (B, H/2*W/2, C*4)
```
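The s_0 blend is the rectified-flow forward interpolation: a latent at noise level sigma sits on the straight line x_sigma = sigma · noise + (1 − sigma) · x0, so seeding with sigmas_sched[0] resumes denoising partway along that line. Sketch:

```python
# Illustrative: s_0 = 1.0 starts from pure noise (text-to-image); s_0 < 1.0
# keeps a (1 - s_0) share of the init latents (image-to-image).
import torch

init_latents = torch.randn(1, 16, 128, 128)
noise = torch.randn_like(init_latents)

start = 1.0 * noise + 0.0 * init_latents  # s_0 = 1.0: pure noise
assert torch.equal(start, noise)

start = 0.6 * noise + 0.4 * init_latents  # s_0 = 0.6: 40% of the way to data
```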
```diff
@@ -292,7 +344,7 @@ def _run_diffusion(self, context: InvocationContext):
         # Pack reference image latents and concatenate along the sequence dimension.
         # The edit transformer always expects [noisy_patches ; ref_patches] in its sequence.
         if ref_latents is not None:
-            _, _, rh, rw = ref_latents.shape
+            _, ref_ch, rh, rw = ref_latents.shape
             if rh != latent_height or rw != latent_width:
                 ref_latents = torch.nn.functional.interpolate(
                     ref_latents, size=(latent_height, latent_width), mode="bilinear"
@@ -329,23 +381,38 @@ def _run_diffusion(self, context: InvocationContext):
                 step=0,
                 order=1,
                 total_steps=total_steps,
-                timestep=int(sigmas[0] * 1000),
+                timestep=int(timesteps_sched[0].item()) if len(timesteps_sched) > 0 else 0,
                 latents=self._unpack_latents(latents, latent_height, latent_width),
             ),
         )
 
         noisy_seq_len = latents.shape[1]
 
-        with transformer_info.model_on_device() as (_, transformer):
+        # Determine if the model is quantized — GGUF models need sidecar patching for LoRAs
+        transformer_config = context.models.get_config(self.transformer.transformer)
+        model_is_quantized = transformer_config.format in (ModelFormat.GGUFQuantized,)
+
+        with ExitStack() as exit_stack:
+            (cached_weights, transformer) = exit_stack.enter_context(transformer_info.model_on_device())
             assert isinstance(transformer, QwenImageTransformer2DModel)
 
-            for step_idx in tqdm(range(total_steps)):
-                sigma_curr = sigmas[step_idx]
-                sigma_next = sigmas[step_idx + 1]
+            # Apply LoRA patches to the transformer
+            exit_stack.enter_context(
+                LayerPatcher.apply_smart_model_patches(
+                    model=transformer,
+                    patches=self._lora_iterator(context),
+                    prefix=QWEN_IMAGE_EDIT_LORA_TRANSFORMER_PREFIX,
+                    dtype=inference_dtype,
+                    cached_weights=cached_weights,
+                    force_sidecar_patching=model_is_quantized,
+                )
+            )
+
+            scheduler.set_begin_index(0)
 
-                # The model receives the shifted sigma as its time conditioning.
-                # Diffusers stores timesteps = sigma * 1000 and passes timestep / 1000.
-                timestep = torch.tensor([sigma_curr], device=device).expand(1).to(inference_dtype)
+            for step_idx, t in enumerate(tqdm(timesteps_sched)):
+                # The pipeline passes timestep / 1000 to the transformer
+                timestep = t.expand(latents.shape[0]).to(inference_dtype)
 
                 # Concatenate noisy and reference patches along the sequence dim
                 model_input = torch.cat([latents, ref_latents_packed], dim=1)
```
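The force_sidecar_patching flag is why GGUF models can take LoRAs at all: quantized weights cannot be rewritten in place, so instead of merging the low-rank delta into the weight matrix, the patcher keeps it alongside and applies it at runtime. A conceptual contrast (names here are illustrative, not the actual LayerPatcher internals):

```python
# Conceptual sketch of merged vs sidecar LoRA application (illustrative names).
import torch

def merged_patch(linear: torch.nn.Linear, A: torch.Tensor, B: torch.Tensor, scale: float) -> None:
    # Merged: rewrite the weight in place, W' = W + scale * (B @ A).
    # Needs mutable full-precision weights -- impossible on GGUF tensors.
    linear.weight.data += scale * (B @ A)

def sidecar_forward(x: torch.Tensor, base_forward, A: torch.Tensor, B: torch.Tensor, scale: float) -> torch.Tensor:
    # Sidecar: leave the (quantized) weights untouched and add the low-rank
    # delta to the activation instead: y = f(x) + scale * (x @ A.T) @ B.T.
    return base_forward(x) + scale * (x @ A.T) @ B.T
```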
```diff
@@ -354,7 +421,7 @@ def _run_diffusion(self, context: InvocationContext):
                     hidden_states=model_input,
                     encoder_hidden_states=pos_prompt_embeds,
                     encoder_hidden_states_mask=pos_prompt_mask,
-                    timestep=timestep,
+                    timestep=timestep / 1000,
                     img_shapes=img_shapes,
                     return_dict=False,
                 )[0]
```
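The / 1000 is the unit conversion the removed comments in the old loop described: the scheduler stores timesteps = sigmas × num_train_timesteps (1000 here), while the transformer is conditioned on the sigma itself:

```python
# The scheduler's timestep for sigma = 0.9 is 900; dividing by the training
# horizon (1000) recovers the sigma the transformer expects.
import math

sigma = 0.9
timestep = sigma * 1000  # what scheduler.timesteps holds
assert math.isclose(timestep / 1000, sigma)
```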
```diff
@@ -366,7 +433,7 @@ def _run_diffusion(self, context: InvocationContext):
                         hidden_states=model_input,
                         encoder_hidden_states=neg_prompt_embeds,
                         encoder_hidden_states_mask=neg_prompt_mask,
-                        timestep=timestep,
+                        timestep=timestep / 1000,
                         img_shapes=img_shapes,
                         return_dict=False,
                     )[0]
@@ -376,14 +443,11 @@ def _run_diffusion(self, context: InvocationContext):
                 else:
                     noise_pred = noise_pred_cond
 
-                latents_dtype = latents.dtype
-                latents = latents.to(dtype=torch.float32)
-                dt = sigma_next - sigma_curr
-                latents = latents + dt * noise_pred
-                latents = latents.to(dtype=latents_dtype)
+                # Use the scheduler's step method — exactly matching the pipeline
+                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
 
                 if inpaint_extension is not None:
-                    # Unpack to 4D for inpaint merging, then repack
+                    sigma_next = sigmas_sched[step_idx + 1].item()
                     latents_4d = self._unpack_latents(latents, latent_height, latent_width)
                     latents_4d = inpaint_extension.merge_intermediate_latents_with_init_latents(latents_4d, sigma_next)
                     latents = self._pack_latents(latents_4d, 1, out_channels, latent_height, latent_width)
```
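Delegating to scheduler.step() doesn't change the update rule: for this scheduler a step is the same explicit Euler move the deleted lines performed, x ← x + (sigma_next − sigma_curr) · v, with the float32 upcast and step-index bookkeeping handled internally. A sketch against a default-configured scheduler:

```python
# Sketch: scheduler.step() matches the removed manual Euler update
# (assumes a stock FlowMatchEulerDiscreteScheduler with default config).
import torch
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=4)

latents = torch.randn(1, 8)
noise_pred = torch.randn(1, 8)
t = scheduler.timesteps[0]

manual = latents + (scheduler.sigmas[1] - scheduler.sigmas[0]) * noise_pred
stepped = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
assert torch.allclose(stepped, manual, atol=1e-5)
```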
```diff
@@ -393,7 +457,7 @@ def _run_diffusion(self, context: InvocationContext):
                         step=step_idx + 1,
                         order=1,
                         total_steps=total_steps,
-                        timestep=int(sigma_curr * 1000),
+                        timestep=int(t.item()),
                         latents=self._unpack_latents(latents, latent_height, latent_width),
                     ),
                 )
@@ -408,3 +472,14 @@ def step_callback(state: PipelineIntermediateState) -> None:
             context.util.sd_step_callback(state, BaseModelType.QwenImageEdit)
 
         return step_callback
+
+    def _lora_iterator(self, context: InvocationContext) -> Iterator[Tuple[ModelPatchRaw, float]]:
+        """Iterate over LoRA models to apply to the transformer."""
+        for lora in self.transformer.loras:
+            lora_info = context.models.load(lora.lora)
+            if not isinstance(lora_info.model, ModelPatchRaw):
+                raise TypeError(
+                    f"Expected ModelPatchRaw for LoRA '{lora.lora.key}', got {type(lora_info.model).__name__}."
+                )
+            yield (lora_info.model, lora.weight)
+            del lora_info
```

invokeai/app/invocations/qwen_image_edit_image_to_latents.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -1,6 +1,7 @@
 import einops
 import torch
 from diffusers.models.autoencoders.autoencoder_kl_qwenimage import AutoencoderKLQwenImage
+from PIL import Image as PILImage
 
 from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
 from invokeai.app.invocations.fields import (
@@ -32,6 +33,14 @@ class QwenImageEditImageToLatentsInvocation(BaseInvocation, WithMetadata, WithBo
 
     image: ImageField = InputField(description="The image to encode.")
     vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection)
+    width: int | None = InputField(
+        default=None,
+        description="Resize the image to this width before encoding. If not set, encodes at the image's original size.",
+    )
+    height: int | None = InputField(
+        default=None,
+        description="Resize the image to this height before encoding. If not set, encodes at the image's original size.",
+    )
 
     @staticmethod
     def vae_encode(vae_info: LoadedModel, image_tensor: torch.Tensor) -> torch.Tensor:
@@ -69,6 +78,11 @@ def vae_encode(vae_info: LoadedModel, image_tensor: torch.Tensor) -> torch.Tenso
     def invoke(self, context: InvocationContext) -> LatentsOutput:
         image = context.images.get_pil(self.image.image_name)
 
+        # If target dimensions are specified, resize the image BEFORE encoding
+        # (matching the diffusers pipeline which resizes in pixel space, not latent space).
+        if self.width is not None and self.height is not None:
+            image = image.convert("RGB").resize((self.width, self.height), resample=PILImage.LANCZOS)
+
         image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
         if image_tensor.dim() == 3:
             image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
```
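Resizing in pixel space before encoding is the key difference from the denoise node's fallback, which bilinearly interpolates already-encoded latents: the VAE then emits latents natively on the generation grid instead of ones smeared by latent-space interpolation. Shape bookkeeping for an 8× spatial VAE (sketch):

```python
# Illustrative shape check: a pixel-space resize lands the encode exactly on
# the generation's latent grid, so no latent interpolation is needed later.
from PIL import Image

gen_w, gen_h = 1024, 1024
ref = Image.new("RGB", (1536, 640))              # reference image of any size
ref = ref.resize((gen_w, gen_h), Image.LANCZOS)  # resize BEFORE the VAE
latent_w, latent_h = gen_w // 8, gen_h // 8      # 8x spatial compression
assert (latent_w, latent_h) == (128, 128)
```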
