From 33bac0c20a18d6b7436dd7e4ae5763c165200212 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Tue, 26 May 2026 12:28:18 -0500
Subject: [PATCH 01/13] Adds NVIDIA PixelDiT and PiD support

---
 .../ComfyUIBackend/WorkflowGenerator.cs       |  2 +-
 .../WorkflowGeneratorModelSupport.cs          | 20 +++++++-
 .../ComfyUIBackend/WorkflowGeneratorSteps.cs  | 46 +++++++++++++++++--
 src/Text2Image/T2IModelClassSorter.cs         | 13 ++++++
 4 files changed, 74 insertions(+), 7 deletions(-)
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
index cadb9a665..d468224f7 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
@@ -959,7 +959,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
             }
         }
         // TODO: Registry of model default preferences instead of this
-        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1())
+        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsPixelDiT() || IsPiD())
         {
             defscheduler ??= "simple";
         }
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
index 48d60e7fa..37533e966 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
@@ -85,6 +85,12 @@ public bool IsKontext()
     /// <summary>Returns true if the current model is Chroma Radiance.</summary>
     public bool IsChromaRadiance() => IsModelCompatClass(T2IModelClassSorter.CompatChromaRadiance);
 
+    /// <summary>Returns true if the current model is NVIDIA PixelDiT.</summary>
+    public bool IsPixelDiT() => IsModelCompatClass(T2IModelClassSorter.CompatPixelDiT);
+
+    /// <summary>Returns true if the current model is NVIDIA PiD.</summary>
+    public bool IsPiD() => IsModelCompatClass(T2IModelClassSorter.CompatPiD);
+
     /// <summary>Returns true if the current model is HiDream-i1.</summary>
     public bool IsHiDream() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamI1);
 
@@ -398,7 +404,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n
                 ["width"] = width
             }, id), frames);
         }
-        else if (IsChromaRadiance() || IsZetaChroma())
+        else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT())
         {
             return resultImage(CreateNode("EmptyChromaRadianceLatentImage", new JObject()
             {
@@ -649,6 +655,11 @@ public string GetGemma2Model()
             return RequireClipModel("gemma_2_2b_fp16.safetensors", "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/resolve/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors", "29761442862f8d064d3f854bb6fabf4379dcff511a7f6ba9405a00bd0f7e2dbd", T2IParamTypes.GemmaModel);
         }
 
+        public string GetGemma2_2bElmModel()
+        {
+            return RequireClipModel("gemma_2_2b_it_elm_fp8_scaled.safetensors", "https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/text_encoders/gemma_2_2b_it_elm_fp8_scaled.safetensors", "87692b2ab1714028e29910ea645d96db656505ca0805051048d2298b225c02d1", T2IParamTypes.GemmaModel);
+        }
+
         public string GetGemma3_12bModel()
         {
             return RequireClipModel("gemma_3_12B_it.safetensors", "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors", "aaca463d11e6d8d2a4bdb0d6299214c15ef78a3f73e0ef8113d5a9d0219b3f6d", T2IParamTypes.GemmaModel);
@@ -899,7 +910,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
                     {
                         dtype = "default";
                     }
-                    else if (IsZImage() || IsZetaChroma() || IsAnima()) // Model is small and dense, so trust user preferred download format
+                    else if (IsZImage() || IsZetaChroma() || IsAnima() || IsPixelDiT() || IsPiD()) // Model is small and dense, so trust user preferred download format
                     {
                         dtype = "default";
                     }
@@ -1108,6 +1119,11 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
                 helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE, "flux-1", "flux-ae");
             }
         }
+        else if (IsPixelDiT() || IsPiD())
+        {
+            helpers.LoadClip("pixeldit", helpers.GetGemma2_2bElmModel());
+            LoadingVAE = CreateVAELoader("pixel_space");
+        }
         else if (IsHiDream())
         {
             string loaderType = "QuadrupleCLIPLoader";
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
index 067bd1ee8..8fa3ebad8 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
@@ -1451,6 +1451,48 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                 g.NoVAEOverride = false;
                 prompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isRefiner: true);
                 negPrompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isRefiner: true);
+                string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null);
+                string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null);
+                int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
+                double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
+                if (g.IsPiD())
+                {
+                    string baseCompatId = baseModel.ModelClass?.CompatClass?.ID ?? "";
+                    string pidLatentFormat =
+                        baseCompatId.StartsWith("flux-2") ? "flux2"
+                        : baseCompatId == "flux-1" ? "flux1"
+                        : baseCompatId.StartsWith("stable-diffusion-v3") ? "sd3"
+                        : (baseCompatId == "z-image" || baseCompatId == "zeta-chroma") ? "flux1"
+                        : null;
+                    if (pidLatentFormat is null)
+                    {
+                        throw new SwarmUserErrorException($"PiD requires a Flux.1, Flux.2, SD3, or Z-Image base model, but the base model class is '{baseCompatId}'.");
+                    }
+                    string pidCond = g.CreateNode("PiDConditioning", new JObject()
+                    {
+                        ["positive"] = prompt,
+                        ["latent"] = g.CurrentMedia.Path,
+                        ["latent_format"] = pidLatentFormat,
+                        ["degrade_sigma"] = 0.0
+                    });
+                    prompt = [pidCond, 0];
+                    int pidWidth = g.UserInput.GetImageWidth() * 4 / 16 * 16;
+                    int pidHeight = g.UserInput.GetImageHeight() * 4 / 16 * 16;
+                    string pidLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject()
+                    {
+                        ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1),
+                        ["width"] = pidWidth,
+                        ["height"] = pidHeight
+                    });
+                    g.CreateKSampler(g.CurrentModel.Path, prompt, negPrompt, [pidLatent, 0], cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000,
+                        g.UserInput.Get(T2IParamTypes.Seed) + 1, false, true, id: "23",
+                        explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner);
+                    g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0], WGNodeData.DT_LATENT_IMAGE, refineModel.ModelClass?.CompatClass);
+                    g.CurrentMedia.Width = pidWidth;
+                    g.CurrentMedia.Height = pidHeight;
+                    g.IsRefinerStage = false;
+                    return;
+                }
                 bool doSave = g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false);
                 bool doUspcale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1;
                 string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None");
@@ -1589,10 +1631,6 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                     model = model.WithPath([hyperTileNode, 0]);
                 }
                 g.CurrentMedia = g.CurrentMedia.AsSamplingLatent(g.CurrentVae, g.CurrentAudioVae);
-                int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
-                double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
-                string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null);
-                string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null);
                 g.CreateKSampler(model.Path, prompt, negPrompt, g.CurrentMedia.Path, cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000,
                     g.UserInput.Get(T2IParamTypes.Seed) + 1, false, method != "StepSwapNoisy", id: "23", doTiled: g.UserInput.Get(T2IParamTypes.RefinerDoTiling, false),
                     explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner);
diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
index 2df28ecad..9f37a3bb1 100644
--- a/src/Text2Image/T2IModelClassSorter.cs
+++ b/src/Text2Image/T2IModelClassSorter.cs
@@ -71,6 +71,8 @@ public static T2IModelCompatClass
         CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }),
         CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }),
         CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }),
+        CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }),
+        CompatPiD = RegisterCompat(new() { ID = "pid", ShortCode = "PiD", LorasTargetTextEnc = false }),
         // Audio models
         CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }),
         // Obscure old random ones
@@ -204,6 +206,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") &&
         bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj");
         bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias");
         bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias");
+        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight");
+        bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight");
         bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight");
         bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias")))
             || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias")));
@@ -699,6 +703,15 @@ JToken GetEmbeddingKey(JObject h)
         {
             return isChroma(h) && isChromaRadiance(h);
         }});
+        // ====================== NVIDIA PixelDiT / PiD ======================
+        Register(new() { ID = "pixeldit", CompatClass = CompatPixelDiT, Name = "NVIDIA PixelDiT", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) =>
+        {
+            return isPixelDiT(h);
+        }});
+        Register(new() { ID = "pid", CompatClass = CompatPiD, Name = "NVIDIA PiD", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) =>
+        {
+            return isPiD(h);
+        }});
         Register(new() { ID = "alt_diffusion_v1_512_placeholder", CompatClass = CompatAltDiffusion, Name = "Alt-Diffusion", StandardWidth = 512, StandardHeight = 512, IsThisModelOfClass = (m, h) =>
         {
             return IsAlt(h);

From 3d3a933bb75d9004fb5a3d2458525c01c7250b8c Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 18:07:08 -0500
Subject: [PATCH 02/13] PiD replaces Refiner upscaler, not refiner stage itself

---
 .../ComfyUIBackend/ComfyUIBackendExtension.cs |   5 +-
 .../ComfyUIBackend/WorkflowGenerator.cs       |  11 +-
 .../ComfyUIBackend/WorkflowGeneratorSteps.cs  | 114 ++++++++++--------
 src/Text2Image/T2IModelClassSorter.cs         |   2 +-
 src/Text2Image/T2IParamInput.cs               |   2 +-
 src/Text2Image/T2IPromptHandling.cs           |   6 +
 src/Utils/PromptRegion.cs                     |  10 +-
 7 files changed, 97 insertions(+), 53 deletions(-)

diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
index 92a239823..0f5fc092e 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
@@ -635,6 +635,9 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo)
         ],
         Schedulers = ["normal///Normal", "karras///Karras", "exponential///Exponential", "simple///Simple", "ddim_uniform///DDIM Uniform", "sgm_uniform///SGM Uniform", "turbo///Turbo (for turbo models, max 10 steps)", "align_your_steps///Align Your Steps (Model-specific behavior)", "beta///Beta", "linear_quadratic///Linear Quadratic (Mochi)", "ltxv///LTX-Video", "ltxv-image///LTXV-Image", "kl_optimal///KL Optimal (Nvidia AYS)", "flux2///Flux.2"];
 
+    /// <summary>Lists the models returned by ListModelsFor for the given session whose compat class ID is "pid", ordered by name, each formatted as a "pidmodel-NAME///PiD Model: NAME" entry (appended to the Refiner Upscale Method value list).</summary>
+    public static List<string> PidUpscaleModels(Session session) => [.. Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => $"pidmodel-{m.Name}///PiD Model: {m.Name}")];
+
     public static List<string> IPAdapterModels = ["None"], IPAdapterWeightTypes = ["standard", "prompt is more important", "style transfer"];
 
     public static List<string> GligenModels = ["None"], YoloModels = [], StyleModels = ["None"], SetClipDevices = ["cpu"];
@@ -752,7 +755,7 @@ public override void OnInit()
             ));
         RefinerUpscaleMethod = T2IParamTypes.Register<string>(new("Refiner Upscale Method", "How to upscale the image, if upscaling is used.",
             "pixel-lanczos", Group: T2IParamTypes.GroupRefiners, OrderPriority: -1, FeatureFlag: "comfyui", ChangeWeight: 1,
-            GetValues: (_) => UpscalerModels, DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
+            GetValues: (session) => [.. UpscalerModels, .. PidUpscaleModels(session)], DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
             ));
         RefinerSamplerParam = T2IParamTypes.Register<string>(new("Refiner Sampler", SamplerParam.Type.Description + "\nThis is an override to only affect the Refine/Upscale stage.",
             "euler", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupRefinerOverrides, OrderPriority: -2,
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
index f70fcf5e7..66750efe1 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
@@ -166,6 +166,9 @@ public JArray FinalImageOut
     /// <summary>If true, the generator is currently working on the refiner stage.</summary>
     public bool IsRefinerStage = false;
 
+    /// <summary>If true, the generator is currently loading the pixel-decoder (PiD) model. It is set only around the PiD CreateModelLoader call (and cleared again before PiD conditioning/sampling), so that LoRA loading uses the T2IParamInput.SectionID_PixelDecoder confinement.</summary>
+    public bool IsPixelDecoderStage = false;
+
     /// <summary>If true, the generator is currently working on Image2Video.</summary>
     public bool IsImageToVideo = false;
 
@@ -2518,7 +2521,7 @@ public bool ShouldZeroNegative()
     }
 
     /// <summary>Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant.</summary>
-    public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false)
+    public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false)
     {
         PromptRegion regionalizer = new(prompt);
         string globalPromptText = regionalizer.GlobalPrompt;
@@ -2534,7 +2537,11 @@ public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, boo
         {
             globalPromptText = $"{globalPromptText} {regionalizer.RefinerPrompt}";
         }
-        else if (!isVideo && !isRefiner && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt))
+        else if (isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.PixelDecoderPrompt))
+        {
+            globalPromptText = $"{globalPromptText} {regionalizer.PixelDecoderPrompt}";
+        }
+        else if (!isVideo && !isRefiner && !isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt))
         {
             globalPromptText = $"{globalPromptText} {regionalizer.BasePrompt}";
         }
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
index 8fa3ebad8..d75ccfad9 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
@@ -106,7 +106,11 @@ public static void Register()
             (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(0, g.LoadingModel, g.LoadingClip);
             if (g.IsRefinerStage)
             {
-                (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(1, g.LoadingModel, g.LoadingClip);
+                (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(T2IParamInput.SectionID_Refiner, g.LoadingModel, g.LoadingClip);
+            }
+            else if (g.IsPixelDecoderStage)
+            {
+                (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(T2IParamInput.SectionID_PixelDecoder, g.LoadingModel, g.LoadingClip);
             }
             else if (g.IsImageToVideoSwap)
             {
@@ -1451,53 +1455,11 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                 g.NoVAEOverride = false;
                 prompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isRefiner: true);
                 negPrompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isRefiner: true);
-                string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null);
-                string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null);
-                int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
-                double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
-                if (g.IsPiD())
-                {
-                    string baseCompatId = baseModel.ModelClass?.CompatClass?.ID ?? "";
-                    string pidLatentFormat =
-                        baseCompatId.StartsWith("flux-2") ? "flux2"
-                        : baseCompatId == "flux-1" ? "flux1"
-                        : baseCompatId.StartsWith("stable-diffusion-v3") ? "sd3"
-                        : (baseCompatId == "z-image" || baseCompatId == "zeta-chroma") ? "flux1"
-                        : null;
-                    if (pidLatentFormat is null)
-                    {
-                        throw new SwarmUserErrorException($"PiD requires a Flux.1, Flux.2, SD3, or Z-Image base model, but the base model class is '{baseCompatId}'.");
-                    }
-                    string pidCond = g.CreateNode("PiDConditioning", new JObject()
-                    {
-                        ["positive"] = prompt,
-                        ["latent"] = g.CurrentMedia.Path,
-                        ["latent_format"] = pidLatentFormat,
-                        ["degrade_sigma"] = 0.0
-                    });
-                    prompt = [pidCond, 0];
-                    int pidWidth = g.UserInput.GetImageWidth() * 4 / 16 * 16;
-                    int pidHeight = g.UserInput.GetImageHeight() * 4 / 16 * 16;
-                    string pidLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject()
-                    {
-                        ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1),
-                        ["width"] = pidWidth,
-                        ["height"] = pidHeight
-                    });
-                    g.CreateKSampler(g.CurrentModel.Path, prompt, negPrompt, [pidLatent, 0], cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000,
-                        g.UserInput.Get(T2IParamTypes.Seed) + 1, false, true, id: "23",
-                        explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner);
-                    g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0], WGNodeData.DT_LATENT_IMAGE, refineModel.ModelClass?.CompatClass);
-                    g.CurrentMedia.Width = pidWidth;
-                    g.CurrentMedia.Height = pidHeight;
-                    g.IsRefinerStage = false;
-                    return;
-                }
                 bool doSave = g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false);
-                bool doUspcale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1;
+                bool doUpscale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1;
                 string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None");
                 // TODO: Better same-VAE check
-                bool doPixelUpscale = doUspcale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-"));
+                bool doPixelUpscale = doUpscale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-"));
                 int width = (int)Math.Round(g.UserInput.GetImageWidth() * refineUpscale);
                 int height = (int)Math.Round(g.UserInput.GetImageHeight() * refineUpscale);
                 width = (width / 16) * 16; // avoid unworkable output sizes
@@ -1559,7 +1521,7 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                         g.CurrentMedia = decoded.EncodeToLatent(g.CurrentVae, "25");
                     }
                 }
-                if (doUspcale && upscaleMethod.StartsWith("latent-"))
+                if (doUpscale && upscaleMethod.StartsWith("latent-"))
                 {
                     g.CurrentMedia = g.CurrentMedia.AsLatentImage(g.CurrentVae);
                     g.CreateNode("LatentUpscaleBy", new JObject()
@@ -1572,7 +1534,7 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                     g.CurrentMedia.Width = width;
                     g.CurrentMedia.Height = height;
                 }
-                else if (doUspcale && upscaleMethod.StartsWith("latentmodel-"))
+                else if (doUpscale && upscaleMethod.StartsWith("latentmodel-"))
                 {
                     g.CreateNode("LatentUpscaleModelLoader", new JObject()
                     {
@@ -1631,11 +1593,69 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                     model = model.WithPath([hyperTileNode, 0]);
                 }
                 g.CurrentMedia = g.CurrentMedia.AsSamplingLatent(g.CurrentVae, g.CurrentAudioVae);
+                int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
+                double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner);
+                string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null);
+                string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null);
                 g.CreateKSampler(model.Path, prompt, negPrompt, g.CurrentMedia.Path, cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000,
                     g.UserInput.Get(T2IParamTypes.Seed) + 1, false, method != "StepSwapNoisy", id: "23", doTiled: g.UserInput.Get(T2IParamTypes.RefinerDoTiling, false),
                     explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner);
                 g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0]);
                 g.IsRefinerStage = false;
+                if (doUpscale && upscaleMethod.StartsWith("pidmodel-"))
+                {
+                    string pidModelName = upscaleMethod.After("pidmodel-");
+                    T2IModel pidModel = Program.MainSDModels.GetModel(pidModelName);
+                    if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid")
+                    {
+                        throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model.");
+                    }
+                    string pidLatentFormat = g.IsSD3() ? "sd3" : (g.IsFlux() || g.IsAnyFlux2() || g.IsZImage() || g.IsZetaChroma()) ? "flux" : null;
+                    if (pidLatentFormat is null)
+                    {
+                        throw new SwarmUserErrorException($"PiD model requires the refiner model's VAE to be Flux.1, Flux.2, or SD3, but model '{refineModel.Name}' is '{refineModel.ModelClass?.CompatClass?.ID ?? "unknown"}'.");
+                    }
+                    JArray refinedLatent = g.CurrentMedia.Path;
+                    int pidWidth = g.UserInput.GetImageWidth() * 4;
+                    int pidHeight = g.UserInput.GetImageHeight() * 4;
+                    pidWidth = (pidWidth / 16) * 16;
+                    pidHeight = (pidHeight / 16) * 16;
+                    T2IModel refinerFinalModel = g.FinalLoadedModel;
+                    List<T2IModel> refinerFinalModelList = g.FinalLoadedModelList;
+                    g.FinalLoadedModel = pidModel;
+                    g.FinalLoadedModelList = [pidModel];
+                    g.NoVAEOverride = true;
+                    g.IsPixelDecoderStage = true;
+                    (g.FinalLoadedModel, g.CurrentModel, g.CurrentTextEnc, g.CurrentVae) = g.CreateModelLoader(pidModel, "PixelDecoder", sectionId: T2IParamInput.SectionID_PixelDecoder);
+                    g.IsPixelDecoderStage = false;
+                    g.NoVAEOverride = false;
+                    JArray pidPos = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isPixelDecoder: true);
+                    JArray pidNeg = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isPixelDecoder: true);
+                    string pidCond = g.CreateNode("PiDConditioning", new JObject()
+                    {
+                        ["positive"] = pidPos,
+                        ["latent"] = refinedLatent,
+                        ["latent_format"] = pidLatentFormat,
+                        ["degrade_sigma"] = 0.0
+                    });
+                    string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject()
+                    {
+                        ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1),
+                        ["width"] = pidWidth,
+                        ["height"] = pidHeight
+                    });
+                    int pidSteps = g.UserInput.GetNullable(T2IParamTypes.Steps, T2IParamInput.SectionID_PixelDecoder, false) ?? 4;
+                    double pidCfg = g.UserInput.GetNullable(T2IParamTypes.CFGScale, T2IParamInput.SectionID_PixelDecoder, false) ?? 1.0;
+                    string pidSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false);
+                    string pidScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false);
+                    string pidSampled = g.CreateKSampler(g.CurrentModel.Path, [pidCond, 0], pidNeg, [pidEmptyLatent, 0], pidCfg, pidSteps, 0, 10000,
+                        g.UserInput.Get(T2IParamTypes.Seed) + 2, false, true, defsampler: "lcm", defscheduler: "simple", explicitSampler: pidSampler, explicitScheduler: pidScheduler, sectionId: T2IParamInput.SectionID_PixelDecoder);
+                    g.CurrentMedia = g.CurrentMedia.WithPath([pidSampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass);
+                    g.CurrentMedia.Width = pidWidth;
+                    g.CurrentMedia.Height = pidHeight;
+                    g.FinalLoadedModel = refinerFinalModel;
+                    g.FinalLoadedModelList = refinerFinalModelList;
+                }
             }
         }, -4);
         #endregion
diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
index c48be7710..830bfbfc1 100644
--- a/src/Text2Image/T2IModelClassSorter.cs
+++ b/src/Text2Image/T2IModelClassSorter.cs
@@ -208,8 +208,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") &&
         bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj");
         bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias");
         bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias");
-        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight");
         bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight");
+        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && !isPiD(h);
         bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight");
         bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias")))
             || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias")));
diff --git a/src/Text2Image/T2IParamInput.cs b/src/Text2Image/T2IParamInput.cs
index dc8ada4f5..a8ce3b084 100644
--- a/src/Text2Image/T2IParamInput.cs
+++ b/src/Text2Image/T2IParamInput.cs
@@ -13,7 +13,7 @@ namespace SwarmUI.Text2Image;
 public class T2IParamInput
 {
     /// <summary>Core section ID numbers.</summary>
-    public static int SectionID_BaseOnly = 5, SectionID_Refiner = 1, SectionID_Video = 2, SectionID_VideoSwap = 3;
+    public static int SectionID_BaseOnly = 5, SectionID_Refiner = 1, SectionID_Video = 2, SectionID_VideoSwap = 3, SectionID_PixelDecoder = 4;
 
     /// <summary>Parameter IDs that must be loaded early on, eg extracted from presets in prompts early. Primarily things that affect backend selection.</summary>
     public static readonly string[] ParamsMustLoadEarly = ["model", "images", "internalbackendtype", "exactbackendid"];
diff --git a/src/Text2Image/T2IPromptHandling.cs b/src/Text2Image/T2IPromptHandling.cs
index 669e9c70a..723af0ff6 100644
--- a/src/Text2Image/T2IPromptHandling.cs
+++ b/src/Text2Image/T2IPromptHandling.cs
@@ -598,6 +598,12 @@ static string estimateAsSectionBreak(string data, PromptTagContext context)
             return $"<refiner//cid={T2IParamInput.SectionID_Refiner}>";
         };
         PromptTagLengthEstimators["refiner"] = estimateAsSectionBreak;
+        PromptTagBasicProcessors["pixeldecoder"] = (data, context) =>
+        {
+            context.SectionID = T2IParamInput.SectionID_PixelDecoder;
+            return $"<pixeldecoder//cid={T2IParamInput.SectionID_PixelDecoder}>";
+        };
+        PromptTagLengthEstimators["pixeldecoder"] = estimateAsSectionBreak;
         PromptTagBasicProcessors["video"] = (data, context) =>
         {
             context.SectionID = T2IParamInput.SectionID_Video;
diff --git a/src/Utils/PromptRegion.cs b/src/Utils/PromptRegion.cs
index 77a031a3e..b3fc5f402 100644
--- a/src/Utils/PromptRegion.cs
+++ b/src/Utils/PromptRegion.cs
@@ -13,6 +13,8 @@ public class PromptRegion
 
     public string RefinerPrompt = "";
 
+    public string PixelDecoderPrompt = "";
+
     public string VideoPrompt = "";
 
     public string VideoSwapPrompt = "";
@@ -26,7 +28,7 @@ public enum PartType
     public static HashSet<string> CustomPartPrefixes = [];
 
     /// <summary>List of all prefixes for parts. Use <see cref="RegisterCustomPrefix(string)"/> to add to this.</summary>
-    public static List<string> PartPrefixes = ["<region:", "<object:", "<segment:", "<clear:", "<extend:", "<refiner", "<base", "<video"];
+    public static List<string> PartPrefixes = ["<region:", "<object:", "<segment:", "<clear:", "<extend:", "<refiner", "<base", "<video", "<pixeldecoder"];
 
     /// <summary>Custom Extensions can add new prompt part types here.
     /// <para>For example, this will add prompt parsing for &lt;example&gt; or &lt;example:somedata&gt; or etc:
@@ -129,6 +131,12 @@ public PromptRegion(string prompt)
                 addMore = s => RefinerPrompt += s;
                 continue;
             }
+            else if (prefix == "pixeldecoder")
+            {
+                PixelDecoderPrompt += content;
+                addMore = s => PixelDecoderPrompt += s;
+                continue;
+            }
             else if (prefix == "video")
             {
                 VideoPrompt += content;

From 05836eb80b4d12a3d54a90c033d140b85bfc5fc8 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 18:19:35 -0500
Subject: [PATCH 03/13] Cleanup

---
 .../ComfyUIBackend/WorkflowGeneratorModelSupport.cs  |  4 ++--
 src/Text2Image/T2IModelClassSorter.cs                | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
index cae6f4736..4c0ebaaa6 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
@@ -85,10 +85,10 @@ public bool IsKontext()
     /// <summary>Returns true if the current model is Chroma Radiance.</summary>
     public bool IsChromaRadiance() => IsModelCompatClass(T2IModelClassSorter.CompatChromaRadiance);
 
-    /// <summary>Returns true if the current model is NVIDIA PixelDiT.</summary>
+    /// <summary>Returns true if the current model is PixelDiT.</summary>
     public bool IsPixelDiT() => IsModelCompatClass(T2IModelClassSorter.CompatPixelDiT);
 
-    /// <summary>Returns true if the current model is NVIDIA PiD.</summary>
+    /// <summary>Returns true if the current model is PiD.</summary>
     public bool IsPiD() => IsModelCompatClass(T2IModelClassSorter.CompatPiD);
 
     /// <summary>Returns true if the current model is HiDream-i1.</summary>
diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
index 830bfbfc1..b9abb006c 100644
--- a/src/Text2Image/T2IModelClassSorter.cs
+++ b/src/Text2Image/T2IModelClassSorter.cs
@@ -72,8 +72,8 @@ public static T2IModelCompatClass
         CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }),
         CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }),
         CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }),
-        CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }),
         CompatPiD = RegisterCompat(new() { ID = "pid", ShortCode = "PiD", LorasTargetTextEnc = false }),
+        CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }),
         // Audio models
         CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }),
         // Obscure old random ones
@@ -709,14 +709,14 @@ JToken GetEmbeddingKey(JObject h)
         {
             return isChroma(h) && isChromaRadiance(h);
         }});
-        // ====================== NVIDIA PixelDiT / PiD ======================
-        Register(new() { ID = "pixeldit", CompatClass = CompatPixelDiT, Name = "NVIDIA PixelDiT", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) =>
+        // ====================== PixelDiT / PiD ======================
+        Register(new() { ID = "pid", CompatClass = CompatPiD, Name = "PiD", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) =>
         {
-            return isPixelDiT(h);
+            return isPiD(h);
         }});
-        Register(new() { ID = "pid", CompatClass = CompatPiD, Name = "NVIDIA PiD", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) =>
+        Register(new() { ID = "pixeldit", CompatClass = CompatPixelDiT, Name = "PixelDiT", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) =>
         {
-            return isPiD(h);
+            return isPixelDiT(h);
         }});
         Register(new() { ID = "alt_diffusion_v1_512_placeholder", CompatClass = CompatAltDiffusion, Name = "Alt-Diffusion", StandardWidth = 512, StandardHeight = 512, IsThisModelOfClass = (m, h) =>
         {

From ab0665d7f588cb6d54317989d5099a628158b0be Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 18:47:31 -0500
Subject: [PATCH 04/13] Docs for PixelDiT

---
 docs/Model Support.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/Model Support.md b/docs/Model Support.md
index 01db974c0..54fc335f2 100644
--- a/docs/Model Support.md	
+++ b/docs/Model Support.md	
@@ -21,6 +21,7 @@
 [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast |
 [HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality |
 [Lens](#lens) | MMDiT | 2026 | Microsoft | 4B | Minimal | Modern, lightweight |
+[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space |
 
 Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md):
 
@@ -640,6 +641,21 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
     - **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal.
     - **Resolution:** Side length `1440` is the official default, but 1024 is a reasonable option. It retains coherence down to about 512 and up to about 2048.
 
+# PixelDiT
+
+- NVIDIA's [PixelDiT](<https://huggingface.co/Comfy-Org/PixelDiT>) is supported in SwarmUI!
+    - Or the smaller FP8 version: [Comfy-Org/PixelDiT - mxfp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
+    - Download the fat BF16: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
+    - Save in `diffusion_models`
+- It does not use a VAE
+- Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically
+- **Parameters:**
+    - **Sampler:** Default is fine.
+    - **Scheduler:** Default is fine.
+    - **CFG Scale:** `4` is recommended.
+    - **Steps:** `30` is recommended.
+    - **Resolution:** Side length `1024` is the standard.
+
 # Video Models
 
 - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md).

From f10bb98a9a1733eddb4383394aab79f8ce12bfe4 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 18:50:12 -0500
Subject: [PATCH 05/13] doc fix

---
 docs/Model Support.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/Model Support.md b/docs/Model Support.md
index 54fc335f2..d6705bfa5 100644
--- a/docs/Model Support.md	
+++ b/docs/Model Support.md	
@@ -644,8 +644,8 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
 # PixelDiT
 
 - NVIDIA's [PixelDiT](<https://huggingface.co/Comfy-Org/PixelDiT>) is supported in SwarmUI!
-    - Or the smaller FP8 version: [Comfy-Org/PixelDiT - mxfp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
-    - Download the fat BF16: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
+    - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
+    - Or fat BF16 version: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
     - Save in `diffusion_models`
 - It does not use a VAE
 - Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically

From 145fec390a2d0677978e98b06039dd8ce718adc4 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 18:55:15 -0500
Subject: [PATCH 06/13] Add <pixeldecoder> hint

---
 src/wwwroot/js/genpage/gentab/prompttools.js | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/wwwroot/js/genpage/gentab/prompttools.js b/src/wwwroot/js/genpage/gentab/prompttools.js
index b688932d1..cfaa14972 100644
--- a/src/wwwroot/js/genpage/gentab/prompttools.js
+++ b/src/wwwroot/js/genpage/gentab/prompttools.js
@@ -137,6 +137,9 @@ class PromptTabCompleteClass {
         this.registerPrefix('refiner', 'Add a section of prompt text that is only used for the Refine/Upscale pass.', (prefix) => {
             return [];
         }, true);
+        this.registerPrefix('pixeldecoder', 'Add a section of prompt text that is only used for the PiD pixel-decoder upscale pass.', (prefix) => {
+            return [];
+        }, true);
         this.registerPrefix('video', 'Add a section of prompt text that replaces the prompt for the image-to-video generation pass.', (prefix) => {
             return [];
         }, true);

From f42cf7ab8c67f497eec0f9d37717719b7bcbe226 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 19:15:11 -0500
Subject: [PATCH 07/13] Use T2IParamTypes.GetBestModelInList()

---
 .../ComfyUIBackend/WorkflowGeneratorSteps.cs               | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
index d75ccfad9..59e3813b6 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
@@ -1605,7 +1605,12 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                 if (doUpscale && upscaleMethod.StartsWith("pidmodel-"))
                 {
                     string pidModelName = upscaleMethod.After("pidmodel-");
-                    T2IModel pidModel = Program.MainSDModels.GetModel(pidModelName);
+                    string pidMatched = T2IParamTypes.GetBestModelInList(pidModelName, Program.MainSDModels.ListModelNamesFor(g.UserInput.SourceSession));
+                    if (pidMatched is not null && pidMatched.EndsWith(".safetensors"))
+                    {
+                        pidMatched = pidMatched.BeforeLast('.');
+                    }
+                    T2IModel pidModel = pidMatched is null ? null : Program.MainSDModels.GetModel(pidMatched);
                     if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid")
                     {
                         throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model.");

From 4502cb4ffd8be8a79dbc3d12dea7b3f5f2cd3006 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 19:19:08 -0500
Subject: [PATCH 08/13] Add more (pixel) keys

---
 src/Text2Image/T2IModelClassSorter.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
index b9abb006c..87b7dbf8d 100644
--- a/src/Text2Image/T2IModelClassSorter.cs
+++ b/src/Text2Image/T2IModelClassSorter.cs
@@ -208,8 +208,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") &&
         bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj");
         bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias");
         bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias");
-        bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight");
-        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && !isPiD(h);
+        bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight") && h.ContainsKey("net.pixel_blocks.0.attn.q_norm.weight");
+        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && h.ContainsKey("core.pixel_blocks.0.attn.q_norm.weight") && !isPiD(h);
         bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight");
         bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias")))
             || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias")));

From eb099fc51796530a1ebf15723177e1c195340df8 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Wed, 27 May 2026 19:21:57 -0500
Subject: [PATCH 09/13] add more why not

---
 src/Text2Image/T2IModelClassSorter.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
index 87b7dbf8d..df9d24c4c 100644
--- a/src/Text2Image/T2IModelClassSorter.cs
+++ b/src/Text2Image/T2IModelClassSorter.cs
@@ -208,8 +208,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") &&
         bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj");
         bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias");
         bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias");
-        bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight") && h.ContainsKey("net.pixel_blocks.0.attn.q_norm.weight");
-        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && h.ContainsKey("core.pixel_blocks.0.attn.q_norm.weight") && !isPiD(h);
+        bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight") && h.ContainsKey("net.pixel_blocks.0.attn.q_norm.weight") && h.ContainsKey("net.pixel_blocks.0.compress_to_attn.weight");
+        bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && h.ContainsKey("core.pixel_blocks.0.attn.q_norm.weight") && h.ContainsKey("core.pixel_blocks.0.compress_to_attn.weight") && !isPiD(h);
         bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight");
         bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias")))
             || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias")));

From d8716d56fdbd7e7ee1a6448ea3dfaf40fba4f365 Mon Sep 17 00:00:00 2001
From: Juan Treminio <jtreminio@gmail.com>
Date: Thu, 4 Jun 2026 11:20:34 -0600
Subject: [PATCH 10/13] Implements more pathways for DiT

* as base model - requires an Init Image (e.g. one uploaded by the user), since PiD decodes/upscales rather than generates
* as refiner model - if the base model does not use a compatible VAE, load the matching VAE and add a VAE decode/encode pair
* as refiner upscale model - base -> PiD -> downscale or upscale with lanczos (if needed) -> refiner SwarmKSampler
* after the refiner SwarmKSampler - if the refiner model does not use a compatible VAE, load the matching VAE and add a VAE decode/encode pair
---
 .../ComfyUIBackend/ComfyUIBackendExtension.cs |  22 +++
 .../ComfyUIBackend/WorkflowGenerator.cs       | 104 +++++++++++-
 .../WorkflowGeneratorModelSupport.cs          |   2 +-
 .../ComfyUIBackend/WorkflowGeneratorSteps.cs  | 157 +++++++++++-------
 src/Text2Image/T2IModelClass.cs               |   6 +-
 src/Text2Image/T2IModelClassSorter.cs         |  60 ++++---
 6 files changed, 263 insertions(+), 88 deletions(-)

diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
index 0f5fc092e..d63283df9 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
@@ -614,6 +614,8 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo)
 
     public static T2IRegisteredParam<int> RefinerHyperTile, VideoFrameInterpolationMultiplier;
 
+    public static T2IRegisteredParam<T2IModel> PixelDecoderModel;
+
     public static T2IRegisteredParam<string>[] ControlNetPreprocessorParams = new T2IRegisteredParam<string>[3], ControlNetUnionTypeParams = new T2IRegisteredParam<string>[3];
 
     public static List<string> UpscalerModels = ["pixel-lanczos///Pixel: Lanczos (cheap + high quality)", "pixel-bicubic///Pixel: Bicubic (Basic)", "pixel-area///Pixel: Area", "pixel-bilinear///Pixel: Bilinear", "pixel-nearest-exact///Pixel: Nearest-Exact (Pixel art)", "latent-bislerp///Latent: Bislerp", "latent-bicubic///Latent: Bicubic", "latent-area///Latent: Area", "latent-bilinear///Latent: Bilinear", "latent-nearest-exact///Latent: Nearest-Exact"],
@@ -638,6 +640,22 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo)
     /// <summary>Lists PiD decoder models.</summary>
     public static List<string> PidUpscaleModels(Session session) => [.. Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => $"pidmodel-{m.Name}///PiD Model: {m.Name}")];
 
+    /// <summary>Resolves a PiD model by name from the models listed for the given session. Throws a <see cref="SwarmUserErrorException"/> if no model matches or the match is not in the "pid" compat class.</summary>
+    public static T2IModel GetPidModel(string name, Session session)
+    {
+        string matched = T2IParamTypes.GetBestModelInList(name, Program.MainSDModels.ListModelNamesFor(session));
+        if (matched is not null && matched.EndsWith(".safetensors")) // Drop the ".safetensors" extension from the matched name before looking the model up.
+        {
+            matched = matched.BeforeLast('.');
+        }
+        T2IModel model = matched is null ? null : Program.MainSDModels.GetModel(matched);
+        if (model is null || model.ModelClass?.CompatClass?.ID != "pid")
+        {
+            throw new SwarmUserErrorException($"PiD model '{name}' could not be found, or is not a valid PiD model.");
+        }
+        return model;
+    }
+
     public static List<string> IPAdapterModels = ["None"], IPAdapterWeightTypes = ["standard", "prompt is more important", "style transfer"];
 
     public static List<string> GligenModels = ["None"], YoloModels = [], StyleModels = ["None"], SetClipDevices = ["cpu"];
@@ -757,6 +775,10 @@ public override void OnInit()
             "pixel-lanczos", Group: T2IParamTypes.GroupRefiners, OrderPriority: -1, FeatureFlag: "comfyui", ChangeWeight: 1,
             GetValues: (session) => [.. UpscalerModels, .. PidUpscaleModels(session)], DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
             ));
+        PixelDecoderModel = T2IParamTypes.Register<T2IModel>(new("Pixel Decoder Model", "Optionally use a PiD (Pixel Diffusion Decoder) model.",
+            "", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupAdvancedModelAddons, IsAdvanced: true, Subtype: "Stable-Diffusion", ChangeWeight: 4, DoNotPreview: true, OrderPriority: 14,
+            GetValues: (session) => T2IParamTypes.CleanModelList(Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => m.Name))
+            ));
         RefinerSamplerParam = T2IParamTypes.Register<string>(new("Refiner Sampler", SamplerParam.Type.Description + "\nThis is an override to only affect the Refine/Upscale stage.",
             "euler", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupRefinerOverrides, OrderPriority: -2,
             GetValues: (_) => Samplers
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
index 66750efe1..104ffdb57 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
@@ -961,8 +961,13 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
                 latent = [srCond, 2];
             }
         }
+        else if (IsPiD())
+        {
+            defsampler ??= "lcm";
+            defscheduler ??= "simple";
+        }
         // TODO: Registry of model default preferences instead of this
-        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT() || IsPiD())
+        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT())
         {
             defscheduler ??= "simple";
         }
@@ -2520,6 +2525,103 @@ public bool ShouldZeroNegative()
         return false;
     }
 
+    /// <summary>Maps each VAE family key that PiD models exist for to the latent_format value given to the PiDConditioning node. Keys are also matched against PiD model names by <see cref="PidFamilyOfModel(T2IModel)"/>.</summary>
+    public static Dictionary<string, string> PidLatentFormats = new()
+    {
+        ["flux1"] = "flux", // "flux1" and "flux2" are separate family keys that share the same "flux" latent_format value.
+        ["flux2"] = "flux",
+        ["sd3"] = "sd3",
+        ["sdxl"] = "sdxl",
+        ["qwenimage"] = "qwenimage"
+    };
+
+    /// <summary>Guesses which VAE family a PiD model targets by returning the first <see cref="PidLatentFormats"/> key that appears in the lowercased model name, or null if no key appears in it.</summary>
+    public static string PidFamilyOfModel(T2IModel pidModel)
+    {
+        string name = pidModel.Name.ToLowerFast();
+        return PidLatentFormats.Keys.FirstOrDefault(name.Contains); // NOTE(review): "first" follows Dictionary key enumeration order, which .NET documents as unspecified - confirm this is acceptable.
+    }
+
+    /// <summary>Converts media into a latent in the PiD model's native latent space, re-encoding through an auto-loaded matching VAE if needed. Returns that latent together with the PiDConditioning latent_format value to use with it.</summary>
+    public (WGNodeData, string) CreatePidCompatLatent(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae)
+    {
+        string mediaFamily = media.IsLatentData ? media.Compat?.VaeFamily : null;
+        string family = PidFamilyOfModel(pidModel) ?? (mediaFamily is not null && PidLatentFormats.ContainsKey(mediaFamily) ? mediaFamily : "flux1"); // Only trust the media's family when PiD has a format for it, otherwise the lookup below would throw KeyNotFoundException.
+        string format = PidLatentFormats[family];
+        if (mediaFamily == family) // Media is already a latent of the target family, no re-encode needed.
+        {
+            return (media, format);
+        }
+        WGNodeData decoded = media.AsRawImage(decodeVae);
+        (string knownVae, string vaeCompat) = T2IModelClassSorter.VaeFamilies[family];
+        string defaultVae = family switch
+        {
+            "flux1" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE,
+            "flux2" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE,
+            "sd3" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSD3VAE,
+            "sdxl" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSDXLVAE,
+            _ => null
+        };
+        ModelLoadHelpers helpers = new(this);
+        bool priorNoVae = NoVAEOverride;
+        NoVAEOverride = true; // Forced on only for the duration of the VAE load, then restored.
+        helpers.DoVaeLoader(defaultVae, vaeCompat, knownVae);
+        NoVAEOverride = priorNoVae;
+        WGNodeData encodeVae = new(LoadingVAE, this, WGNodeData.DT_VAE, T2IModelClassSorter.CompatClasses[vaeCompat]);
+        return (decoded.EncodeToLatent(encodeVae), format);
+    }
+
+    /// <summary>Creates a PiD pixel-decode stage: converts media to a PiD-space latent, samples at 4x the media size (floored to a multiple of 16) with the PiD model, and returns the decoded result. The prior model, text encoder, VAE, and final-model tracking are restored before returning. When isRefiner is true, the Refiner section and Refiner params are used instead of the PixelDecoder section.</summary>
+    public WGNodeData CreatePixelDecode(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae, long seed, bool isRefiner = false)
+    {
+        (WGNodeData latent, string format) = CreatePidCompatLatent(pidModel, media, decodeVae);
+        T2IModel priorFinalModel = FinalLoadedModel;
+        List<T2IModel> priorFinalModelList = FinalLoadedModelList;
+        WGNodeData priorModel = CurrentModel, priorTextEnc = CurrentTextEnc, priorVae = CurrentVae; // Snapshot generator state so it can be restored after the PiD stage.
+        bool priorNoVae = NoVAEOverride;
+        int sectionId = isRefiner ? T2IParamInput.SectionID_Refiner : T2IParamInput.SectionID_PixelDecoder;
+        FinalLoadedModel = pidModel;
+        FinalLoadedModelList = [pidModel];
+        NoVAEOverride = true;
+        IsPixelDecoderStage = !isRefiner;
+        (FinalLoadedModel, CurrentModel, CurrentTextEnc, CurrentVae) = CreateModelLoader(pidModel, isRefiner ? "Refiner" : "PixelDecoder", sectionId: sectionId);
+        IsPixelDecoderStage = false;
+        NoVAEOverride = priorNoVae;
+        JArray pos = CreateConditioning(UserInput.Get(T2IParamTypes.Prompt), CurrentTextEnc.Path, pidModel, true, isRefiner: isRefiner, isPixelDecoder: !isRefiner);
+        JArray neg = CreateConditioning(UserInput.Get(T2IParamTypes.NegativePrompt), CurrentTextEnc.Path, pidModel, false, isRefiner: isRefiner, isPixelDecoder: !isRefiner);
+        string cond = CreateNode("PiDConditioning", new JObject()
+        {
+            ["positive"] = pos,
+            ["latent"] = latent.Path,
+            ["latent_format"] = format,
+            ["degrade_sigma"] = 0.0
+        });
+        int width = ((media.Width ?? UserInput.GetImageWidth()) * 4 / 16) * 16; // 4x the source size, floored to a multiple of 16.
+        int height = ((media.Height ?? UserInput.GetImageHeight()) * 4 / 16) * 16;
+        string emptyLatent = CreateNode("EmptyChromaRadianceLatentImage", new JObject()
+        {
+            ["batch_size"] = UserInput.Get(T2IParamTypes.BatchSize, 1),
+            ["width"] = width,
+            ["height"] = height
+        });
+        int steps = UserInput.GetNullable(T2IParamTypes.Steps, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerSteps) : null) ?? 4; // Section-specific value first, then the Refiner param (refiner mode only), then the fallback of 4.
+        double cfg = UserInput.GetNullable(T2IParamTypes.CFGScale, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerCFGScale) : null) ?? 1;
+        string explicitSampler = UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null) : null);
+        string explicitScheduler = UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null) : null);
+        string sampled = CreateKSampler(CurrentModel.Path, [cond, 0], neg, [emptyLatent, 0], cfg, steps, 0, 10000, seed, false, true,
+            explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: sectionId);
+        WGNodeData result = media.WithPath([sampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass);
+        result.Width = width;
+        result.Height = height;
+        result = result.DecodeLatents(CurrentVae, false);
+        FinalLoadedModel = priorFinalModel; // Restore the generator state captured at the top of this method.
+        FinalLoadedModelList = priorFinalModelList;
+        CurrentModel = priorModel;
+        CurrentTextEnc = priorTextEnc;
+        CurrentVae = priorVae;
+        return result;
+    }
+
     /// <summary>Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant.</summary>
     public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false)
     {
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
index 4c0ebaaa6..7ebc02cc5 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
@@ -407,7 +407,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n
                 ["width"] = width
             }, id), frames);
         }
-        else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT())
+        else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT() || IsPiD())
         {
             return resultImage(CreateNode("EmptyChromaRadianceLatentImage", new JObject()
             {
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
index 59e3813b6..ee35d9c5e 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
@@ -1360,6 +1360,34 @@ bool getBestFor(string phrase)
             {
                 endStep = (int)(steps * (1 - endEarly));
             }
+            if (g.IsPiD()) // PiD as the main model: requires an init image, which is fed into PiDConditioning below
+            {
+                if (g.BasicInputImage is null)
+                {
+                    throw new SwarmUserErrorException("PiD models are pixel decoders/upscalers, not image generators, an Init Image is required.");
+                }
+                (WGNodeData pidLatent, string pidFormat) = g.CreatePidCompatLatent(g.FinalLoadedModel, g.BasicInputImage, g.CurrentVae);
+                string pidCond = g.CreateNode("PiDConditioning", new JObject()
+                {
+                    ["positive"] = g.FinalPrompt,
+                    ["latent"] = pidLatent.Path,
+                    ["latent_format"] = pidFormat,
+                    ["degrade_sigma"] = 0.0
+                });
+                g.FinalPrompt = [pidCond, 0]; // FinalPrompt now points at the PiDConditioning node's first output
+                int pidWidth = (g.UserInput.GetImageWidth() * 4 / 16) * 16; // 4x the requested width, rounded down to a multiple of 16
+                int pidHeight = (g.UserInput.GetImageHeight() * 4 / 16) * 16; // same for the height
+                string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject()
+                {
+                    ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1),
+                    ["width"] = pidWidth,
+                    ["height"] = pidHeight
+                });
+                g.CurrentMedia = new WGNodeData([pidEmptyLatent, 0], g, WGNodeData.DT_LATENT_IMAGE, g.CurrentCompat()) { Width = pidWidth, Height = pidHeight }; // CurrentMedia becomes the empty latent; the init image only feeds the conditioning above
+                startStep = 0; // NOTE(review): 0..10000 presumably means "run every step" - the same pair is passed to CreateKSampler elsewhere in this change; confirm
+                endStep = 10000;
+                g.MainSamplerAddNoise = true;
+            }
             double cfg = g.UserInput.Get(T2IParamTypes.CFGScale);
             if (!noSkip && (steps == 0 || endStep <= startStep))
             {
@@ -1444,6 +1472,36 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                     }
                     loaderNodeId = "20";
                 }
+                if (refineModel.ModelClass?.CompatClass?.ID == "pid") // refiner model is a PiD model: run it through CreatePixelDecode and leave this step early
+                {
+                    if (g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false))
+                    {
+                        g.CurrentMedia.DecodeLatents(origVae, false, "24").SaveOutput(null, null, id: "29"); // save the pre-PiD decode as an extra output
+                    }
+                    WGNodeData pidDecoded = g.CreatePixelDecode(refineModel, g.CurrentMedia, origVae, g.UserInput.Get(T2IParamTypes.Seed) + 1, isRefiner: true);
+                    if (g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double pidUpscale) && pidUpscale != 1)
+                    {
+                        int targetWidth = ((int)Math.Round(g.UserInput.GetImageWidth() * pidUpscale) / 16) * 16; // requested upscale size, rounded down to a multiple of 16
+                        int targetHeight = ((int)Math.Round(g.UserInput.GetImageHeight() * pidUpscale) / 16) * 16;
+                        if (targetWidth != pidDecoded.Width || targetHeight != pidDecoded.Height) // only resize when the PiD output is not already the requested size
+                        {
+                            g.CreateNode("ImageScale", new JObject()
+                            {
+                                ["image"] = pidDecoded.Path,
+                                ["width"] = targetWidth,
+                                ["height"] = targetHeight,
+                                ["upscale_method"] = "lanczos",
+                                ["crop"] = "disabled"
+                            }, "26");
+                            pidDecoded = pidDecoded.WithPath(["26", 0]);
+                            pidDecoded.Width = targetWidth;
+                            pidDecoded.Height = targetHeight;
+                        }
+                    }
+                    g.CurrentMedia = pidDecoded;
+                    g.IsRefinerStage = false; // clear the refiner-stage flag before the early return, as the normal end of this step also does
+                    return;
+                }
                 if (g.UserInput.TryGet(T2IParamTypes.RefinerVAE, out _))
                 {
                     modelMustReencode = true;
@@ -1460,11 +1518,41 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                 string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None");
                 // TODO: Better same-VAE check
                 bool doPixelUpscale = doUpscale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-"));
+                bool doPidUpscale = doUpscale && upscaleMethod.StartsWith("pidmodel-");
                 int width = (int)Math.Round(g.UserInput.GetImageWidth() * refineUpscale);
                 int height = (int)Math.Round(g.UserInput.GetImageHeight() * refineUpscale);
                 width = (width / 16) * 16; // avoid unworkable output sizes
                 height = (height / 16) * 16;
-                if (modelMustReencode || doPixelUpscale || doSave || g.MaskShrunkInfo.BoundsNode is not null)
+                if (doPidUpscale) // "pidmodel-" upscale method: decode through the named PiD model, then resize to the requested refiner size if needed
+                {
+                    T2IModel pidModel = ComfyUIBackendExtension.GetPidModel(upscaleMethod.After("pidmodel-"), g.UserInput.SourceSession);
+                    WGNodeData decoded = g.CreatePixelDecode(pidModel, g.CurrentMedia, origVae, g.UserInput.Get(T2IParamTypes.Seed) + 2);
+                    if (doSave)
+                    {
+                        decoded.SaveOutput(null, null, id: "29");
+                    }
+                    if (decoded.Width != width || decoded.Height != height) // PiD output size differs from the requested size: rescale the pixels to match
+                    {
+                        g.CreateNode("ImageScale", new JObject()
+                        {
+                            ["image"] = decoded.Path,
+                            ["width"] = width,
+                            ["height"] = height,
+                            ["upscale_method"] = "lanczos",
+                            ["crop"] = "disabled"
+                        }, "26");
+                        decoded = decoded.WithPath(["26", 0]);
+                        decoded.Width = width;
+                        decoded.Height = height;
+                    }
+                    g.CurrentMedia = refinerControl <= 0 ? decoded : decoded.EncodeToLatent(g.CurrentVae, "25"); // re-encode (node "25") only when a refiner sampling pass will follow
+                    if (refinerControl <= 0)
+                    {
+                        g.IsRefinerStage = false; // every other exit of this step clears the flag; without this the early return would leave it set for later steps
+                        return;
+                    }
+                }
+                else if (modelMustReencode || doPixelUpscale || doSave || g.MaskShrunkInfo.BoundsNode is not null)
                 {
                     WGNodeData decoded = g.CurrentMedia.DecodeLatents(origVae, false, "24");
                     JArray maskShrunk = doMaskShrinkApply(g, decoded.Path);
@@ -1602,71 +1690,20 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
                     explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner);
                 g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0]);
                 g.IsRefinerStage = false;
-                if (doUpscale && upscaleMethod.StartsWith("pidmodel-"))
-                {
-                    string pidModelName = upscaleMethod.After("pidmodel-");
-                    string pidMatched = T2IParamTypes.GetBestModelInList(pidModelName, Program.MainSDModels.ListModelNamesFor(g.UserInput.SourceSession));
-                    if (pidMatched is not null && pidMatched.EndsWith(".safetensors"))
-                    {
-                        pidMatched = pidMatched.BeforeLast('.');
-                    }
-                    T2IModel pidModel = pidMatched is null ? null : Program.MainSDModels.GetModel(pidMatched);
-                    if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid")
-                    {
-                        throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model.");
-                    }
-                    string pidLatentFormat = g.IsSD3() ? "sd3" : (g.IsFlux() || g.IsAnyFlux2() || g.IsZImage() || g.IsZetaChroma()) ? "flux" : null;
-                    if (pidLatentFormat is null)
-                    {
-                        throw new SwarmUserErrorException($"PiD model requires the refiner model's VAE to be Flux.1, Flux.2, or SD3, but model '{refineModel.Name}' is '{refineModel.ModelClass?.CompatClass?.ID ?? "unknown"}'.");
-                    }
-                    JArray refinedLatent = g.CurrentMedia.Path;
-                    int pidWidth = g.UserInput.GetImageWidth() * 4;
-                    int pidHeight = g.UserInput.GetImageHeight() * 4;
-                    pidWidth = (pidWidth / 16) * 16;
-                    pidHeight = (pidHeight / 16) * 16;
-                    T2IModel refinerFinalModel = g.FinalLoadedModel;
-                    List<T2IModel> refinerFinalModelList = g.FinalLoadedModelList;
-                    g.FinalLoadedModel = pidModel;
-                    g.FinalLoadedModelList = [pidModel];
-                    g.NoVAEOverride = true;
-                    g.IsPixelDecoderStage = true;
-                    (g.FinalLoadedModel, g.CurrentModel, g.CurrentTextEnc, g.CurrentVae) = g.CreateModelLoader(pidModel, "PixelDecoder", sectionId: T2IParamInput.SectionID_PixelDecoder);
-                    g.IsPixelDecoderStage = false;
-                    g.NoVAEOverride = false;
-                    JArray pidPos = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isPixelDecoder: true);
-                    JArray pidNeg = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isPixelDecoder: true);
-                    string pidCond = g.CreateNode("PiDConditioning", new JObject()
-                    {
-                        ["positive"] = pidPos,
-                        ["latent"] = refinedLatent,
-                        ["latent_format"] = pidLatentFormat,
-                        ["degrade_sigma"] = 0.0
-                    });
-                    string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject()
-                    {
-                        ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1),
-                        ["width"] = pidWidth,
-                        ["height"] = pidHeight
-                    });
-                    int pidSteps = g.UserInput.GetNullable(T2IParamTypes.Steps, T2IParamInput.SectionID_PixelDecoder, false) ?? 4;
-                    double pidCfg = g.UserInput.GetNullable(T2IParamTypes.CFGScale, T2IParamInput.SectionID_PixelDecoder, false) ?? 1.0;
-                    string pidSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false);
-                    string pidScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false);
-                    string pidSampled = g.CreateKSampler(g.CurrentModel.Path, [pidCond, 0], pidNeg, [pidEmptyLatent, 0], pidCfg, pidSteps, 0, 10000,
-                        g.UserInput.Get(T2IParamTypes.Seed) + 2, false, true, defsampler: "lcm", defscheduler: "simple", explicitSampler: pidSampler, explicitScheduler: pidScheduler, sectionId: T2IParamInput.SectionID_PixelDecoder);
-                    g.CurrentMedia = g.CurrentMedia.WithPath([pidSampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass);
-                    g.CurrentMedia.Width = pidWidth;
-                    g.CurrentMedia.Height = pidHeight;
-                    g.FinalLoadedModel = refinerFinalModel;
-                    g.FinalLoadedModelList = refinerFinalModelList;
-                }
             }
         }, -4);
         #endregion
         #region VAEDecode
         AddStep(g =>
         {
+            if (g.UserInput.TryGet(ComfyUIBackendExtension.PixelDecoderModel, out T2IModel pixelDecoder) && g.CurrentMedia.DataType == WGNodeData.DT_LATENT_IMAGE) // a Pixel Decoder Model is set and the media is still a latent image: run the PiD decode first
+            {
+                if (pixelDecoder.ModelClass?.CompatClass?.ID != "pid") // only models whose compat class is "pid" are accepted here
+                {
+                    throw new SwarmUserErrorException($"Pixel Decoder Model is set to '{pixelDecoder.Name}', but that is not a PiD model.");
+                }
+                g.CurrentMedia = g.CreatePixelDecode(pixelDecoder, g.CurrentMedia, g.CurrentVae, g.UserInput.Get(T2IParamTypes.Seed) + 3);
+            }
             g.CurrentMedia = g.CurrentMedia.DecodeLatents(g.CurrentVae, null, "8");
             JArray maskShrinkApply = doMaskShrinkApply(g, g.CurrentMedia.Path);
             g.CurrentMedia = g.CurrentMedia.WithPath(maskShrinkApply);
diff --git a/src/Text2Image/T2IModelClass.cs b/src/Text2Image/T2IModelClass.cs
index 1a34278d5..de6678cf0 100644
--- a/src/Text2Image/T2IModelClass.cs
+++ b/src/Text2Image/T2IModelClass.cs
@@ -57,6 +57,9 @@ public record class T2IModelCompatClass
     /// <summary>If true, this is a model that primarily operates on audio.</summary>
     public bool IsAudioModel = false;
 
+    /// <summary>If this class natively works in a standard shared VAE/latent space, the ID of that family (a key of <see cref="T2IModelClassSorter.VaeFamilies"/>). Left null when no shared family is declared for the class.</summary>
+    public string VaeFamily = null;
+
     /// <summary>Get a networkable JObject for this compat class.</summary>
     public JObject ToNetData()
     {
@@ -67,7 +70,8 @@ public JObject ToNetData()
             ["loras_target_text_enc"] = LorasTargetTextEnc,
             ["is_text2video"] = IsText2Video,
             ["is_image2video"] = IsImage2Video,
-            ["is_audio_model"] = IsAudioModel
+            ["is_audio_model"] = IsAudioModel,
+            ["vae_family"] = VaeFamily
         };
     }
 }
diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
index df9d24c4c..1a8a08c24 100644
--- a/src/Text2Image/T2IModelClassSorter.cs
+++ b/src/Text2Image/T2IModelClassSorter.cs
@@ -38,62 +38,72 @@ public static T2IModelCompatClass
         CompatSdv1 = RegisterCompat(new() { ID = "stable-diffusion-v1", ShortCode = "SDv1" }),
         CompatSdv2 = RegisterCompat(new() { ID = "stable-diffusion-v2", ShortCode = "SDv2" }),
         CompatSdv2Turbo = RegisterCompat(new() { ID = "stable-diffusion-v2-turbo", ShortCode = "SDv2" }),
-        CompatSdxl = RegisterCompat(new() { ID = "stable-diffusion-xl-v1", ShortCode = "SDXL" }),
-        CompatSdxlRefiner = RegisterCompat(new() { ID = "stable-diffusion-xl-v1-refiner", ShortCode = "SDXL" }),
+        CompatSdxl = RegisterCompat(new() { ID = "stable-diffusion-xl-v1", ShortCode = "SDXL", VaeFamily = "sdxl" }),
+        CompatSdxlRefiner = RegisterCompat(new() { ID = "stable-diffusion-xl-v1-refiner", ShortCode = "SDXL", VaeFamily = "sdxl" }),
         CompatSvd = RegisterCompat(new() { ID = "stable-video-diffusion-img2vid-v1", ShortCode = "SVD", IsImage2Video = true }),
         CompatCascade = RegisterCompat(new() { ID = "stable-cascade-v1", ShortCode = "Casc" }),
-        CompatSd3Medium = RegisterCompat(new() { ID = "stable-diffusion-v3-medium", ShortCode = "SD3m" }),
-        CompatSd35Large = RegisterCompat(new() { ID = "stable-diffusion-v3.5-large", ShortCode = "SD35L" }),
-        CompatSd35Medium = RegisterCompat(new() { ID = "stable-diffusion-v3.5-medium", ShortCode = "SD35m" }),
-        CompatSd3 = RegisterCompat(new() { ID = "stable-diffusion-v3", ShortCode = "SD3" }),
+        CompatSd3Medium = RegisterCompat(new() { ID = "stable-diffusion-v3-medium", ShortCode = "SD3m", VaeFamily = "sd3" }),
+        CompatSd35Large = RegisterCompat(new() { ID = "stable-diffusion-v3.5-large", ShortCode = "SD35L", VaeFamily = "sd3" }),
+        CompatSd35Medium = RegisterCompat(new() { ID = "stable-diffusion-v3.5-medium", ShortCode = "SD35m", VaeFamily = "sd3" }),
+        CompatSd3 = RegisterCompat(new() { ID = "stable-diffusion-v3", ShortCode = "SD3", VaeFamily = "sd3" }),
         // 2024-2025 era models
-        CompatFlux = RegisterCompat(new() { ID = "flux-1", ShortCode = "Flux", LorasTargetTextEnc = false }),
+        CompatFlux = RegisterCompat(new() { ID = "flux-1", ShortCode = "Flux", LorasTargetTextEnc = false, VaeFamily = "flux1" }),
         CompatWan21 = RegisterCompat(new() { ID = "wan-21", ShortCode = "Wan14B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
         CompatWan21_1_3b = RegisterCompat(new() { ID = "wan-21-1_3b", ShortCode = "Wan1B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
         CompatWan21_14b = RegisterCompat(new() { ID = "wan-21-14b", ShortCode = "Wan14B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
         CompatWan22_5b = RegisterCompat(new() { ID = "wan-22-5b", ShortCode = "Wan5B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
         CompatHunyuanVideo = RegisterCompat(new() { ID = "hunyuan-video", ShortCode = "HyVid", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
-        CompatChroma = RegisterCompat(new() { ID = "chroma", ShortCode = "Chroma" }),
+        CompatChroma = RegisterCompat(new() { ID = "chroma", ShortCode = "Chroma", VaeFamily = "flux1" }),
         CompatChromaRadiance = RegisterCompat(new() { ID = "chroma-radiance", ShortCode = "ChrRad" }),
         CompatLtxv = RegisterCompat(new() { ID = "lightricks-ltx-video", ShortCode = "LTXV", IsText2Video = true, IsImage2Video = true }),
-        CompatLumina2 = RegisterCompat(new() { ID = "lumina-2", ShortCode = "Lumi2" }),
-        CompatQwenImage = RegisterCompat(new() { ID = "qwen-image", ShortCode = "Qwen", LorasTargetTextEnc = false }),
+        CompatLumina2 = RegisterCompat(new() { ID = "lumina-2", ShortCode = "Lumi2", VaeFamily = "flux1" }),
+        CompatQwenImage = RegisterCompat(new() { ID = "qwen-image", ShortCode = "Qwen", LorasTargetTextEnc = false, VaeFamily = "qwenimage" }),
         CompatHunyuanImage2_1 = RegisterCompat(new() { ID = "hunyuan-image-2_1", ShortCode = "HyImg", LorasTargetTextEnc = false }),
         CompatHunyuanImage2_1Refiner = RegisterCompat(new() { ID = "hunyuan-image-2_1-refiner", ShortCode = "HyImg", LorasTargetTextEnc = false }),
         CompatHunyuanVideo1_5 = RegisterCompat(new() { ID = "hunyuan-video-1_5", ShortCode = "HyVid", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
         // 2025-2026 era models
-        CompatFlux2 = RegisterCompat(new() { ID = "flux-2", ShortCode = "Flux2", LorasTargetTextEnc = false }),
-        CompatFlux2Klein4B = RegisterCompat(new() { ID = "flux-2-klein-4b", ShortCode = "Fl2K4", LorasTargetTextEnc = false }),
-        CompatFlux2Klein9B = RegisterCompat(new() { ID = "flux-2-klein-9b", ShortCode = "Fl2K9", LorasTargetTextEnc = false }),
-        CompatErnieImage = RegisterCompat(new() { ID = "ernie-image", ShortCode = "Ernie", LorasTargetTextEnc = false }),
+        CompatFlux2 = RegisterCompat(new() { ID = "flux-2", ShortCode = "Flux2", LorasTargetTextEnc = false, VaeFamily = "flux2" }),
+        CompatFlux2Klein4B = RegisterCompat(new() { ID = "flux-2-klein-4b", ShortCode = "Fl2K4", LorasTargetTextEnc = false, VaeFamily = "flux2" }),
+        CompatFlux2Klein9B = RegisterCompat(new() { ID = "flux-2-klein-9b", ShortCode = "Fl2K9", LorasTargetTextEnc = false, VaeFamily = "flux2" }),
+        CompatErnieImage = RegisterCompat(new() { ID = "ernie-image", ShortCode = "Ernie", LorasTargetTextEnc = false, VaeFamily = "flux2" }),
         CompatLtxv2 = RegisterCompat(new() { ID = "lightricks-ltx-video-2", ShortCode = "LTXV2", IsText2Video = true, IsImage2Video = true }),
-        CompatZImage = RegisterCompat(new() { ID = "z-image", ShortCode = "ZImg", LorasTargetTextEnc = false }),
+        CompatZImage = RegisterCompat(new() { ID = "z-image", ShortCode = "ZImg", LorasTargetTextEnc = false, VaeFamily = "flux1" }),
         CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }),
-        CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }),
+        CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false, VaeFamily = "qwenimage" }),
         CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }),
-        CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }),
+        CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false, VaeFamily = "flux2" }),
         CompatPiD = RegisterCompat(new() { ID = "pid", ShortCode = "PiD", LorasTargetTextEnc = false }),
         CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }),
         // Audio models
         CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }),
         // Obscure old random ones
-        CompatAuraFlow = RegisterCompat(new() { ID = "auraflow-v1", ShortCode = "Aura" }),
-        CompatHiDreamI1 = RegisterCompat(new() { ID = "hidream-i1", ShortCode = "HiDrm", LorasTargetTextEnc = false }),
-        CompatOmniGen2 = RegisterCompat(new() { ID = "omnigen-2", ShortCode = "Omni2" }),
-        CompatSegmindStableDiffusion1b = RegisterCompat(new() { ID = "segmind-stable-diffusion-1b", ShortCode = "SSD1B" }),
+        CompatAuraFlow = RegisterCompat(new() { ID = "auraflow-v1", ShortCode = "Aura", VaeFamily = "sdxl" }),
+        CompatHiDreamI1 = RegisterCompat(new() { ID = "hidream-i1", ShortCode = "HiDrm", LorasTargetTextEnc = false, VaeFamily = "flux1" }),
+        CompatOmniGen2 = RegisterCompat(new() { ID = "omnigen-2", ShortCode = "Omni2", VaeFamily = "flux1" }),
+        CompatSegmindStableDiffusion1b = RegisterCompat(new() { ID = "segmind-stable-diffusion-1b", ShortCode = "SSD1B", VaeFamily = "sdxl" }),
         CompatCosmos = RegisterCompat(new() { ID = "nvidia-cosmos-1", ShortCode = "Cosmos", IsText2Video = true, IsImage2Video = true }),
         CompatCosmosPredict2_2b = RegisterCompat(new() { ID = "nvidia-cosmos-predict2-t2i-2b", ShortCode = "Pred2", IsText2Video = true }),
         CompatCosmosPredict2_14b = RegisterCompat(new() { ID = "nvidia-cosmos-predict2-t2i-14b", ShortCode = "Pred2", IsText2Video = true }),
         CompatAltDiffusion = RegisterCompat(new() { ID = "alt_diffusion_v1", ShortCode = "AltD" }),
         CompatSana = RegisterCompat(new() { ID = "nvidia-sana-1600", ShortCode = "Sana" }),
-        CompatPixartMsSigmaXl2 = RegisterCompat(new() { ID = "pixart-ms-sigma-xl-2", ShortCode = "Pix" }),
-        CompatOvis = RegisterCompat(new() { ID = "ovis", ShortCode = "Ovis", LorasTargetTextEnc = false }),
-        CompatLongcatImage = RegisterCompat(new() { ID = "longcat-image", ShortCode = "LCat", LorasTargetTextEnc = false }),
+        CompatPixartMsSigmaXl2 = RegisterCompat(new() { ID = "pixart-ms-sigma-xl-2", ShortCode = "Pix", VaeFamily = "sdxl" }),
+        CompatOvis = RegisterCompat(new() { ID = "ovis", ShortCode = "Ovis", LorasTargetTextEnc = false, VaeFamily = "flux1" }),
+        CompatLongcatImage = RegisterCompat(new() { ID = "longcat-image", ShortCode = "LCat", LorasTargetTextEnc = false, VaeFamily = "flux1" }),
         CompatGenmoMochi = RegisterCompat(new() { ID = "genmo-mochi-1", IsText2Video = true, ShortCode = "Mochi" }),
-        CompatKandinsky5ImgLite = RegisterCompat(new() { ID = "kandinsky5-imglite", ShortCode = "Kan5IL", LorasTargetTextEnc = false }),
+        CompatKandinsky5ImgLite = RegisterCompat(new() { ID = "kandinsky5-imglite", ShortCode = "Kan5IL", LorasTargetTextEnc = false, VaeFamily = "flux1" }),
         CompatKandinsky5VidLite = RegisterCompat(new() { ID = "kandinsky5-vidlite", ShortCode = "Kan5VL", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }),
         CompatKandinsky5VidPro = RegisterCompat(new() { ID = "kandinsky5-vidpro", ShortCode = "Kan5VP", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true });
 
+    /// <summary>Standard shared VAE/latent-space families, keyed by the family ID stored in <see cref="T2IModelCompatClass.VaeFamily"/>. 'VaeCompat' is the ID of a registered compat class for that family (eg 'flux-1'); 'KnownVae' is presumably the name of a known VAE for it (TODO confirm where it is resolved).</summary>
+    public static Dictionary<string, (string KnownVae, string VaeCompat)> VaeFamilies = new()
+    {
+        ["flux1"] = ("flux-ae", "flux-1"),
+        ["flux2"] = ("flux2-vae", "flux-2"),
+        ["sd3"] = ("sd35-vae", "stable-diffusion-v3"),
+        ["sdxl"] = ("sdxl-vae", "stable-diffusion-xl-v1"),
+        ["qwenimage"] = ("qwen-image-vae", "qwen-image")
+    };
+
     /// <summary>Initialize the class sorter.</summary>
     public static void Init()
     {

From 5a796a3164e294071b4b3c547941da0f40d9dbae Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Thu, 4 Jun 2026 15:46:34 -0700
Subject: [PATCH 11/13] minor doc

---
 docs/Model Support.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Model Support.md b/docs/Model Support.md
index a1c8f1ab0..ac2348e58 100644
--- a/docs/Model Support.md	
+++ b/docs/Model Support.md	
@@ -650,7 +650,7 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
 - It does not use a VAE
 - Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically
 - **Parameters:**
-    - **Sampler:** Default is fine.
+    - **Sampler:** Default is fine (LCM).
     - **Scheduler:** Default is fine.
     - **CFG Scale:** `4` is recommended.
     - **Steps:** `30` is recommended.

From dccc351d86dccdba18a6dea49a2880912e2e9ddb Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Thu, 4 Jun 2026 18:21:46 -0700
Subject: [PATCH 12/13] docs

---
 docs/Features/README.md            |  1 +
 docs/Features/Upscaling.md         |  9 +++++++++
 docs/Model Support.md              | 17 +----------------
 docs/Obscure Model Support.md      | 17 +++++++++++++++++
 launchtools/comfy-install-linux.sh |  2 +-
 5 files changed, 29 insertions(+), 17 deletions(-)
 create mode 100644 docs/Features/Upscaling.md

diff --git a/docs/Features/README.md b/docs/Features/README.md
index de695173e..6b29e9e72 100644
--- a/docs/Features/README.md
+++ b/docs/Features/README.md
@@ -11,3 +11,4 @@ See [The Docs Readme](/docs/README.md) for general listing of documentation and
 - [Webhooks](/docs/Features/Webhooks.md) for info about custom defined webhooks triggered by your SwarmUI server.
 - [UISounds](/docs/Features/UISounds.md) for info about sound playback in the UI (eg a sound to play after generations complete).
 - [AutoScalingBackend](/docs/Features/AutoScalingBackend.md) for info about the specialty advanced usage "Auto-Scaling" backend (for Slurm/Kubernetes/etc).
+- [Upscaling](/docs/Features/Upscaling.md) for info about upscaling images and videos (ie increasing resolution, especially to improve quality).
diff --git a/docs/Features/Upscaling.md b/docs/Features/Upscaling.md
new file mode 100644
index 000000000..0da35122e
--- /dev/null
+++ b/docs/Features/Upscaling.md
@@ -0,0 +1,9 @@
+# Upscaling In SwarmUI
+
+(TODO)
+
+# Pixel Decoder (PiD)
+
+(TODO)
+
+PiD model downloads are here: <https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models>
diff --git a/docs/Model Support.md b/docs/Model Support.md
index ac2348e58..c2697e2cb 100644
--- a/docs/Model Support.md	
+++ b/docs/Model Support.md	
@@ -21,7 +21,6 @@
 [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast |
 [HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality |
 [Lens](#lens) | MMDiT | 2026 | Microsoft | 4B | Minimal | Modern, lightweight |
-[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space |
 
 Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md):
 
@@ -39,6 +38,7 @@ Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure
 [Ovis](/docs/Obscure%20Model%20Support.md#ovis) | MMDiT | 2025 | AIDC-AI (Alibaba) | 7B | No | Passable quality, but outclassed on launch |
 [LongCat-Image](/docs/Obscure%20Model%20Support.md#longcat-image) | MMDiT | 2025 | LongCat | 6B | No | Passable quality, but outclassed on launch |
 [Zeta Chroma](/docs/Obscure%20Model%20Support.md#zeta-chroma) | Pixel S3-DiT | 2026 | Lodestone Rock | 6B | No | Modern, Pixel-space Z-Image variant |
+[PixelDiT](/docs/Obscure%20Model%20Support.md#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space, but very bad relative quality on launch |
 
 - **Architecture** is the fundamental machine learning structure used for the model, UNet's were used in the past but DiT (Diffusion Transformers) are the modern choice
 - **Scale** is how big the model is - "B" for "Billion", so for example "2B" means "Two billion parameters".
@@ -641,21 +641,6 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
     - **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal.
     - **Resolution:** Side length `1440` is the official default, but 1024 is a reasonable option. It retains coherence down to about 512 and up to about 2048.
 
-# PixelDiT
-
-- NVIDIA's [PixelDiT](<https://huggingface.co/Comfy-Org/PixelDiT>) is supported in SwarmUI!
-    - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
-    - Or fat BF16 version: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
-    - Save in `diffusion_models`
-- It does not use a VAE
-- Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically
-- **Parameters:**
-    - **Sampler:** Default is fine (LCM).
-    - **Scheduler:** Default is fine.
-    - **CFG Scale:** `4` is recommended.
-    - **Steps:** `30` is recommended.
-    - **Resolution:** Side length `1024` is the standard.
-
 # Video Models
 
 - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md).
diff --git a/docs/Obscure Model Support.md b/docs/Obscure Model Support.md
index bf0064c7b..31ab00063 100644
--- a/docs/Obscure Model Support.md	
+++ b/docs/Obscure Model Support.md	
@@ -18,6 +18,7 @@ This doc tracks specifically the old, bad, unpopular, etc. models that are suppo
 [Ovis](#ovis) | MMDiT | 2025 | AIDC-AI (Alibaba) | 7B | No | Passable quality, but outclassed on launch |
 [LongCat-Image](#longcat-image) | MMDiT | 2025 | LongCat | 6B | No | Passable quality, but outclassed on launch |
 [Zeta Chroma](#zeta-chroma) | Pixel S3-DiT | 2026 | Lodestone Rock | 6B | No | Modern, Pixel-space Z-Image variant |
+[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space, but very bad relative quality on launch |
 
 Obscure video models are tracked at the [Video Models heading](#video-models)
 
@@ -200,6 +201,22 @@ These steps are not friendly to beginners (if Sana gains popularity, likely more
     - **Scheduler:** Default is fine
     - **Resolution:** Side length `1024` is the standard, broadly supports the same range as regular Z-Image (roughly 512 to 2048)
 
+# PixelDiT
+
+- NVIDIA's [PixelDiT](<https://huggingface.co/Comfy-Org/PixelDiT>) is supported in SwarmUI!
+    - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
+    - Or the larger BF16 version: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
+    - Save in `diffusion_models`
+    - Released as a pair with PiD (Pixel Decoder), which is a separate pixel decode/upscale model. See [Features/Upscaling: PiD](/docs/Features/Upscaling.md#pixel-decoder-pid) for more info.
+- It does not use a VAE
+- Uses the Gemma 2 2B text encoder, which will be downloaded and handled automatically
+- **Parameters:**
+    - **Sampler:** Default is fine.
+    - **Scheduler:** Default is fine.
+    - **CFG Scale:** `4` is recommended.
+    - **Steps:** `30` is recommended.
+    - **Resolution:** Side length `1024` is the standard.
+
 --------------------------------------------------------------------------
 
 # Video Models
diff --git a/launchtools/comfy-install-linux.sh b/launchtools/comfy-install-linux.sh
index 77778e945..c006ce1b2 100644
--- a/launchtools/comfy-install-linux.sh
+++ b/launchtools/comfy-install-linux.sh
@@ -64,7 +64,7 @@ fi
 # Install PyTorch based on GPU type
 if [ "$GPU_TYPE" == "nv" ]; then
     echo "install nvidia torch..."
-    $python -s -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129 --no-cache-dir
+    $python -s -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 --no-cache-dir
 elif [ "$GPU_TYPE" == "amd" ]; then
     echo "install amd torch..."
     $python -s -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1 --no-cache-dir

From a774977242a61ae6a9beffdf48bffe0ef0948cb0 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Thu, 4 Jun 2026 18:27:55 -0700
Subject: [PATCH 13/13] base sampler input wrong pattern

---
 .../ComfyUIBackend/WorkflowGeneratorSteps.cs                | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
index ee35d9c5e..01148a142 100644
--- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
+++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
@@ -1362,11 +1362,7 @@ bool getBestFor(string phrase)
             }
             if (g.IsPiD())
             {
-                if (g.BasicInputImage is null)
-                {
-                    throw new SwarmUserErrorException("PiD models are pixel decoders/upscalers, not image generators, an Init Image is required.");
-                }
-                (WGNodeData pidLatent, string pidFormat) = g.CreatePidCompatLatent(g.FinalLoadedModel, g.BasicInputImage, g.CurrentVae);
+                (WGNodeData pidLatent, string pidFormat) = g.CreatePidCompatLatent(g.FinalLoadedModel, g.CurrentMedia, g.CurrentVae);
                 string pidCond = g.CreateNode("PiDConditioning", new JObject()
                 {
                     ["positive"] = g.FinalPrompt,