From 33bac0c20a18d6b7436dd7e4ae5763c165200212 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Tue, 26 May 2026 12:28:18 -0500 Subject: [PATCH 01/13] Adds NVIDIA PixelDiT and PiD support --- .../ComfyUIBackend/WorkflowGenerator.cs | 2 +- .../WorkflowGeneratorModelSupport.cs | 20 +++++++- .../ComfyUIBackend/WorkflowGeneratorSteps.cs | 46 +++++++++++++++++-- src/Text2Image/T2IModelClassSorter.cs | 13 ++++++ 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs index cadb9a665..d468224f7 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs @@ -959,7 +959,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent } } // TODO: Registry of model default preferences instead of this - else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1()) + else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsPixelDiT() || IsPiD()) { defscheduler ??= "simple"; } diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs index 48d60e7fa..37533e966 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs @@ -85,6 +85,12 @@ public bool IsKontext() /// Returns true if the current model is Chroma Radiance. public bool IsChromaRadiance() => IsModelCompatClass(T2IModelClassSorter.CompatChromaRadiance); + /// Returns true if the current model is NVIDIA PixelDiT. 
+ public bool IsPixelDiT() => IsModelCompatClass(T2IModelClassSorter.CompatPixelDiT); + + /// Returns true if the current model is NVIDIA PiD. + public bool IsPiD() => IsModelCompatClass(T2IModelClassSorter.CompatPiD); + /// Returns true if the current model is HiDream-i1. public bool IsHiDream() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamI1); @@ -398,7 +404,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n ["width"] = width }, id), frames); } - else if (IsChromaRadiance() || IsZetaChroma()) + else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT()) { return resultImage(CreateNode("EmptyChromaRadianceLatentImage", new JObject() { @@ -649,6 +655,11 @@ public string GetGemma2Model() return RequireClipModel("gemma_2_2b_fp16.safetensors", "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/resolve/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors", "29761442862f8d064d3f854bb6fabf4379dcff511a7f6ba9405a00bd0f7e2dbd", T2IParamTypes.GemmaModel); } + public string GetGemma2_2bElmModel() + { + return RequireClipModel("gemma_2_2b_it_elm_fp8_scaled.safetensors", "https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/text_encoders/gemma_2_2b_it_elm_fp8_scaled.safetensors", "87692b2ab1714028e29910ea645d96db656505ca0805051048d2298b225c02d1", T2IParamTypes.GemmaModel); + } + public string GetGemma3_12bModel() { return RequireClipModel("gemma_3_12B_it.safetensors", "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors", "aaca463d11e6d8d2a4bdb0d6299214c15ef78a3f73e0ef8113d5a9d0219b3f6d", T2IParamTypes.GemmaModel); @@ -899,7 +910,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC) { dtype = "default"; } - else if (IsZImage() || IsZetaChroma() || IsAnima()) // Model is small and dense, so trust user preferred download format + else if (IsZImage() || IsZetaChroma() || IsAnima() || IsPixelDiT() || IsPiD()) // Model is 
small and dense, so trust user preferred download format { dtype = "default"; } @@ -1108,6 +1119,11 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC) helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE, "flux-1", "flux-ae"); } } + else if (IsPixelDiT() || IsPiD()) + { + helpers.LoadClip("pixeldit", helpers.GetGemma2_2bElmModel()); + LoadingVAE = CreateVAELoader("pixel_space"); + } else if (IsHiDream()) { string loaderType = "QuadrupleCLIPLoader"; diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs index 067bd1ee8..8fa3ebad8 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs @@ -1451,6 +1451,48 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) g.NoVAEOverride = false; prompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isRefiner: true); negPrompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isRefiner: true); + string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null); + string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? 
g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null); + int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); + double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); + if (g.IsPiD()) + { + string baseCompatId = baseModel.ModelClass?.CompatClass?.ID ?? ""; + string pidLatentFormat = + baseCompatId.StartsWith("flux-2") ? "flux2" + : baseCompatId == "flux-1" ? "flux1" + : baseCompatId.StartsWith("stable-diffusion-v3") ? "sd3" + : (baseCompatId == "z-image" || baseCompatId == "zeta-chroma") ? "flux1" + : null; + if (pidLatentFormat is null) + { + throw new SwarmUserErrorException($"PiD requires a Flux.1, Flux.2, SD3, or Z-Image base model, but the base model class is '{baseCompatId}'."); + } + string pidCond = g.CreateNode("PiDConditioning", new JObject() + { + ["positive"] = prompt, + ["latent"] = g.CurrentMedia.Path, + ["latent_format"] = pidLatentFormat, + ["degrade_sigma"] = 0.0 + }); + prompt = [pidCond, 0]; + int pidWidth = g.UserInput.GetImageWidth() * 4 / 16 * 16; + int pidHeight = g.UserInput.GetImageHeight() * 4 / 16 * 16; + string pidLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject() + { + ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1), + ["width"] = pidWidth, + ["height"] = pidHeight + }); + g.CreateKSampler(g.CurrentModel.Path, prompt, negPrompt, [pidLatent, 0], cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000, + g.UserInput.Get(T2IParamTypes.Seed) + 1, false, true, id: "23", + explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner); + g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0], WGNodeData.DT_LATENT_IMAGE, refineModel.ModelClass?.CompatClass); + 
g.CurrentMedia.Width = pidWidth; + g.CurrentMedia.Height = pidHeight; + g.IsRefinerStage = false; + return; + } bool doSave = g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false); bool doUspcale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1; string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None"); @@ -1589,10 +1631,6 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) model = model.WithPath([hyperTileNode, 0]); } g.CurrentMedia = g.CurrentMedia.AsSamplingLatent(g.CurrentVae, g.CurrentAudioVae); - int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); - double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); - string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null); - string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? 
g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null); g.CreateKSampler(model.Path, prompt, negPrompt, g.CurrentMedia.Path, cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000, g.UserInput.Get(T2IParamTypes.Seed) + 1, false, method != "StepSwapNoisy", id: "23", doTiled: g.UserInput.Get(T2IParamTypes.RefinerDoTiling, false), explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner); diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index 2df28ecad..9f37a3bb1 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -71,6 +71,8 @@ public static T2IModelCompatClass CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }), CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }), CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }), + CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }), + CompatPiD = RegisterCompat(new() { ID = "pid", ShortCode = "PiD", LorasTargetTextEnc = false }), // Audio models CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }), // Obscure old random ones @@ -204,6 +206,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") && bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj"); bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias"); bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias"); + bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight"); + bool isPiD(JObject h) => 
h.ContainsKey("net.lq_proj.latent_proj.0.weight"); bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight"); bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias"))) || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias"))); @@ -699,6 +703,15 @@ JToken GetEmbeddingKey(JObject h) { return isChroma(h) && isChromaRadiance(h); }}); + // ====================== NVIDIA PixelDiT / PiD ====================== + Register(new() { ID = "pixeldit", CompatClass = CompatPixelDiT, Name = "NVIDIA PixelDiT", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => + { + return isPixelDiT(h); + }}); + Register(new() { ID = "pid", CompatClass = CompatPiD, Name = "NVIDIA PiD", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => + { + return isPiD(h); + }}); Register(new() { ID = "alt_diffusion_v1_512_placeholder", CompatClass = CompatAltDiffusion, Name = "Alt-Diffusion", StandardWidth = 512, StandardHeight = 512, IsThisModelOfClass = (m, h) => { return IsAlt(h); From 3d3a933bb75d9004fb5a3d2458525c01c7250b8c Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 18:07:08 -0500 Subject: [PATCH 02/13] PiD replaces Refiner upscaler, not refiner stage itself --- .../ComfyUIBackend/ComfyUIBackendExtension.cs | 5 +- .../ComfyUIBackend/WorkflowGenerator.cs | 11 +- .../ComfyUIBackend/WorkflowGeneratorSteps.cs | 114 ++++++++++-------- src/Text2Image/T2IModelClassSorter.cs | 2 +- 
src/Text2Image/T2IParamInput.cs | 2 +- src/Text2Image/T2IPromptHandling.cs | 6 + src/Utils/PromptRegion.cs | 10 +- 7 files changed, 97 insertions(+), 53 deletions(-) diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs index 92a239823..0f5fc092e 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs @@ -635,6 +635,9 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo) ], Schedulers = ["normal///Normal", "karras///Karras", "exponential///Exponential", "simple///Simple", "ddim_uniform///DDIM Uniform", "sgm_uniform///SGM Uniform", "turbo///Turbo (for turbo models, max 10 steps)", "align_your_steps///Align Your Steps (Model-specific behavior)", "beta///Beta", "linear_quadratic///Linear Quadratic (Mochi)", "ltxv///LTX-Video", "ltxv-image///LTXV-Image", "kl_optimal///KL Optimal (Nvidia AYS)", "flux2///Flux.2"]; + /// Lists PiD decoder models. + public static List PidUpscaleModels(Session session) => [.. Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => $"pidmodel-{m.Name}///PiD Model: {m.Name}")]; + public static List IPAdapterModels = ["None"], IPAdapterWeightTypes = ["standard", "prompt is more important", "style transfer"]; public static List GligenModels = ["None"], YoloModels = [], StyleModels = ["None"], SetClipDevices = ["cpu"]; @@ -752,7 +755,7 @@ public override void OnInit() )); RefinerUpscaleMethod = T2IParamTypes.Register(new("Refiner Upscale Method", "How to upscale the image, if upscaling is used.", "pixel-lanczos", Group: T2IParamTypes.GroupRefiners, OrderPriority: -1, FeatureFlag: "comfyui", ChangeWeight: 1, - GetValues: (_) => UpscalerModels, DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID + GetValues: (session) => [.. UpscalerModels, .. 
PidUpscaleModels(session)], DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID )); RefinerSamplerParam = T2IParamTypes.Register(new("Refiner Sampler", SamplerParam.Type.Description + "\nThis is an override to only affect the Refine/Upscale stage.", "euler", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupRefinerOverrides, OrderPriority: -2, diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs index f70fcf5e7..66750efe1 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs @@ -166,6 +166,9 @@ public JArray FinalImageOut /// If true, the generator is currently working on the refiner stage. public bool IsRefinerStage = false; + /// If true, the generator is currently working on the pixel-decoder stage. + public bool IsPixelDecoderStage = false; + /// If true, the generator is currently working on Image2Video. public bool IsImageToVideo = false; @@ -2518,7 +2521,7 @@ public bool ShouldZeroNegative() } /// Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant. 
- public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false) + public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false) { PromptRegion regionalizer = new(prompt); string globalPromptText = regionalizer.GlobalPrompt; @@ -2534,7 +2537,11 @@ public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, boo { globalPromptText = $"{globalPromptText} {regionalizer.RefinerPrompt}"; } - else if (!isVideo && !isRefiner && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt)) + else if (isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.PixelDecoderPrompt)) + { + globalPromptText = $"{globalPromptText} {regionalizer.PixelDecoderPrompt}"; + } + else if (!isVideo && !isRefiner && !isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt)) { globalPromptText = $"{globalPromptText} {regionalizer.BasePrompt}"; } diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs index 8fa3ebad8..d75ccfad9 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs @@ -106,7 +106,11 @@ public static void Register() (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(0, g.LoadingModel, g.LoadingClip); if (g.IsRefinerStage) { - (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(1, g.LoadingModel, g.LoadingClip); + (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(T2IParamInput.SectionID_Refiner, g.LoadingModel, g.LoadingClip); + } + else if (g.IsPixelDecoderStage) + { + (g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(T2IParamInput.SectionID_PixelDecoder, g.LoadingModel, 
g.LoadingClip); } else if (g.IsImageToVideoSwap) { @@ -1451,53 +1455,11 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) g.NoVAEOverride = false; prompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isRefiner: true); negPrompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isRefiner: true); - string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null); - string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null); - int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); - double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); - if (g.IsPiD()) - { - string baseCompatId = baseModel.ModelClass?.CompatClass?.ID ?? ""; - string pidLatentFormat = - baseCompatId.StartsWith("flux-2") ? "flux2" - : baseCompatId == "flux-1" ? "flux1" - : baseCompatId.StartsWith("stable-diffusion-v3") ? "sd3" - : (baseCompatId == "z-image" || baseCompatId == "zeta-chroma") ? 
"flux1" - : null; - if (pidLatentFormat is null) - { - throw new SwarmUserErrorException($"PiD requires a Flux.1, Flux.2, SD3, or Z-Image base model, but the base model class is '{baseCompatId}'."); - } - string pidCond = g.CreateNode("PiDConditioning", new JObject() - { - ["positive"] = prompt, - ["latent"] = g.CurrentMedia.Path, - ["latent_format"] = pidLatentFormat, - ["degrade_sigma"] = 0.0 - }); - prompt = [pidCond, 0]; - int pidWidth = g.UserInput.GetImageWidth() * 4 / 16 * 16; - int pidHeight = g.UserInput.GetImageHeight() * 4 / 16 * 16; - string pidLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject() - { - ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1), - ["width"] = pidWidth, - ["height"] = pidHeight - }); - g.CreateKSampler(g.CurrentModel.Path, prompt, negPrompt, [pidLatent, 0], cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000, - g.UserInput.Get(T2IParamTypes.Seed) + 1, false, true, id: "23", - explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner); - g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0], WGNodeData.DT_LATENT_IMAGE, refineModel.ModelClass?.CompatClass); - g.CurrentMedia.Width = pidWidth; - g.CurrentMedia.Height = pidHeight; - g.IsRefinerStage = false; - return; - } bool doSave = g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false); - bool doUspcale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1; + bool doUpscale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1; string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None"); // TODO: Better same-VAE check - bool doPixelUpscale = doUspcale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-")); + bool doPixelUpscale = doUpscale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-")); int width = 
(int)Math.Round(g.UserInput.GetImageWidth() * refineUpscale); int height = (int)Math.Round(g.UserInput.GetImageHeight() * refineUpscale); width = (width / 16) * 16; // avoid unworkable output sizes @@ -1559,7 +1521,7 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) g.CurrentMedia = decoded.EncodeToLatent(g.CurrentVae, "25"); } } - if (doUspcale && upscaleMethod.StartsWith("latent-")) + if (doUpscale && upscaleMethod.StartsWith("latent-")) { g.CurrentMedia = g.CurrentMedia.AsLatentImage(g.CurrentVae); g.CreateNode("LatentUpscaleBy", new JObject() @@ -1572,7 +1534,7 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) g.CurrentMedia.Width = width; g.CurrentMedia.Height = height; } - else if (doUspcale && upscaleMethod.StartsWith("latentmodel-")) + else if (doUpscale && upscaleMethod.StartsWith("latentmodel-")) { g.CreateNode("LatentUpscaleModelLoader", new JObject() { @@ -1631,11 +1593,69 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) model = model.WithPath([hyperTileNode, 0]); } g.CurrentMedia = g.CurrentMedia.AsSamplingLatent(g.CurrentVae, g.CurrentAudioVae); + int steps = g.UserInput.Get(T2IParamTypes.RefinerSteps, g.UserInput.Get(T2IParamTypes.Steps, 20, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); + double cfg = g.UserInput.Get(T2IParamTypes.RefinerCFGScale, g.UserInput.Get(T2IParamTypes.CFGScale, 7, sectionId: T2IParamInput.SectionID_Refiner), sectionId: T2IParamInput.SectionID_Refiner); + string explicitSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? g.UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null); + string explicitScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_Refiner, includeBase: false) ?? 
g.UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null); g.CreateKSampler(model.Path, prompt, negPrompt, g.CurrentMedia.Path, cfg, steps, (int)Math.Round(steps * (1 - refinerControl)), 10000, g.UserInput.Get(T2IParamTypes.Seed) + 1, false, method != "StepSwapNoisy", id: "23", doTiled: g.UserInput.Get(T2IParamTypes.RefinerDoTiling, false), explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner); g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0]); g.IsRefinerStage = false; + if (doUpscale && upscaleMethod.StartsWith("pidmodel-")) + { + string pidModelName = upscaleMethod.After("pidmodel-"); + T2IModel pidModel = Program.MainSDModels.GetModel(pidModelName); + if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid") + { + throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model."); + } + string pidLatentFormat = g.IsSD3() ? "sd3" : (g.IsFlux() || g.IsAnyFlux2() || g.IsZImage() || g.IsZetaChroma()) ? "flux" : null; + if (pidLatentFormat is null) + { + throw new SwarmUserErrorException($"PiD model requires the refiner model's VAE to be Flux.1, Flux.2, or SD3, but model '{refineModel.Name}' is '{refineModel.ModelClass?.CompatClass?.ID ?? 
"unknown"}'."); + } + JArray refinedLatent = g.CurrentMedia.Path; + int pidWidth = g.UserInput.GetImageWidth() * 4; + int pidHeight = g.UserInput.GetImageHeight() * 4; + pidWidth = (pidWidth / 16) * 16; + pidHeight = (pidHeight / 16) * 16; + T2IModel refinerFinalModel = g.FinalLoadedModel; + List refinerFinalModelList = g.FinalLoadedModelList; + g.FinalLoadedModel = pidModel; + g.FinalLoadedModelList = [pidModel]; + g.NoVAEOverride = true; + g.IsPixelDecoderStage = true; + (g.FinalLoadedModel, g.CurrentModel, g.CurrentTextEnc, g.CurrentVae) = g.CreateModelLoader(pidModel, "PixelDecoder", sectionId: T2IParamInput.SectionID_PixelDecoder); + g.IsPixelDecoderStage = false; + g.NoVAEOverride = false; + JArray pidPos = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isPixelDecoder: true); + JArray pidNeg = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isPixelDecoder: true); + string pidCond = g.CreateNode("PiDConditioning", new JObject() + { + ["positive"] = pidPos, + ["latent"] = refinedLatent, + ["latent_format"] = pidLatentFormat, + ["degrade_sigma"] = 0.0 + }); + string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject() + { + ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1), + ["width"] = pidWidth, + ["height"] = pidHeight + }); + int pidSteps = g.UserInput.GetNullable(T2IParamTypes.Steps, T2IParamInput.SectionID_PixelDecoder, false) ?? 4; + double pidCfg = g.UserInput.GetNullable(T2IParamTypes.CFGScale, T2IParamInput.SectionID_PixelDecoder, false) ?? 
1.0; + string pidSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false); + string pidScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false); + string pidSampled = g.CreateKSampler(g.CurrentModel.Path, [pidCond, 0], pidNeg, [pidEmptyLatent, 0], pidCfg, pidSteps, 0, 10000, + g.UserInput.Get(T2IParamTypes.Seed) + 2, false, true, defsampler: "lcm", defscheduler: "simple", explicitSampler: pidSampler, explicitScheduler: pidScheduler, sectionId: T2IParamInput.SectionID_PixelDecoder); + g.CurrentMedia = g.CurrentMedia.WithPath([pidSampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass); + g.CurrentMedia.Width = pidWidth; + g.CurrentMedia.Height = pidHeight; + g.FinalLoadedModel = refinerFinalModel; + g.FinalLoadedModelList = refinerFinalModelList; + } } }, -4); #endregion diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index c48be7710..830bfbfc1 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -208,8 +208,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") && bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj"); bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias"); bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias"); - bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight"); bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight"); + bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && !isPiD(h); bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && 
h.ContainsKey("context_refiner.0.attn.norm_k.weight"); bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias"))) || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias"))); diff --git a/src/Text2Image/T2IParamInput.cs b/src/Text2Image/T2IParamInput.cs index dc8ada4f5..a8ce3b084 100644 --- a/src/Text2Image/T2IParamInput.cs +++ b/src/Text2Image/T2IParamInput.cs @@ -13,7 +13,7 @@ namespace SwarmUI.Text2Image; public class T2IParamInput { /// Core section ID numbers. - public static int SectionID_BaseOnly = 5, SectionID_Refiner = 1, SectionID_Video = 2, SectionID_VideoSwap = 3; + public static int SectionID_BaseOnly = 5, SectionID_Refiner = 1, SectionID_Video = 2, SectionID_VideoSwap = 3, SectionID_PixelDecoder = 4; /// Parameter IDs that must be loaded early on, eg extracted from presets in prompts early. Primarily things that affect backend selection. 
public static readonly string[] ParamsMustLoadEarly = ["model", "images", "internalbackendtype", "exactbackendid"]; diff --git a/src/Text2Image/T2IPromptHandling.cs b/src/Text2Image/T2IPromptHandling.cs index 669e9c70a..723af0ff6 100644 --- a/src/Text2Image/T2IPromptHandling.cs +++ b/src/Text2Image/T2IPromptHandling.cs @@ -598,6 +598,12 @@ static string estimateAsSectionBreak(string data, PromptTagContext context) return $""; }; PromptTagLengthEstimators["refiner"] = estimateAsSectionBreak; + PromptTagBasicProcessors["pixeldecoder"] = (data, context) => + { + context.SectionID = T2IParamInput.SectionID_PixelDecoder; + return $""; + }; + PromptTagLengthEstimators["pixeldecoder"] = estimateAsSectionBreak; PromptTagBasicProcessors["video"] = (data, context) => { context.SectionID = T2IParamInput.SectionID_Video; diff --git a/src/Utils/PromptRegion.cs b/src/Utils/PromptRegion.cs index 77a031a3e..b3fc5f402 100644 --- a/src/Utils/PromptRegion.cs +++ b/src/Utils/PromptRegion.cs @@ -13,6 +13,8 @@ public class PromptRegion public string RefinerPrompt = ""; + public string PixelDecoderPrompt = ""; + public string VideoPrompt = ""; public string VideoSwapPrompt = ""; @@ -26,7 +28,7 @@ public enum PartType public static HashSet CustomPartPrefixes = []; /// List of all prefixes for parts. Use to add to this. - public static List PartPrefixes = [" PartPrefixes = ["Custom Extensions can add new prompt part types here. 
/// For example, this will add prompt parsing for <example> or <example:somedata> or etc: @@ -129,6 +131,12 @@ public PromptRegion(string prompt) addMore = s => RefinerPrompt += s; continue; } + else if (prefix == "pixeldecoder") + { + PixelDecoderPrompt += content; + addMore = s => PixelDecoderPrompt += s; + continue; + } else if (prefix == "video") { VideoPrompt += content; From 05836eb80b4d12a3d54a90c033d140b85bfc5fc8 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 18:19:35 -0500 Subject: [PATCH 03/13] Cleanup --- .../ComfyUIBackend/WorkflowGeneratorModelSupport.cs | 4 ++-- src/Text2Image/T2IModelClassSorter.cs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs index cae6f4736..4c0ebaaa6 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs @@ -85,10 +85,10 @@ public bool IsKontext() /// Returns true if the current model is Chroma Radiance. public bool IsChromaRadiance() => IsModelCompatClass(T2IModelClassSorter.CompatChromaRadiance); - /// Returns true if the current model is NVIDIA PixelDiT. + /// Returns true if the current model is PixelDiT. public bool IsPixelDiT() => IsModelCompatClass(T2IModelClassSorter.CompatPixelDiT); - /// Returns true if the current model is NVIDIA PiD. + /// Returns true if the current model is PiD. public bool IsPiD() => IsModelCompatClass(T2IModelClassSorter.CompatPiD); /// Returns true if the current model is HiDream-i1. 
diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index 830bfbfc1..b9abb006c 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -72,8 +72,8 @@ public static T2IModelCompatClass CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }), CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }), CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }), - CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }), CompatPiD = RegisterCompat(new() { ID = "pid", ShortCode = "PiD", LorasTargetTextEnc = false }), + CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }), // Audio models CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }), // Obscure old random ones @@ -709,14 +709,14 @@ JToken GetEmbeddingKey(JObject h) { return isChroma(h) && isChromaRadiance(h); }}); - // ====================== NVIDIA PixelDiT / PiD ====================== - Register(new() { ID = "pixeldit", CompatClass = CompatPixelDiT, Name = "NVIDIA PixelDiT", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => + // ====================== PixelDiT / PiD ====================== + Register(new() { ID = "pid", CompatClass = CompatPiD, Name = "PiD", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => { - return isPixelDiT(h); + return isPiD(h); }}); - Register(new() { ID = "pid", CompatClass = CompatPiD, Name = "NVIDIA PiD", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => + Register(new() { ID = "pixeldit", CompatClass = CompatPixelDiT, Name = "PixelDiT", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => { - return isPiD(h); + 
return isPixelDiT(h); }}); Register(new() { ID = "alt_diffusion_v1_512_placeholder", CompatClass = CompatAltDiffusion, Name = "Alt-Diffusion", StandardWidth = 512, StandardHeight = 512, IsThisModelOfClass = (m, h) => { From ab0665d7f588cb6d54317989d5099a628158b0be Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 18:47:31 -0500 Subject: [PATCH 04/13] Docs for PixelDiT --- docs/Model Support.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/Model Support.md b/docs/Model Support.md index 01db974c0..54fc335f2 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -21,6 +21,7 @@ [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast | [HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality | [Lens](#lens) | MMDiT | 2026 | Microsoft | 4B | Minimal | Modern, lightweight | +[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space | Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md): @@ -640,6 +641,21 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended - **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal. - **Resolution:** Side length `1440` is the official default, but 1024 is a reasonable option. It retains coherence down to about 512 and up to about 2048. +# PixelDiT + +- NVIDIA's [PixelDiT]() is supported in SwarmUI! + - Or the smaller FP8 version: [Comfy-Org/PixelDiT - mxfp8]() + - Download the fat BF16: [Comfy-Org/PixelDiT - bf16]() + - Save in `diffusion_models` +- It does not use a VAE +- Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically +- **Parameters:** + - **Sampler:** Default is fine. + - **Scheduler:** Default is fine. + - **CFG Scale:** `4` is recommended. + - **Steps:** `30` is recommended. 
+ - **Resolution:** Side length `1024` is the standard. + # Video Models - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md). From f10bb98a9a1733eddb4383394aab79f8ce12bfe4 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 18:50:12 -0500 Subject: [PATCH 05/13] doc fix --- docs/Model Support.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Model Support.md b/docs/Model Support.md index 54fc335f2..d6705bfa5 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -644,8 +644,8 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended # PixelDiT - NVIDIA's [PixelDiT]() is supported in SwarmUI! - - Or the smaller FP8 version: [Comfy-Org/PixelDiT - mxfp8]() - - Download the fat BF16: [Comfy-Org/PixelDiT - bf16]() + - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8]() + - Or fat BF16 version: [Comfy-Org/PixelDiT - bf16]() - Save in `diffusion_models` - It does not use a VAE - Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically From 145fec390a2d0677978e98b06039dd8ce718adc4 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 18:55:15 -0500 Subject: [PATCH 06/13] Add hint --- src/wwwroot/js/genpage/gentab/prompttools.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/wwwroot/js/genpage/gentab/prompttools.js b/src/wwwroot/js/genpage/gentab/prompttools.js index b688932d1..cfaa14972 100644 --- a/src/wwwroot/js/genpage/gentab/prompttools.js +++ b/src/wwwroot/js/genpage/gentab/prompttools.js @@ -137,6 +137,9 @@ class PromptTabCompleteClass { this.registerPrefix('refiner', 'Add a section of prompt text that is only used for the Refine/Upscale pass.', (prefix) => { return []; }, true); + this.registerPrefix('pixeldecoder', 'Add a section of prompt text that is only used for the PiD pixel-decoder upscale pass.', (prefix) => { + return []; + }, true); this.registerPrefix('video', 'Add a 
section of prompt text that replaces the prompt for the image-to-video generation pass.', (prefix) => { return []; }, true); From f42cf7ab8c67f497eec0f9d37717719b7bcbe226 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 19:15:11 -0500 Subject: [PATCH 07/13] Use T2IParamTypes.GetBestModelInList() --- .../ComfyUIBackend/WorkflowGeneratorSteps.cs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs index d75ccfad9..59e3813b6 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs @@ -1605,7 +1605,12 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) if (doUpscale && upscaleMethod.StartsWith("pidmodel-")) { string pidModelName = upscaleMethod.After("pidmodel-"); - T2IModel pidModel = Program.MainSDModels.GetModel(pidModelName); + string pidMatched = T2IParamTypes.GetBestModelInList(pidModelName, Program.MainSDModels.ListModelNamesFor(g.UserInput.SourceSession)); + if (pidMatched is not null && pidMatched.EndsWith(".safetensors")) + { + pidMatched = pidMatched.BeforeLast('.'); + } + T2IModel pidModel = pidMatched is null ? 
null : Program.MainSDModels.GetModel(pidMatched); if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid") { throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model."); From 4502cb4ffd8be8a79dbc3d12dea7b3f5f2cd3006 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 19:19:08 -0500 Subject: [PATCH 08/13] Add more (pixel) keys --- src/Text2Image/T2IModelClassSorter.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index b9abb006c..87b7dbf8d 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -208,8 +208,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") && bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj"); bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias"); bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias"); - bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight"); - bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && !isPiD(h); + bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight") && h.ContainsKey("net.pixel_blocks.0.attn.q_norm.weight"); + bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && h.ContainsKey("core.pixel_blocks.0.attn.q_norm.weight") && !isPiD(h); bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight"); bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && 
(h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias"))) || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias"))); From eb099fc51796530a1ebf15723177e1c195340df8 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Wed, 27 May 2026 19:21:57 -0500 Subject: [PATCH 09/13] add more why not --- src/Text2Image/T2IModelClassSorter.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index 87b7dbf8d..df9d24c4c 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -208,8 +208,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") && bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj"); bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias"); bool isChromaRadiance(JObject h) => h.ContainsKey("nerf_image_embedder.embedder.0.bias"); - bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight") && h.ContainsKey("net.pixel_blocks.0.attn.q_norm.weight"); - bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && h.ContainsKey("core.pixel_blocks.0.attn.q_norm.weight") && !isPiD(h); + bool isPiD(JObject h) => h.ContainsKey("net.lq_proj.latent_proj.0.weight") && h.ContainsKey("net.pixel_blocks.0.attn.q_norm.weight") && h.ContainsKey("net.pixel_blocks.0.compress_to_attn.weight"); + bool isPixelDiT(JObject h) => h.ContainsKey("core.pixel_embedder.proj.weight") && 
h.ContainsKey("core.pixel_blocks.0.attn.q_norm.weight") && h.ContainsKey("core.pixel_blocks.0.compress_to_attn.weight") && !isPiD(h); bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight"); bool isQwenImage(JObject h) => (h.ContainsKey("time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("img_in.bias") && (h.ContainsKey("transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("transformer_blocks.0.attn.add_qkv_proj.bias"))) || (h.ContainsKey("model.diffusion_model.time_text_embed.timestep_embedder.linear_1.bias") && h.ContainsKey("model.diffusion_model.img_in.bias") && (h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_k_proj.bias") || h.ContainsKey("model.diffusion_model.transformer_blocks.0.attn.add_qkv_proj.bias"))); From d8716d56fdbd7e7ee1a6448ea3dfaf40fba4f365 Mon Sep 17 00:00:00 2001 From: Juan Treminio Date: Thu, 4 Jun 2026 11:20:34 -0600 Subject: [PATCH 10/13] Implements more pathways for DiT * as base model - for when a user uploads an image I guess * as refiner model - if base model isn't a compatible vae user, load the vae and add a vae decode/encode pair * refiner upscale model - base -> pid -> downscale or upscale with lanczos (if needed) -> refiner swarmksampler * after the refiner swarmksampler; if refiner model isn't a compatible vae user, load the vae and add a vae decode/encode pair --- .../ComfyUIBackend/ComfyUIBackendExtension.cs | 22 +++ .../ComfyUIBackend/WorkflowGenerator.cs | 104 +++++++++++- .../WorkflowGeneratorModelSupport.cs | 2 +- .../ComfyUIBackend/WorkflowGeneratorSteps.cs | 157 +++++++++++------- src/Text2Image/T2IModelClass.cs | 6 +- src/Text2Image/T2IModelClassSorter.cs | 60 ++++--- 6 files changed, 263 insertions(+), 88 deletions(-) diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs index 
0f5fc092e..d63283df9 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs @@ -614,6 +614,8 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo) public static T2IRegisteredParam RefinerHyperTile, VideoFrameInterpolationMultiplier; + public static T2IRegisteredParam PixelDecoderModel; + public static T2IRegisteredParam[] ControlNetPreprocessorParams = new T2IRegisteredParam[3], ControlNetUnionTypeParams = new T2IRegisteredParam[3]; public static List UpscalerModels = ["pixel-lanczos///Pixel: Lanczos (cheap + high quality)", "pixel-bicubic///Pixel: Bicubic (Basic)", "pixel-area///Pixel: Area", "pixel-bilinear///Pixel: Bilinear", "pixel-nearest-exact///Pixel: Nearest-Exact (Pixel art)", "latent-bislerp///Latent: Bislerp", "latent-bicubic///Latent: Bicubic", "latent-area///Latent: Area", "latent-bilinear///Latent: Bilinear", "latent-nearest-exact///Latent: Nearest-Exact"], @@ -638,6 +640,22 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo) /// Lists PiD decoder models. public static List PidUpscaleModels(Session session) => [.. Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => $"pidmodel-{m.Name}///PiD Model: {m.Name}")]; + /// Resolves a PiD model from a model name. + public static T2IModel GetPidModel(string name, Session session) + { + string matched = T2IParamTypes.GetBestModelInList(name, Program.MainSDModels.ListModelNamesFor(session)); + if (matched is not null && matched.EndsWith(".safetensors")) + { + matched = matched.BeforeLast('.'); + } + T2IModel model = matched is null ? 
null : Program.MainSDModels.GetModel(matched); + if (model is null || model.ModelClass?.CompatClass?.ID != "pid") + { + throw new SwarmUserErrorException($"PiD model '{name}' could not be found, or is not a valid PiD model."); + } + return model; + } + public static List IPAdapterModels = ["None"], IPAdapterWeightTypes = ["standard", "prompt is more important", "style transfer"]; public static List GligenModels = ["None"], YoloModels = [], StyleModels = ["None"], SetClipDevices = ["cpu"]; @@ -757,6 +775,10 @@ public override void OnInit() "pixel-lanczos", Group: T2IParamTypes.GroupRefiners, OrderPriority: -1, FeatureFlag: "comfyui", ChangeWeight: 1, GetValues: (session) => [.. UpscalerModels, .. PidUpscaleModels(session)], DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID )); + PixelDecoderModel = T2IParamTypes.Register(new("Pixel Decoder Model", "Optionally use a PiD (Pixel Diffusion Decoder) model.", + "", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupAdvancedModelAddons, IsAdvanced: true, Subtype: "Stable-Diffusion", ChangeWeight: 4, DoNotPreview: true, OrderPriority: 14, + GetValues: (session) => T2IParamTypes.CleanModelList(Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => m.Name)) + )); RefinerSamplerParam = T2IParamTypes.Register(new("Refiner Sampler", SamplerParam.Type.Description + "\nThis is an override to only affect the Refine/Upscale stage.", "euler", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupRefinerOverrides, OrderPriority: -2, GetValues: (_) => Samplers diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs index 66750efe1..104ffdb57 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs @@ -961,8 +961,13 @@ public string CreateKSampler(JArray model, JArray pos, JArray 
neg, JArray latent latent = [srCond, 2]; } } + else if (IsPiD()) + { + defsampler ??= "lcm"; + defscheduler ??= "simple"; + } // TODO: Registry of model default preferences instead of this - else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT() || IsPiD()) + else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT()) { defscheduler ??= "simple"; } @@ -2520,6 +2525,103 @@ public bool ShouldZeroNegative() return false; } + /// The PiDConditioning node's latent_format value for each VAE family that PiD models exist for. + public static Dictionary PidLatentFormats = new() + { + ["flux1"] = "flux", + ["flux2"] = "flux", + ["sd3"] = "sd3", + ["sdxl"] = "sdxl", + ["qwenimage"] = "qwenimage" + }; + + /// Detects which VAE family a PiD model was trained against. + public static string PidFamilyOfModel(T2IModel pidModel) + { + string name = pidModel.Name.ToLowerFast(); + return PidLatentFormats.Keys.FirstOrDefault(name.Contains); + } + + /// Converts media into a latent in the PiD model's native latent space, re-encoding through an auto-loaded matching VAE if needed. + public (WGNodeData, string) CreatePidCompatLatent(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae) + { + string mediaFamily = media.IsLatentData ? media.Compat?.VaeFamily : null; + string family = PidFamilyOfModel(pidModel) ?? mediaFamily ?? 
"flux1"; + string format = PidLatentFormats[family]; + if (mediaFamily == family) + { + return (media, format); + } + WGNodeData decoded = media.AsRawImage(decodeVae); + (string knownVae, string vaeCompat) = T2IModelClassSorter.VaeFamilies[family]; + string defaultVae = family switch + { + "flux1" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE, + "flux2" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, + "sd3" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSD3VAE, + "sdxl" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSDXLVAE, + _ => null + }; + ModelLoadHelpers helpers = new(this); + bool priorNoVae = NoVAEOverride; + NoVAEOverride = true; + helpers.DoVaeLoader(defaultVae, vaeCompat, knownVae); + NoVAEOverride = priorNoVae; + WGNodeData encodeVae = new(LoadingVAE, this, WGNodeData.DT_VAE, T2IModelClassSorter.CompatClasses[vaeCompat]); + return (decoded.EncodeToLatent(encodeVae), format); + } + + /// Creates a PiD pixel-decode stage: converts to a PiD-space latent and samples a 4x pixel image from it. + public WGNodeData CreatePixelDecode(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae, long seed, bool isRefiner = false) + { + (WGNodeData latent, string format) = CreatePidCompatLatent(pidModel, media, decodeVae); + T2IModel priorFinalModel = FinalLoadedModel; + List priorFinalModelList = FinalLoadedModelList; + WGNodeData priorModel = CurrentModel, priorTextEnc = CurrentTextEnc, priorVae = CurrentVae; + bool priorNoVae = NoVAEOverride; + int sectionId = isRefiner ? T2IParamInput.SectionID_Refiner : T2IParamInput.SectionID_PixelDecoder; + FinalLoadedModel = pidModel; + FinalLoadedModelList = [pidModel]; + NoVAEOverride = true; + IsPixelDecoderStage = !isRefiner; + (FinalLoadedModel, CurrentModel, CurrentTextEnc, CurrentVae) = CreateModelLoader(pidModel, isRefiner ? 
"Refiner" : "PixelDecoder", sectionId: sectionId); + IsPixelDecoderStage = false; + NoVAEOverride = priorNoVae; + JArray pos = CreateConditioning(UserInput.Get(T2IParamTypes.Prompt), CurrentTextEnc.Path, pidModel, true, isRefiner: isRefiner, isPixelDecoder: !isRefiner); + JArray neg = CreateConditioning(UserInput.Get(T2IParamTypes.NegativePrompt), CurrentTextEnc.Path, pidModel, false, isRefiner: isRefiner, isPixelDecoder: !isRefiner); + string cond = CreateNode("PiDConditioning", new JObject() + { + ["positive"] = pos, + ["latent"] = latent.Path, + ["latent_format"] = format, + ["degrade_sigma"] = 0.0 + }); + int width = ((media.Width ?? UserInput.GetImageWidth()) * 4 / 16) * 16; + int height = ((media.Height ?? UserInput.GetImageHeight()) * 4 / 16) * 16; + string emptyLatent = CreateNode("EmptyChromaRadianceLatentImage", new JObject() + { + ["batch_size"] = UserInput.Get(T2IParamTypes.BatchSize, 1), + ["width"] = width, + ["height"] = height + }); + int steps = UserInput.GetNullable(T2IParamTypes.Steps, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerSteps) : null) ?? 4; + double cfg = UserInput.GetNullable(T2IParamTypes.CFGScale, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerCFGScale) : null) ?? 1; + string explicitSampler = UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null) : null); + string explicitScheduler = UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? 
UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null) : null); + string sampled = CreateKSampler(CurrentModel.Path, [cond, 0], neg, [emptyLatent, 0], cfg, steps, 0, 10000, seed, false, true, + explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: sectionId); + WGNodeData result = media.WithPath([sampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass); + result.Width = width; + result.Height = height; + result = result.DecodeLatents(CurrentVae, false); + FinalLoadedModel = priorFinalModel; + FinalLoadedModelList = priorFinalModelList; + CurrentModel = priorModel; + CurrentTextEnc = priorTextEnc; + CurrentVae = priorVae; + return result; + } + /// Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant. public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false) { diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs index 4c0ebaaa6..7ebc02cc5 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs @@ -407,7 +407,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n ["width"] = width }, id), frames); } - else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT()) + else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT() || IsPiD()) { return resultImage(CreateNode("EmptyChromaRadianceLatentImage", new JObject() { diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs index 59e3813b6..ee35d9c5e 100644 --- 
a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs @@ -1360,6 +1360,34 @@ bool getBestFor(string phrase) { endStep = (int)(steps * (1 - endEarly)); } + if (g.IsPiD()) + { + if (g.BasicInputImage is null) + { + throw new SwarmUserErrorException("PiD models are pixel decoders/upscalers, not image generators, an Init Image is required."); + } + (WGNodeData pidLatent, string pidFormat) = g.CreatePidCompatLatent(g.FinalLoadedModel, g.BasicInputImage, g.CurrentVae); + string pidCond = g.CreateNode("PiDConditioning", new JObject() + { + ["positive"] = g.FinalPrompt, + ["latent"] = pidLatent.Path, + ["latent_format"] = pidFormat, + ["degrade_sigma"] = 0.0 + }); + g.FinalPrompt = [pidCond, 0]; + int pidWidth = (g.UserInput.GetImageWidth() * 4 / 16) * 16; + int pidHeight = (g.UserInput.GetImageHeight() * 4 / 16) * 16; + string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject() + { + ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1), + ["width"] = pidWidth, + ["height"] = pidHeight + }); + g.CurrentMedia = new WGNodeData([pidEmptyLatent, 0], g, WGNodeData.DT_LATENT_IMAGE, g.CurrentCompat()) { Width = pidWidth, Height = pidHeight }; + startStep = 0; + endStep = 10000; + g.MainSamplerAddNoise = true; + } double cfg = g.UserInput.Get(T2IParamTypes.CFGScale); if (!noSkip && (steps == 0 || endStep <= startStep)) { @@ -1444,6 +1472,36 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) } loaderNodeId = "20"; } + if (refineModel.ModelClass?.CompatClass?.ID == "pid") + { + if (g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false)) + { + g.CurrentMedia.DecodeLatents(origVae, false, "24").SaveOutput(null, null, id: "29"); + } + WGNodeData pidDecoded = g.CreatePixelDecode(refineModel, g.CurrentMedia, origVae, g.UserInput.Get(T2IParamTypes.Seed) + 1, isRefiner: true); + if (g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double pidUpscale) && 
pidUpscale != 1) + { + int targetWidth = ((int)Math.Round(g.UserInput.GetImageWidth() * pidUpscale) / 16) * 16; + int targetHeight = ((int)Math.Round(g.UserInput.GetImageHeight() * pidUpscale) / 16) * 16; + if (targetWidth != pidDecoded.Width || targetHeight != pidDecoded.Height) + { + g.CreateNode("ImageScale", new JObject() + { + ["image"] = pidDecoded.Path, + ["width"] = targetWidth, + ["height"] = targetHeight, + ["upscale_method"] = "lanczos", + ["crop"] = "disabled" + }, "26"); + pidDecoded = pidDecoded.WithPath(["26", 0]); + pidDecoded.Width = targetWidth; + pidDecoded.Height = targetHeight; + } + } + g.CurrentMedia = pidDecoded; + g.IsRefinerStage = false; + return; + } if (g.UserInput.TryGet(T2IParamTypes.RefinerVAE, out _)) { modelMustReencode = true; @@ -1460,11 +1518,41 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None"); // TODO: Better same-VAE check bool doPixelUpscale = doUpscale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-")); + bool doPidUpscale = doUpscale && upscaleMethod.StartsWith("pidmodel-"); int width = (int)Math.Round(g.UserInput.GetImageWidth() * refineUpscale); int height = (int)Math.Round(g.UserInput.GetImageHeight() * refineUpscale); width = (width / 16) * 16; // avoid unworkable output sizes height = (height / 16) * 16; - if (modelMustReencode || doPixelUpscale || doSave || g.MaskShrunkInfo.BoundsNode is not null) + if (doPidUpscale) + { + T2IModel pidModel = ComfyUIBackendExtension.GetPidModel(upscaleMethod.After("pidmodel-"), g.UserInput.SourceSession); + WGNodeData decoded = g.CreatePixelDecode(pidModel, g.CurrentMedia, origVae, g.UserInput.Get(T2IParamTypes.Seed) + 2); + if (doSave) + { + decoded.SaveOutput(null, null, id: "29"); + } + if (decoded.Width != width || decoded.Height != height) + { + g.CreateNode("ImageScale", new JObject() + { + ["image"] = decoded.Path, + ["width"] = width, + 
["height"] = height, + ["upscale_method"] = "lanczos", + ["crop"] = "disabled" + }, "26"); + decoded = decoded.WithPath(["26", 0]); + decoded.Width = width; + decoded.Height = height; + } + if (refinerControl <= 0) + { + g.CurrentMedia = decoded; + return; + } + g.CurrentMedia = decoded.EncodeToLatent(g.CurrentVae, "25"); + } + else if (modelMustReencode || doPixelUpscale || doSave || g.MaskShrunkInfo.BoundsNode is not null) { WGNodeData decoded = g.CurrentMedia.DecodeLatents(origVae, false, "24"); JArray maskShrunk = doMaskShrinkApply(g, decoded.Path); @@ -1602,71 +1690,20 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn) explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner); g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0]); g.IsRefinerStage = false; - if (doUpscale && upscaleMethod.StartsWith("pidmodel-")) - { - string pidModelName = upscaleMethod.After("pidmodel-"); - string pidMatched = T2IParamTypes.GetBestModelInList(pidModelName, Program.MainSDModels.ListModelNamesFor(g.UserInput.SourceSession)); - if (pidMatched is not null && pidMatched.EndsWith(".safetensors")) - { - pidMatched = pidMatched.BeforeLast('.'); - } - T2IModel pidModel = pidMatched is null ? null : Program.MainSDModels.GetModel(pidMatched); - if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid") - { - throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model."); - } - string pidLatentFormat = g.IsSD3() ? "sd3" : (g.IsFlux() || g.IsAnyFlux2() || g.IsZImage() || g.IsZetaChroma()) ? "flux" : null; - if (pidLatentFormat is null) - { - throw new SwarmUserErrorException($"PiD model requires the refiner model's VAE to be Flux.1, Flux.2, or SD3, but model '{refineModel.Name}' is '{refineModel.ModelClass?.CompatClass?.ID ?? 
"unknown"}'."); - } - JArray refinedLatent = g.CurrentMedia.Path; - int pidWidth = g.UserInput.GetImageWidth() * 4; - int pidHeight = g.UserInput.GetImageHeight() * 4; - pidWidth = (pidWidth / 16) * 16; - pidHeight = (pidHeight / 16) * 16; - T2IModel refinerFinalModel = g.FinalLoadedModel; - List refinerFinalModelList = g.FinalLoadedModelList; - g.FinalLoadedModel = pidModel; - g.FinalLoadedModelList = [pidModel]; - g.NoVAEOverride = true; - g.IsPixelDecoderStage = true; - (g.FinalLoadedModel, g.CurrentModel, g.CurrentTextEnc, g.CurrentVae) = g.CreateModelLoader(pidModel, "PixelDecoder", sectionId: T2IParamInput.SectionID_PixelDecoder); - g.IsPixelDecoderStage = false; - g.NoVAEOverride = false; - JArray pidPos = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isPixelDecoder: true); - JArray pidNeg = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isPixelDecoder: true); - string pidCond = g.CreateNode("PiDConditioning", new JObject() - { - ["positive"] = pidPos, - ["latent"] = refinedLatent, - ["latent_format"] = pidLatentFormat, - ["degrade_sigma"] = 0.0 - }); - string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject() - { - ["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1), - ["width"] = pidWidth, - ["height"] = pidHeight - }); - int pidSteps = g.UserInput.GetNullable(T2IParamTypes.Steps, T2IParamInput.SectionID_PixelDecoder, false) ?? 4; - double pidCfg = g.UserInput.GetNullable(T2IParamTypes.CFGScale, T2IParamInput.SectionID_PixelDecoder, false) ?? 
1.0; - string pidSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false); - string pidScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false); - string pidSampled = g.CreateKSampler(g.CurrentModel.Path, [pidCond, 0], pidNeg, [pidEmptyLatent, 0], pidCfg, pidSteps, 0, 10000, - g.UserInput.Get(T2IParamTypes.Seed) + 2, false, true, defsampler: "lcm", defscheduler: "simple", explicitSampler: pidSampler, explicitScheduler: pidScheduler, sectionId: T2IParamInput.SectionID_PixelDecoder); - g.CurrentMedia = g.CurrentMedia.WithPath([pidSampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass); - g.CurrentMedia.Width = pidWidth; - g.CurrentMedia.Height = pidHeight; - g.FinalLoadedModel = refinerFinalModel; - g.FinalLoadedModelList = refinerFinalModelList; - } } }, -4); #endregion #region VAEDecode AddStep(g => { + if (g.UserInput.TryGet(ComfyUIBackendExtension.PixelDecoderModel, out T2IModel pixelDecoder) && g.CurrentMedia.DataType == WGNodeData.DT_LATENT_IMAGE) + { + if (pixelDecoder.ModelClass?.CompatClass?.ID != "pid") + { + throw new SwarmUserErrorException($"Pixel Decoder Model is set to '{pixelDecoder.Name}', but that is not a PiD model."); + } + g.CurrentMedia = g.CreatePixelDecode(pixelDecoder, g.CurrentMedia, g.CurrentVae, g.UserInput.Get(T2IParamTypes.Seed) + 3); + } g.CurrentMedia = g.CurrentMedia.DecodeLatents(g.CurrentVae, null, "8"); JArray maskShrinkApply = doMaskShrinkApply(g, g.CurrentMedia.Path); g.CurrentMedia = g.CurrentMedia.WithPath(maskShrinkApply); diff --git a/src/Text2Image/T2IModelClass.cs b/src/Text2Image/T2IModelClass.cs index 1a34278d5..de6678cf0 100644 --- a/src/Text2Image/T2IModelClass.cs +++ b/src/Text2Image/T2IModelClass.cs @@ -57,6 +57,9 @@ public record class T2IModelCompatClass /// If true, this is a model that primarily operates on audio. 
public bool IsAudioModel = false; + /// If this class natively works in a standard shared VAE/latent space, the ID of that family (see <see cref="T2IModelClassSorter.VaeFamilies"/>). + public string VaeFamily = null; + /// Get a networkable JObject for this compat class. public JObject ToNetData() { @@ -67,7 +70,8 @@ public JObject ToNetData() ["loras_target_text_enc"] = LorasTargetTextEnc, ["is_text2video"] = IsText2Video, ["is_image2video"] = IsImage2Video, - ["is_audio_model"] = IsAudioModel + ["is_audio_model"] = IsAudioModel, + ["vae_family"] = VaeFamily }; } } diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index df9d24c4c..1a8a08c24 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -38,62 +38,72 @@ public static T2IModelCompatClass CompatSdv1 = RegisterCompat(new() { ID = "stable-diffusion-v1", ShortCode = "SDv1" }), CompatSdv2 = RegisterCompat(new() { ID = "stable-diffusion-v2", ShortCode = "SDv2" }), CompatSdv2Turbo = RegisterCompat(new() { ID = "stable-diffusion-v2-turbo", ShortCode = "SDv2" }), - CompatSdxl = RegisterCompat(new() { ID = "stable-diffusion-xl-v1", ShortCode = "SDXL" }), - CompatSdxlRefiner = RegisterCompat(new() { ID = "stable-diffusion-xl-v1-refiner", ShortCode = "SDXL" }), + CompatSdxl = RegisterCompat(new() { ID = "stable-diffusion-xl-v1", ShortCode = "SDXL", VaeFamily = "sdxl" }), + CompatSdxlRefiner = RegisterCompat(new() { ID = "stable-diffusion-xl-v1-refiner", ShortCode = "SDXL", VaeFamily = "sdxl" }), CompatSvd = RegisterCompat(new() { ID = "stable-video-diffusion-img2vid-v1", ShortCode = "SVD", IsImage2Video = true }), CompatCascade = RegisterCompat(new() { ID = "stable-cascade-v1", ShortCode = "Casc" }), - CompatSd3Medium = RegisterCompat(new() { ID = "stable-diffusion-v3-medium", ShortCode = "SD3m" }), - CompatSd35Large = RegisterCompat(new() { ID = "stable-diffusion-v3.5-large", ShortCode = "SD35L" }), - CompatSd35Medium = RegisterCompat(new() { ID = "stable-diffusion-v3.5-medium", 
ShortCode = "SD35m" }), - CompatSd3 = RegisterCompat(new() { ID = "stable-diffusion-v3", ShortCode = "SD3" }), + CompatSd3Medium = RegisterCompat(new() { ID = "stable-diffusion-v3-medium", ShortCode = "SD3m", VaeFamily = "sd3" }), + CompatSd35Large = RegisterCompat(new() { ID = "stable-diffusion-v3.5-large", ShortCode = "SD35L", VaeFamily = "sd3" }), + CompatSd35Medium = RegisterCompat(new() { ID = "stable-diffusion-v3.5-medium", ShortCode = "SD35m", VaeFamily = "sd3" }), + CompatSd3 = RegisterCompat(new() { ID = "stable-diffusion-v3", ShortCode = "SD3", VaeFamily = "sd3" }), // 2024-2025 era models - CompatFlux = RegisterCompat(new() { ID = "flux-1", ShortCode = "Flux", LorasTargetTextEnc = false }), + CompatFlux = RegisterCompat(new() { ID = "flux-1", ShortCode = "Flux", LorasTargetTextEnc = false, VaeFamily = "flux1" }), CompatWan21 = RegisterCompat(new() { ID = "wan-21", ShortCode = "Wan14B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), CompatWan21_1_3b = RegisterCompat(new() { ID = "wan-21-1_3b", ShortCode = "Wan1B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), CompatWan21_14b = RegisterCompat(new() { ID = "wan-21-14b", ShortCode = "Wan14B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), CompatWan22_5b = RegisterCompat(new() { ID = "wan-22-5b", ShortCode = "Wan5B", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), CompatHunyuanVideo = RegisterCompat(new() { ID = "hunyuan-video", ShortCode = "HyVid", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), - CompatChroma = RegisterCompat(new() { ID = "chroma", ShortCode = "Chroma" }), + CompatChroma = RegisterCompat(new() { ID = "chroma", ShortCode = "Chroma", VaeFamily = "flux1" }), CompatChromaRadiance = RegisterCompat(new() { ID = "chroma-radiance", ShortCode = "ChrRad" }), CompatLtxv = RegisterCompat(new() { ID = "lightricks-ltx-video", ShortCode = "LTXV", IsText2Video = true, 
IsImage2Video = true }), - CompatLumina2 = RegisterCompat(new() { ID = "lumina-2", ShortCode = "Lumi2" }), - CompatQwenImage = RegisterCompat(new() { ID = "qwen-image", ShortCode = "Qwen", LorasTargetTextEnc = false }), + CompatLumina2 = RegisterCompat(new() { ID = "lumina-2", ShortCode = "Lumi2", VaeFamily = "flux1" }), + CompatQwenImage = RegisterCompat(new() { ID = "qwen-image", ShortCode = "Qwen", LorasTargetTextEnc = false, VaeFamily = "qwenimage" }), CompatHunyuanImage2_1 = RegisterCompat(new() { ID = "hunyuan-image-2_1", ShortCode = "HyImg", LorasTargetTextEnc = false }), CompatHunyuanImage2_1Refiner = RegisterCompat(new() { ID = "hunyuan-image-2_1-refiner", ShortCode = "HyImg", LorasTargetTextEnc = false }), CompatHunyuanVideo1_5 = RegisterCompat(new() { ID = "hunyuan-video-1_5", ShortCode = "HyVid", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), // 2025-2026 era models - CompatFlux2 = RegisterCompat(new() { ID = "flux-2", ShortCode = "Flux2", LorasTargetTextEnc = false }), - CompatFlux2Klein4B = RegisterCompat(new() { ID = "flux-2-klein-4b", ShortCode = "Fl2K4", LorasTargetTextEnc = false }), - CompatFlux2Klein9B = RegisterCompat(new() { ID = "flux-2-klein-9b", ShortCode = "Fl2K9", LorasTargetTextEnc = false }), - CompatErnieImage = RegisterCompat(new() { ID = "ernie-image", ShortCode = "Ernie", LorasTargetTextEnc = false }), + CompatFlux2 = RegisterCompat(new() { ID = "flux-2", ShortCode = "Flux2", LorasTargetTextEnc = false, VaeFamily = "flux2" }), + CompatFlux2Klein4B = RegisterCompat(new() { ID = "flux-2-klein-4b", ShortCode = "Fl2K4", LorasTargetTextEnc = false, VaeFamily = "flux2" }), + CompatFlux2Klein9B = RegisterCompat(new() { ID = "flux-2-klein-9b", ShortCode = "Fl2K9", LorasTargetTextEnc = false, VaeFamily = "flux2" }), + CompatErnieImage = RegisterCompat(new() { ID = "ernie-image", ShortCode = "Ernie", LorasTargetTextEnc = false, VaeFamily = "flux2" }), CompatLtxv2 = RegisterCompat(new() { ID = 
"lightricks-ltx-video-2", ShortCode = "LTXV2", IsText2Video = true, IsImage2Video = true }), - CompatZImage = RegisterCompat(new() { ID = "z-image", ShortCode = "ZImg", LorasTargetTextEnc = false }), + CompatZImage = RegisterCompat(new() { ID = "z-image", ShortCode = "ZImg", LorasTargetTextEnc = false, VaeFamily = "flux1" }), CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }), - CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }), + CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false, VaeFamily = "qwenimage" }), CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }), - CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }), + CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false, VaeFamily = "flux2" }), CompatPiD = RegisterCompat(new() { ID = "pid", ShortCode = "PiD", LorasTargetTextEnc = false }), CompatPixelDiT = RegisterCompat(new() { ID = "pixeldit", ShortCode = "PixDiT", LorasTargetTextEnc = false }), // Audio models CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }), // Obscure old random ones - CompatAuraFlow = RegisterCompat(new() { ID = "auraflow-v1", ShortCode = "Aura" }), - CompatHiDreamI1 = RegisterCompat(new() { ID = "hidream-i1", ShortCode = "HiDrm", LorasTargetTextEnc = false }), - CompatOmniGen2 = RegisterCompat(new() { ID = "omnigen-2", ShortCode = "Omni2" }), - CompatSegmindStableDiffusion1b = RegisterCompat(new() { ID = "segmind-stable-diffusion-1b", ShortCode = "SSD1B" }), + CompatAuraFlow = RegisterCompat(new() { ID = "auraflow-v1", ShortCode = "Aura", VaeFamily = "sdxl" }), + CompatHiDreamI1 = RegisterCompat(new() { ID = "hidream-i1", ShortCode = "HiDrm", LorasTargetTextEnc = false, VaeFamily = 
"flux1" }), + CompatOmniGen2 = RegisterCompat(new() { ID = "omnigen-2", ShortCode = "Omni2", VaeFamily = "flux1" }), + CompatSegmindStableDiffusion1b = RegisterCompat(new() { ID = "segmind-stable-diffusion-1b", ShortCode = "SSD1B", VaeFamily = "sdxl" }), CompatCosmos = RegisterCompat(new() { ID = "nvidia-cosmos-1", ShortCode = "Cosmos", IsText2Video = true, IsImage2Video = true }), CompatCosmosPredict2_2b = RegisterCompat(new() { ID = "nvidia-cosmos-predict2-t2i-2b", ShortCode = "Pred2", IsText2Video = true }), CompatCosmosPredict2_14b = RegisterCompat(new() { ID = "nvidia-cosmos-predict2-t2i-14b", ShortCode = "Pred2", IsText2Video = true }), CompatAltDiffusion = RegisterCompat(new() { ID = "alt_diffusion_v1", ShortCode = "AltD" }), CompatSana = RegisterCompat(new() { ID = "nvidia-sana-1600", ShortCode = "Sana" }), - CompatPixartMsSigmaXl2 = RegisterCompat(new() { ID = "pixart-ms-sigma-xl-2", ShortCode = "Pix" }), - CompatOvis = RegisterCompat(new() { ID = "ovis", ShortCode = "Ovis", LorasTargetTextEnc = false }), - CompatLongcatImage = RegisterCompat(new() { ID = "longcat-image", ShortCode = "LCat", LorasTargetTextEnc = false }), + CompatPixartMsSigmaXl2 = RegisterCompat(new() { ID = "pixart-ms-sigma-xl-2", ShortCode = "Pix", VaeFamily = "sdxl" }), + CompatOvis = RegisterCompat(new() { ID = "ovis", ShortCode = "Ovis", LorasTargetTextEnc = false, VaeFamily = "flux1" }), + CompatLongcatImage = RegisterCompat(new() { ID = "longcat-image", ShortCode = "LCat", LorasTargetTextEnc = false, VaeFamily = "flux1" }), CompatGenmoMochi = RegisterCompat(new() { ID = "genmo-mochi-1", IsText2Video = true, ShortCode = "Mochi" }), - CompatKandinsky5ImgLite = RegisterCompat(new() { ID = "kandinsky5-imglite", ShortCode = "Kan5IL", LorasTargetTextEnc = false }), + CompatKandinsky5ImgLite = RegisterCompat(new() { ID = "kandinsky5-imglite", ShortCode = "Kan5IL", LorasTargetTextEnc = false, VaeFamily = "flux1" }), CompatKandinsky5VidLite = RegisterCompat(new() { ID = 
"kandinsky5-vidlite", ShortCode = "Kan5VL", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }), CompatKandinsky5VidPro = RegisterCompat(new() { ID = "kandinsky5-vidpro", ShortCode = "Kan5VP", LorasTargetTextEnc = false, IsText2Video = true, IsImage2Video = true }); + /// Standard shared VAE/latent-space families. + public static Dictionary VaeFamilies = new() + { + ["flux1"] = ("flux-ae", "flux-1"), + ["flux2"] = ("flux2-vae", "flux-2"), + ["sd3"] = ("sd35-vae", "stable-diffusion-v3"), + ["sdxl"] = ("sdxl-vae", "stable-diffusion-xl-v1"), + ["qwenimage"] = ("qwen-image-vae", "qwen-image") + }; + /// Initialize the class sorter. public static void Init() { From 5a796a3164e294071b4b3c547941da0f40d9dbae Mon Sep 17 00:00:00 2001 From: "Alex \"mcmonkey\" Goodwin" Date: Thu, 4 Jun 2026 15:46:34 -0700 Subject: [PATCH 11/13] minor doc --- docs/Model Support.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Model Support.md b/docs/Model Support.md index a1c8f1ab0..ac2348e58 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -650,7 +650,7 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended - It does not use a VAE - Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically - **Parameters:** - - **Sampler:** Default is fine. + - **Sampler:** Default is fine (LCM). - **Scheduler:** Default is fine. - **CFG Scale:** `4` is recommended. - **Steps:** `30` is recommended. 
From dccc351d86dccdba18a6dea49a2880912e2e9ddb Mon Sep 17 00:00:00 2001 From: "Alex \"mcmonkey\" Goodwin" Date: Thu, 4 Jun 2026 18:21:46 -0700 Subject: [PATCH 12/13] docs --- docs/Features/README.md | 1 + docs/Features/Upscaling.md | 9 +++++++++ docs/Model Support.md | 17 +---------------- docs/Obscure Model Support.md | 17 +++++++++++++++++ launchtools/comfy-install-linux.sh | 2 +- 5 files changed, 29 insertions(+), 17 deletions(-) create mode 100644 docs/Features/Upscaling.md diff --git a/docs/Features/README.md b/docs/Features/README.md index de695173e..6b29e9e72 100644 --- a/docs/Features/README.md +++ b/docs/Features/README.md @@ -11,3 +11,4 @@ See [The Docs Readme](/docs/README.md) for general listing of documentation and - [Webhooks](/docs/Features/Webhooks.md) for info about custom defined webhooks triggered by your SwarmUI server. - [UISounds](/docs/Features/UISounds.md) for info about sound playback in the UI (eg a sound to play after generations complete). - [AutoScalingBackend](/docs/Features/AutoScalingBackend.md) for info about the specialty advanced usage "Auto-Scaling" backend (for Slurm/Kubernetes/etc). +- [Upscaling](/docs/Features/Upscaling.md) for info about upscaling images and videos (ie increasing resolution, especially to improve quality). 
diff --git a/docs/Features/Upscaling.md b/docs/Features/Upscaling.md new file mode 100644 index 000000000..0da35122e --- /dev/null +++ b/docs/Features/Upscaling.md @@ -0,0 +1,9 @@ +# Upscaling In SwarmUI + +(TODO) + +# Pixel Decoder (PiD) + +(TODO) + +Downloads here: diff --git a/docs/Model Support.md b/docs/Model Support.md index ac2348e58..c2697e2cb 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -21,7 +21,6 @@ [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast | [HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality | [Lens](#lens) | MMDiT | 2026 | Microsoft | 4B | Minimal | Modern, lightweight | -[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space | Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md): @@ -39,6 +38,7 @@ Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure [Ovis](/docs/Obscure%20Model%20Support.md#ovis) | MMDiT | 2025 | AIDC-AI (Alibaba) | 7B | No | Passable quality, but outclassed on launch | [LongCat-Image](/docs/Obscure%20Model%20Support.md#longcat-image) | MMDiT | 2025 | LongCat | 6B | No | Passable quality, but outclassed on launch | [Zeta Chroma](/docs/Obscure%20Model%20Support.md#zeta-chroma) | Pixel S3-DiT | 2026 | Lodestone Rock | 6B | No | Modern, Pixel-space Z-Image variant | +[PixelDiT](/docs/Obscure%20Model%20Support.md#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space, but very bad relative quality on launch | - **Architecture** is the fundamental machine learning structure used for the model, UNet's were used in the past but DiT (Diffusion Transformers) are the modern choice - **Scale** is how big the model is - "B" for "Billion", so for example "2B" means "Two billion parameters". 
@@ -641,21 +641,6 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended - **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal. - **Resolution:** Side length `1440` is the official default, but 1024 is a reasonable option. It retains coherence down to about 512 and up to about 2048. -# PixelDiT - -- NVIDIA's [PixelDiT]() is supported in SwarmUI! - - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8]() - - Or fat BF16 version: [Comfy-Org/PixelDiT - bf16]() - - Save in `diffusion_models` -- It does not use a VAE -- Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically -- **Parameters:** - - **Sampler:** Default is fine (LCM). - - **Scheduler:** Default is fine. - - **CFG Scale:** `4` is recommended. - - **Steps:** `30` is recommended. - - **Resolution:** Side length `1024` is the standard. - # Video Models - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md). diff --git a/docs/Obscure Model Support.md b/docs/Obscure Model Support.md index bf0064c7b..31ab00063 100644 --- a/docs/Obscure Model Support.md +++ b/docs/Obscure Model Support.md @@ -18,6 +18,7 @@ This doc tracks specifically the old, bad, unpopular, etc. 
models that are suppo [Ovis](#ovis) | MMDiT | 2025 | AIDC-AI (Alibaba) | 7B | No | Passable quality, but outclassed on launch | [LongCat-Image](#longcat-image) | MMDiT | 2025 | LongCat | 6B | No | Passable quality, but outclassed on launch | [Zeta Chroma](#zeta-chroma) | Pixel S3-DiT | 2026 | Lodestone Rock | 6B | No | Modern, Pixel-space Z-Image variant | +[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space, but very bad relative quality on launch | Obscure video models are tracked at the [Video Models heading](#video-models) @@ -200,6 +201,22 @@ These steps are not friendly to beginners (if Sana gains popularity, likely more - **Scheduler:** Default is fine - **Resolution:** Side length `1024` is the standard, broadly supports the same range as regular Z-Image (roughly 512 to 2048) +# PixelDiT + +- NVIDIA's [PixelDiT]() is supported in SwarmUI! + - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8]() + - Or fat BF16 version: [Comfy-Org/PixelDiT - bf16]() + - Save in `diffusion_models` + - Released as a pair with PiD (Pixel Decoder), which is a separate pixel decode/upscale model. See [Features/Upscaling: PiD](/docs/Features/Upscaling.md#pixel-decoder-pid) for more info. +- It does not use a VAE +- Uses the Gemma 2 2B text encoder, will be downloaded and handled automatically +- **Parameters:** + - **Sampler:** Default is fine. + - **Scheduler:** Default is fine. + - **CFG Scale:** `4` is recommended. + - **Steps:** `30` is recommended. + - **Resolution:** Side length `1024` is the standard. + -------------------------------------------------------------------------- # Video Models diff --git a/launchtools/comfy-install-linux.sh b/launchtools/comfy-install-linux.sh index 77778e945..c006ce1b2 100644 --- a/launchtools/comfy-install-linux.sh +++ b/launchtools/comfy-install-linux.sh @@ -64,7 +64,7 @@ fi # Install PyTorch based on GPU type if [ "$GPU_TYPE" == "nv" ]; then echo "install nvidia torch..." 
- $python -s -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129 --no-cache-dir + $python -s -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 --no-cache-dir elif [ "$GPU_TYPE" == "amd" ]; then echo "install amd torch..." $python -s -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1 --no-cache-dir From a774977242a61ae6a9beffdf48bffe0ef0948cb0 Mon Sep 17 00:00:00 2001 From: "Alex \"mcmonkey\" Goodwin" Date: Thu, 4 Jun 2026 18:27:55 -0700 Subject: [PATCH 13/13] base sampler input wrong pattern --- .../ComfyUIBackend/WorkflowGeneratorSteps.cs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs index ee35d9c5e..01148a142 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs @@ -1362,11 +1362,7 @@ bool getBestFor(string phrase) } if (g.IsPiD()) { - if (g.BasicInputImage is null) - { - throw new SwarmUserErrorException("PiD models are pixel decoders/upscalers, not image generators, an Init Image is required."); - } - (WGNodeData pidLatent, string pidFormat) = g.CreatePidCompatLatent(g.FinalLoadedModel, g.BasicInputImage, g.CurrentVae); + (WGNodeData pidLatent, string pidFormat) = g.CreatePidCompatLatent(g.FinalLoadedModel, g.CurrentMedia, g.CurrentVae); string pidCond = g.CreateNode("PiDConditioning", new JObject() { ["positive"] = g.FinalPrompt,