-
-
Notifications
You must be signed in to change notification settings - Fork 416
Adds NVIDIA PixelDiT and PiD support #1393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
mcmonkey4eva
merged 15 commits into
mcmonkeyprojects:master
from
jtreminio:pixeldit-pid-support
Jun 5, 2026
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
33bac0c
Adds NVIDIA PixelDiT and PiD support
jtreminio 320de43
Merge branch 'master' into pixeldit-pid-support
jtreminio 3d3a933
PiD replaces Refiner upscaler, not refiner stage itself
jtreminio 05836eb
Cleanup
jtreminio ab0665d
Docs for PixelDiT
jtreminio f10bb98
doc fix
jtreminio 145fec3
Add <pixeldecoder> hint
jtreminio f42cf7a
Use T2IParamTypes.GetBestModelInList()
jtreminio 4502cb4
Add more (pixel) keys
jtreminio eb099fc
add more why not
jtreminio d8716d5
Implements more pathways for DiT
jtreminio a6715d8
Merge branch 'master' into pr/1393
mcmonkey4eva 5a796a3
minor doc
mcmonkey4eva dccc351
docs
mcmonkey4eva a774977
base sampler input wrong pattern
mcmonkey4eva File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| # Upscaling In SwarmUI | ||
|
|
||
| (TODO) | ||
|
|
||
| # Pixel Decoder (PiD) | ||
|
|
||
| (TODO) | ||
|
|
||
| Downloads here: <https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -166,6 +166,9 @@ public JArray FinalImageOut | |
| /// <summary>If true, the generator is currently working on the refiner stage.</summary> | ||
| public bool IsRefinerStage = false; | ||
|
|
||
| /// <summary>If true, the generator is currently working on the pixel-decoder stage.</summary> | ||
| public bool IsPixelDecoderStage = false; | ||
|
|
||
| /// <summary>If true, the generator is currently working on Image2Video.</summary> | ||
| public bool IsImageToVideo = false; | ||
|
|
||
|
|
@@ -958,8 +961,13 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent | |
| latent = [srCond, 2]; | ||
| } | ||
| } | ||
| else if (IsPiD()) | ||
| { | ||
| defsampler ??= "lcm"; | ||
| defscheduler ??= "simple"; | ||
| } | ||
| // TODO: Registry of model default preferences instead of this | ||
| else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens()) | ||
| else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT()) | ||
| { | ||
| defscheduler ??= "simple"; | ||
| } | ||
|
|
@@ -2517,8 +2525,105 @@ public bool ShouldZeroNegative() | |
| return false; | ||
| } | ||
|
|
||
/// <summary>Maps each VAE family that PiD models exist for to the latent_format value the PiDConditioning node takes for it.</summary>
public static Dictionary<string, string> PidLatentFormats = new Dictionary<string, string>
{
    { "flux1", "flux" },
    { "flux2", "flux" },
    { "sd3", "sd3" },
    { "sdxl", "sdxl" },
    { "qwenimage", "qwenimage" }
};

/// <summary>Works out which VAE family a PiD model was trained against by searching its lowercased name for a known family key.</summary>
/// <param name="pidModel">The PiD model to inspect; only its name is read.</param>
/// <returns>The first key of <see cref="PidLatentFormats"/> contained in the model name, or null when no key matches.</returns>
public static string PidFamilyOfModel(T2IModel pidModel)
{
    string loweredName = pidModel.Name.ToLowerFast();
    foreach (string familyKey in PidLatentFormats.Keys)
    {
        if (loweredName.Contains(familyKey))
        {
            return familyKey;
        }
    }
    return null;
}
|
|
||
/// <summary>Converts media into a latent in the PiD model's native latent space, re-encoding through an auto-loaded matching VAE if needed.</summary>
/// <param name="pidModel">The PiD model whose name decides the target VAE family (see <see cref="PidFamilyOfModel"/>).</param>
/// <param name="media">The media to convert. Returned untouched when it is already a latent of the target family.</param>
/// <param name="decodeVae">The VAE used to turn <paramref name="media"/> into a raw image before it is re-encoded.</param>
/// <returns>The PiD-compatible latent, paired with the matching latent_format value from <see cref="PidLatentFormats"/>.</returns>
public (WGNodeData, string) CreatePidCompatLatent(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae)
{
    string mediaFamily = media.IsLatentData ? media.Compat?.VaeFamily : null;
    // Only adopt the media's own family when PiD has a latent format for it.
    // Otherwise the table lookup below would throw KeyNotFoundException for latents from an unrelated VAE family.
    string usableMediaFamily = mediaFamily is not null && PidLatentFormats.ContainsKey(mediaFamily) ? mediaFamily : null;
    // NOTE(review): a maintainer commented on this fallback that, if flux1 and flux2 both work here,
    // flux2 is (as they recall) a much better latent format. The default is left as "flux1" pending confirmation.
    string family = PidFamilyOfModel(pidModel) ?? usableMediaFamily ?? "flux1";
    string format = PidLatentFormats[family];
    if (mediaFamily == family)
    {
        // Already in the PiD model's latent space: no decode/re-encode round trip needed.
        return (media, format);
    }
    WGNodeData decoded = media.AsRawImage(decodeVae);
    (string knownVae, string vaeCompat) = T2IModelClassSorter.VaeFamilies[family];
    // Families without a matching user setting (e.g. "qwenimage") pass null as the default VAE.
    string defaultVae = family switch
    {
        "flux1" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE,
        "flux2" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE,
        "sd3" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSD3VAE,
        "sdxl" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSDXLVAE,
        _ => null
    };
    ModelLoadHelpers helpers = new(this);
    bool priorNoVae = NoVAEOverride;
    NoVAEOverride = true;
    try
    {
        helpers.DoVaeLoader(defaultVae, vaeCompat, knownVae);
    }
    finally
    {
        // Restore the flag even if the VAE load fails, so later stages do not inherit the forced value.
        NoVAEOverride = priorNoVae;
    }
    WGNodeData encodeVae = new(LoadingVAE, this, WGNodeData.DT_VAE, T2IModelClassSorter.CompatClasses[vaeCompat]);
    return (decoded.EncodeToLatent(encodeVae), format);
}
|
|
||
/// <summary>Creates a PiD pixel-decode stage: converts to a PiD-space latent and samples a 4x pixel image from it.</summary>
/// <param name="pidModel">The PiD model to load and sample with.</param>
/// <param name="media">The media to decode; its width/height (or the user input size when unset) are scaled by 4 for the output.</param>
/// <param name="decodeVae">The VAE handed to <see cref="CreatePidCompatLatent"/> for any needed re-encode.</param>
/// <param name="seed">The seed passed to the sampler.</param>
/// <param name="isRefiner">If true, the stage uses the refiner section ID, loader ID and refiner parameter fallbacks instead of the pixel-decoder ones.</param>
/// <returns>The sampled result after latent decoding, with its width and height set to the upscaled size.</returns>
public WGNodeData CreatePixelDecode(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae, long seed, bool isRefiner = false)
{
    (WGNodeData latent, string format) = CreatePidCompatLatent(pidModel, media, decodeVae);
    // Snapshot the generator state this stage temporarily replaces.
    T2IModel priorFinalModel = FinalLoadedModel;
    List<T2IModel> priorFinalModelList = FinalLoadedModelList;
    WGNodeData priorModel = CurrentModel, priorTextEnc = CurrentTextEnc, priorVae = CurrentVae;
    bool priorNoVae = NoVAEOverride;
    int sectionId = isRefiner ? T2IParamInput.SectionID_Refiner : T2IParamInput.SectionID_PixelDecoder;
    try
    {
        FinalLoadedModel = pidModel;
        FinalLoadedModelList = [pidModel];
        NoVAEOverride = true;
        IsPixelDecoderStage = !isRefiner;
        try
        {
            (FinalLoadedModel, CurrentModel, CurrentTextEnc, CurrentVae) = CreateModelLoader(pidModel, isRefiner ? "Refiner" : "PixelDecoder", sectionId: sectionId);
        }
        finally
        {
            // These two flags only apply while the model loader runs; reset them even if it throws.
            IsPixelDecoderStage = false;
            NoVAEOverride = priorNoVae;
        }
        JArray pos = CreateConditioning(UserInput.Get(T2IParamTypes.Prompt), CurrentTextEnc.Path, pidModel, true, isRefiner: isRefiner, isPixelDecoder: !isRefiner);
        JArray neg = CreateConditioning(UserInput.Get(T2IParamTypes.NegativePrompt), CurrentTextEnc.Path, pidModel, false, isRefiner: isRefiner, isPixelDecoder: !isRefiner);
        string cond = CreateNode("PiDConditioning", new JObject()
        {
            ["positive"] = pos,
            ["latent"] = latent.Path,
            ["latent_format"] = format,
            ["degrade_sigma"] = 0.0
        });
        // Output is 4x the source size, rounded down to a multiple of 16.
        int width = ((media.Width ?? UserInput.GetImageWidth()) * 4 / 16) * 16;
        int height = ((media.Height ?? UserInput.GetImageHeight()) * 4 / 16) * 16;
        string emptyLatent = CreateNode("EmptyChromaRadianceLatentImage", new JObject()
        {
            ["batch_size"] = UserInput.Get(T2IParamTypes.BatchSize, 1),
            ["width"] = width,
            ["height"] = height
        });
        // Section-specific values win; the refiner path then falls back to the refiner parameters; otherwise 4 steps at CFG 1.
        int steps = UserInput.GetNullable(T2IParamTypes.Steps, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerSteps) : null) ?? 4;
        double cfg = UserInput.GetNullable(T2IParamTypes.CFGScale, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerCFGScale) : null) ?? 1;
        string explicitSampler = UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null) : null);
        string explicitScheduler = UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null) : null);
        string sampled = CreateKSampler(CurrentModel.Path, [cond, 0], neg, [emptyLatent, 0], cfg, steps, 0, 10000, seed, false, true,
            explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: sectionId);
        WGNodeData result = media.WithPath([sampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass);
        result.Width = width;
        result.Height = height;
        // Decode with the PiD stage's VAE before the finally block swaps the previous one back in.
        return result.DecodeLatents(CurrentVae, false);
    }
    finally
    {
        // Put the previous model state back whether the stage succeeded or threw.
        FinalLoadedModel = priorFinalModel;
        FinalLoadedModelList = priorFinalModelList;
        CurrentModel = priorModel;
        CurrentTextEnc = priorTextEnc;
        CurrentVae = priorVae;
    }
}
|
|
||
| /// <summary>Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant.</summary> | ||
| public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false) | ||
| public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false) | ||
| { | ||
| PromptRegion regionalizer = new(prompt); | ||
| string globalPromptText = regionalizer.GlobalPrompt; | ||
|
|
@@ -2534,7 +2639,11 @@ public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, boo | |
| { | ||
| globalPromptText = $"{globalPromptText} {regionalizer.RefinerPrompt}"; | ||
| } | ||
| else if (!isVideo && !isRefiner && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt)) | ||
| else if (isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.PixelDecoderPrompt)) | ||
| { | ||
| globalPromptText = $"{globalPromptText} {regionalizer.PixelDecoderPrompt}"; | ||
| } | ||
| else if (!isVideo && !isRefiner && !isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt)) | ||
| { | ||
| globalPromptText = $"{globalPromptText} {regionalizer.BasePrompt}"; | ||
| } | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.