mcmonkeyprojects · mcmonkey4eva · Jun 5, 2026 · May 26, 2026 · May 27, 2026 · May 27, 2026
diff --git a/docs/Features/README.md b/docs/Features/README.md
@@ -11,3 +11,4 @@ See [The Docs Readme](/docs/README.md) for general listing of documentation and
 - [Webhooks](/docs/Features/Webhooks.md) for info about custom defined webhooks triggered by your SwarmUI server.
 - [UISounds](/docs/Features/UISounds.md) for info about sound playback in the UI (eg a sound to play after generations complete).
 - [AutoScalingBackend](/docs/Features/AutoScalingBackend.md) for info about the specialty advanced usage "Auto-Scaling" backend (for Slurm/Kubernetes/etc).
+- [Upscaling](/docs/Features/Upscaling.md) for info about upscaling images and videos (ie increasing resolution, especially to improve quality).
diff --git a/docs/Features/Upscaling.md b/docs/Features/Upscaling.md
@@ -0,0 +1,9 @@
+# Upscaling In SwarmUI
+
+(TODO)
+
+# Pixel Decoder (PiD)
+
+(TODO)
+
+Downloads here: <https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models>
diff --git a/docs/Model Support.md b/docs/Model Support.md
@@ -38,6 +38,7 @@ Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure
 [Ovis](/docs/Obscure%20Model%20Support.md#ovis) | MMDiT | 2025 | AIDC-AI (Alibaba) | 7B | No | Passable quality, but outclassed on launch |
 [LongCat-Image](/docs/Obscure%20Model%20Support.md#longcat-image) | MMDiT | 2025 | LongCat | 6B | No | Passable quality, but outclassed on launch |
 [Zeta Chroma](/docs/Obscure%20Model%20Support.md#zeta-chroma) | Pixel S3-DiT | 2026 | Lodestone Rock | 6B | No | Modern, Pixel-space Z-Image variant |
+[PixelDiT](/docs/Obscure%20Model%20Support.md#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space, but very bad relative quality on launch |
 
 - **Architecture** is the fundamental machine learning structure used for the model, UNet's were used in the past but DiT (Diffusion Transformers) are the modern choice
 - **Scale** is how big the model is - "B" for "Billion", so for example "2B" means "Two billion parameters".

diff --git a/docs/Obscure Model Support.md b/docs/Obscure Model Support.md
@@ -18,6 +18,7 @@ This doc tracks specifically the old, bad, unpopular, etc. models that are suppo
 [Ovis](#ovis) | MMDiT | 2025 | AIDC-AI (Alibaba) | 7B | No | Passable quality, but outclassed on launch |
 [LongCat-Image](#longcat-image) | MMDiT | 2025 | LongCat | 6B | No | Passable quality, but outclassed on launch |
 [Zeta Chroma](#zeta-chroma) | Pixel S3-DiT | 2026 | Lodestone Rock | 6B | No | Modern, Pixel-space Z-Image variant |
+[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space, but very bad relative quality on launch |
 
 Obscure video models are tracked at the [Video Models heading](#video-models)
 
@@ -200,6 +201,22 @@ These steps are not friendly to beginners (if Sana gains popularity, likely more
     - **Scheduler:** Default is fine
     - **Resolution:** Side length `1024` is the standard, broadly supports the same range as regular Z-Image (roughly 512 to 2048)
 
+# PixelDiT
+
+- NVIDIA's [PixelDiT](<https://huggingface.co/Comfy-Org/PixelDiT>) is supported in SwarmUI!
+    - The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
+    - Or the larger BF16 version: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
+    - Save in `diffusion_models`
+    - Released as a pair with PiD (Pixel Decoder), which is a separate pixel decode/upscale model. See [Features/Upscaling: PiD](/docs/Features/Upscaling.md#pixel-decoder-pid) for more info.
+- It does not use a VAE
+- Uses the Gemma 2 2B text encoder, which will be downloaded and handled automatically
+- **Parameters:**
+    - **Sampler:** Default is fine.
+    - **Scheduler:** Default is fine.
+    - **CFG Scale:** `4` is recommended.
+    - **Steps:** `30` is recommended.
+    - **Resolution:** Side length `1024` is the standard.
+
 --------------------------------------------------------------------------
 
 # Video Models

diff --git a/launchtools/comfy-install-linux.sh b/launchtools/comfy-install-linux.sh
@@ -64,7 +64,7 @@ fi
 # Install PyTorch based on GPU type
 if [ "$GPU_TYPE" == "nv" ]; then
     echo "install nvidia torch..."
-    $python -s -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129 --no-cache-dir
+    $python -s -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 --no-cache-dir
 elif [ "$GPU_TYPE" == "amd" ]; then
     echo "install amd torch..."
     $python -s -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1 --no-cache-dir

diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIBackendExtension.cs
@@ -614,6 +614,8 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo)
 
     public static T2IRegisteredParam<int> RefinerHyperTile, VideoFrameInterpolationMultiplier;
 
+    public static T2IRegisteredParam<T2IModel> PixelDecoderModel;
+
     public static T2IRegisteredParam<string>[] ControlNetPreprocessorParams = new T2IRegisteredParam<string>[3], ControlNetUnionTypeParams = new T2IRegisteredParam<string>[3];
 
     public static List<string> UpscalerModels = ["pixel-lanczos///Pixel: Lanczos (cheap + high quality)", "pixel-bicubic///Pixel: Bicubic (Basic)", "pixel-area///Pixel: Area", "pixel-bilinear///Pixel: Bilinear", "pixel-nearest-exact///Pixel: Nearest-Exact (Pixel art)", "latent-bislerp///Latent: Bislerp", "latent-bicubic///Latent: Bicubic", "latent-area///Latent: Area", "latent-bilinear///Latent: Bilinear", "latent-nearest-exact///Latent: Nearest-Exact"],
@@ -635,6 +637,25 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo)
         ],
         Schedulers = ["normal///Normal", "karras///Karras", "exponential///Exponential", "simple///Simple", "ddim_uniform///DDIM Uniform", "sgm_uniform///SGM Uniform", "turbo///Turbo (for turbo models, max 10 steps)", "align_your_steps///Align Your Steps (Model-specific behavior)", "beta///Beta", "linear_quadratic///Linear Quadratic (Mochi)", "ltxv///LTX-Video", "ltxv-image///LTXV-Image", "kl_optimal///KL Optimal (Nvidia AYS)", "flux2///Flux.2"];
 
+    /// <summary>Builds the upscale-method dropdown entries ("pidmodel-NAME///PiD Model: NAME") for every model visible to the session whose compat class ID is "pid", sorted by name.</summary>
+    public static List<string> PidUpscaleModels(Session session) => Program.MainSDModels.ListModelsFor(session).Where(model => model.ModelClass?.CompatClass?.ID == "pid").OrderBy(model => model.Name).Select(model => $"pidmodel-{model.Name}///PiD Model: {model.Name}").ToList();
+
+    /// <summary>Finds the best-matching model for the given name among the session's models and returns it. Throws a <see cref="SwarmUserErrorException"/> if nothing matches or the match is not in the "pid" compat class.</summary>
+    public static T2IModel GetPidModel(string name, Session session)
+    {
+        string bestName = T2IParamTypes.GetBestModelInList(name, Program.MainSDModels.ListModelNamesFor(session));
+        if (bestName?.EndsWith(".safetensors") == true)
+        {
+            bestName = bestName.BeforeLast('.');
+        }
+        T2IModel found = bestName is not null ? Program.MainSDModels.GetModel(bestName) : null;
+        if (found?.ModelClass?.CompatClass?.ID == "pid")
+        {
+            return found;
+        }
+        throw new SwarmUserErrorException($"PiD model '{name}' could not be found, or is not a valid PiD model.");
+    }
+
     public static List<string> IPAdapterModels = ["None"], IPAdapterWeightTypes = ["standard", "prompt is more important", "style transfer"];
 
     public static List<string> GligenModels = ["None"], YoloModels = [], StyleModels = ["None"], SetClipDevices = ["cpu"];
@@ -752,7 +773,11 @@ public override void OnInit()
             ));
         RefinerUpscaleMethod = T2IParamTypes.Register<string>(new("Refiner Upscale Method", "How to upscale the image, if upscaling is used.",
             "pixel-lanczos", Group: T2IParamTypes.GroupRefiners, OrderPriority: -1, FeatureFlag: "comfyui", ChangeWeight: 1,
-            GetValues: (_) => UpscalerModels, DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
+            GetValues: (session) => [.. UpscalerModels, .. PidUpscaleModels(session)], DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
+            ));
+        PixelDecoderModel = T2IParamTypes.Register<T2IModel>(new("Pixel Decoder Model", "Optionally use a PiD (Pixel Diffusion Decoder) model.",
+            "", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupAdvancedModelAddons, IsAdvanced: true, Subtype: "Stable-Diffusion", ChangeWeight: 4, DoNotPreview: true, OrderPriority: 14,
+            GetValues: (session) => T2IParamTypes.CleanModelList(Program.MainSDModels.ListModelsFor(session).Where(m => m.ModelClass?.CompatClass?.ID == "pid").OrderBy(m => m.Name).Select(m => m.Name))
             ));
         RefinerSamplerParam = T2IParamTypes.Register<string>(new("Refiner Sampler", SamplerParam.Type.Description + "\nThis is an override to only affect the Refine/Upscale stage.",
             "euler", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupRefinerOverrides, OrderPriority: -2,

diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
@@ -166,6 +166,9 @@ public JArray FinalImageOut
     /// <summary>If true, the generator is currently working on the refiner stage.</summary>
     public bool IsRefinerStage = false;
 
+    /// <summary>If true, the generator is currently working on the pixel-decoder stage.</summary>
+    public bool IsPixelDecoderStage = false;
+
     /// <summary>If true, the generator is currently working on Image2Video.</summary>
     public bool IsImageToVideo = false;
 
@@ -958,8 +961,13 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
                 latent = [srCond, 2];
             }
         }
+        else if (IsPiD())
+        {
+            defsampler ??= "lcm";
+            defscheduler ??= "simple";
+        }
         // TODO: Registry of model default preferences instead of this
-        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens())
+        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT())
         {
             defscheduler ??= "simple";
         }
@@ -2517,8 +2525,105 @@ public bool ShouldZeroNegative()
         return false;
     }
 
+    /// <summary>Maps each VAE family key that a PiD model can target to the 'latent_format' input value given to the PiDConditioning node.</summary>
+    public static Dictionary<string, string> PidLatentFormats = new()
+    {
+        { "flux1", "flux" },
+        { "flux2", "flux" },
+        { "sd3", "sd3" },
+        { "sdxl", "sdxl" },
+        { "qwenimage", "qwenimage" }
+    };
+
+    /// <summary>Returns the first <see cref="PidLatentFormats"/> key found inside the lowercased PiD model name, or null when the name contains none of them.</summary>
+    public static string PidFamilyOfModel(T2IModel pidModel)
+    {
+        string loweredName = pidModel.Name.ToLowerFast();
+        return PidLatentFormats.Keys.FirstOrDefault(familyKey => loweredName.Contains(familyKey));
+    }
+
+    /// <summary>Converts media into a latent in the PiD model's native latent space, re-encoding through an auto-loaded matching VAE if needed. Returns the latent together with the PiDConditioning 'latent_format' value. The target family comes from the PiD model's name, else from the input latent's own family when <see cref="PidLatentFormats"/> has an entry for it, else "flux1".</summary>
+    public (WGNodeData, string) CreatePidCompatLatent(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae)
+    {
+        string mediaFamily = media.IsLatentData ? media.Compat?.VaeFamily : null;
+        string family = PidFamilyOfModel(pidModel) ?? (mediaFamily is not null && PidLatentFormats.ContainsKey(mediaFamily) ? mediaFamily : "flux1"); // an input latent family with no PiD format falls back to flux1 instead of failing the dictionary lookups below
+        string format = PidLatentFormats[family];
+        if (mediaFamily == family)
+        {
+            return (media, format); // already a latent of the target family, no re-encode needed
+        }
+        WGNodeData decoded = media.AsRawImage(decodeVae);
+        (string knownVae, string vaeCompat) = T2IModelClassSorter.VaeFamilies[family];
+        string defaultVae = family switch
+        {
+            "flux1" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE,
+            "flux2" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE,
+            "sd3" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSD3VAE,
+            "sdxl" => UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultSDXLVAE,
+            _ => null // no user default setting is read for the remaining families (eg qwenimage)
+        };
+        ModelLoadHelpers helpers = new(this);
+        bool priorNoVae = NoVAEOverride;
+        NoVAEOverride = true; // held only for the duration of the VAE load, then put back
+        helpers.DoVaeLoader(defaultVae, vaeCompat, knownVae);
+        NoVAEOverride = priorNoVae;
+        WGNodeData encodeVae = new(LoadingVAE, this, WGNodeData.DT_VAE, T2IModelClassSorter.CompatClasses[vaeCompat]);
+        return (decoded.EncodeToLatent(encodeVae), format);
+    }
+
+    /// <summary>Creates a PiD pixel-decode stage: converts to a PiD-space latent and samples a 4x pixel image from it. Loads the PiD model under the "Refiner" section when isRefiner is true, otherwise under the "PixelDecoder" section, and restores the previously loaded model, text encoder and VAE state before returning the decoded result. Steps default to 4 and CFG to 1 when no section (or, for the refiner, refiner-specific) value is set.</summary>
+    public WGNodeData CreatePixelDecode(T2IModel pidModel, WGNodeData media, WGNodeData decodeVae, long seed, bool isRefiner = false)
+    {
+        (WGNodeData latent, string format) = CreatePidCompatLatent(pidModel, media, decodeVae);
+        T2IModel priorFinalModel = FinalLoadedModel; // capture generator state so it can be restored at the end
+        List<T2IModel> priorFinalModelList = FinalLoadedModelList;
+        WGNodeData priorModel = CurrentModel, priorTextEnc = CurrentTextEnc, priorVae = CurrentVae;
+        bool priorNoVae = NoVAEOverride;
+        int sectionId = isRefiner ? T2IParamInput.SectionID_Refiner : T2IParamInput.SectionID_PixelDecoder;
+        FinalLoadedModel = pidModel;
+        FinalLoadedModelList = [pidModel];
+        NoVAEOverride = true;
+        IsPixelDecoderStage = !isRefiner;
+        (FinalLoadedModel, CurrentModel, CurrentTextEnc, CurrentVae) = CreateModelLoader(pidModel, isRefiner ? "Refiner" : "PixelDecoder", sectionId: sectionId);
+        IsPixelDecoderStage = false; // the stage flag and VAE override are only held while the model loader runs
+        NoVAEOverride = priorNoVae;
+        JArray pos = CreateConditioning(UserInput.Get(T2IParamTypes.Prompt), CurrentTextEnc.Path, pidModel, true, isRefiner: isRefiner, isPixelDecoder: !isRefiner);
+        JArray neg = CreateConditioning(UserInput.Get(T2IParamTypes.NegativePrompt), CurrentTextEnc.Path, pidModel, false, isRefiner: isRefiner, isPixelDecoder: !isRefiner);
+        string cond = CreateNode("PiDConditioning", new JObject()
+        {
+            ["positive"] = pos,
+            ["latent"] = latent.Path,
+            ["latent_format"] = format,
+            ["degrade_sigma"] = 0.0
+        });
+        int width = ((media.Width ?? UserInput.GetImageWidth()) * 4 / 16) * 16; // 4x the input size, rounded down to a multiple of 16
+        int height = ((media.Height ?? UserInput.GetImageHeight()) * 4 / 16) * 16;
+        string emptyLatent = CreateNode("EmptyChromaRadianceLatentImage", new JObject()
+        {
+            ["batch_size"] = UserInput.Get(T2IParamTypes.BatchSize, 1),
+            ["width"] = width,
+            ["height"] = height
+        });
+        int steps = UserInput.GetNullable(T2IParamTypes.Steps, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerSteps) : null) ?? 4; // section value, then refiner value (refiner only), then 4
+        double cfg = UserInput.GetNullable(T2IParamTypes.CFGScale, sectionId, false) ?? (isRefiner ? UserInput.GetNullable(T2IParamTypes.RefinerCFGScale) : null) ?? 1;
+        string explicitSampler = UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSamplerParam, null) : null);
+        string explicitScheduler = UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: sectionId, includeBase: false) ?? (isRefiner ? UserInput.Get(ComfyUIBackendExtension.RefinerSchedulerParam, null) : null);
+        string sampled = CreateKSampler(CurrentModel.Path, [cond, 0], neg, [emptyLatent, 0], cfg, steps, 0, 10000, seed, false, true,
+            explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: sectionId);
+        WGNodeData result = media.WithPath([sampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass);
+        result.Width = width;
+        result.Height = height;
+        result = result.DecodeLatents(CurrentVae, false);
+        FinalLoadedModel = priorFinalModel; // put back the generator state captured at the top
+        FinalLoadedModelList = priorFinalModelList;
+        CurrentModel = priorModel;
+        CurrentTextEnc = priorTextEnc;
+        CurrentVae = priorVae;
+        return result;
+    }
+
     /// <summary>Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant.</summary>
-    public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false)
+    public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false)
     {
         PromptRegion regionalizer = new(prompt);
         string globalPromptText = regionalizer.GlobalPrompt;
@@ -2534,7 +2639,11 @@ public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, boo
         {
             globalPromptText = $"{globalPromptText} {regionalizer.RefinerPrompt}";
         }
-        else if (!isVideo && !isRefiner && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt))
+        else if (isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.PixelDecoderPrompt))
+        {
+            globalPromptText = $"{globalPromptText} {regionalizer.PixelDecoderPrompt}";
+        }
+        else if (!isVideo && !isRefiner && !isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt))
         {
             globalPromptText = $"{globalPromptText} {regionalizer.BasePrompt}";
         }