diff --git a/easy_samplers.py b/easy_samplers.py index d44e77c..86b5df1 100644 --- a/easy_samplers.py +++ b/easy_samplers.py @@ -16,6 +16,50 @@ from .nodes_registry import comfy_node +def _make_av_latent_dict(video_latent_dict, audio_tensor, audio_noise_mask=None): + """Wrap video latent dict + audio tensor into AV latent dict with NestedTensor. + + If audio_tensor is None, returns video_latent_dict unchanged. + Creates matching noise masks for both modalities when either is present. + """ + if audio_tensor is None: + return video_latent_dict + result = video_latent_dict.copy() + result["samples"] = NestedTensor([result["samples"], audio_tensor]) + video_mask = result.get("noise_mask") + if video_mask is not None or audio_noise_mask is not None: + if video_mask is None: + vs = result["samples"].tensors[0] + video_mask = torch.ones( + vs.shape[0], 1, vs.shape[2], vs.shape[3], vs.shape[4], + device=vs.device, dtype=vs.dtype, + ) + if audio_noise_mask is None: + audio_noise_mask = torch.ones( + audio_tensor.shape[0], 1, audio_tensor.shape[2], audio_tensor.shape[3], + device=audio_tensor.device, dtype=audio_tensor.dtype, + ) + result["noise_mask"] = NestedTensor([video_mask, audio_noise_mask]) + return result + + +def _split_av_latent_dict(latent_dict): + """Split AV latent dict into (video_latent_dict, audio_tensor). + + If the latent is not an AV NestedTensor, returns (latent_dict, None). 
+ """ + samples = latent_dict["samples"] + if not isinstance(samples, NestedTensor) or len(samples.tensors) < 2: + return latent_dict, None + result = latent_dict.copy() + result["samples"] = samples.tensors[0] + audio = samples.tensors[1] + nm = result.get("noise_mask") + if nm is not None and isinstance(nm, NestedTensor): + result["noise_mask"] = nm.tensors[0] + return result, audio + + def _get_raw_conds_from_guider(guider): if not hasattr(guider, "raw_conds"): if "negative" not in guider.original_conds: @@ -148,6 +192,7 @@ def sample( optional_initialization_latents=None, guiding_start_step=0, guiding_end_step=1000, + _audio_tile=None, ): guider = copy.copy(guider) guider.original_conds = copy.deepcopy(guider.original_conds) @@ -262,13 +307,15 @@ def sample( # Denoise the latent video print("Denoising with conditioning on sigmas: ", middle_sigmas) + _av = _make_av_latent_dict(latents, _audio_tile) (output_latents, denoised_output_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=middle_sigmas, - latent_image=latents, + latent_image=_av, ) + denoised_output_latents, _audio_tile = _split_av_latent_dict(denoised_output_latents) # Clean up guides if image conditioning was used positive, negative, denoised_output_latents = LTXVCropGuides.execute( @@ -284,13 +331,18 @@ def sample( "Denoising with no conditioning but with classical i2v noise mask on sigmas: ", low_sigmas, ) + _av = _make_av_latent_dict(denoised_output_latents, _audio_tile) (_, denoised_output_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=low_sigmas, - latent_image=denoised_output_latents, + latent_image=_av, ) + denoised_output_latents, _audio_tile = _split_av_latent_dict(denoised_output_latents) + + if _audio_tile is not None: + denoised_output_latents["_audio"] = _audio_tile return (denoised_output_latents, positive, negative) @@ -399,6 +451,8 @@ def sample( guiding_start_step=0, guiding_end_step=1000, 
normalize_per_frame=False, + _audio_tile=None, + _audio_new_init=None, ): guider = copy.copy(guider) guider.original_conds = copy.deepcopy(guider.original_conds) @@ -412,7 +466,20 @@ def sample( positive, negative = _get_raw_conds_from_guider(guider) + # Handle AV latents (standalone mode) + _standalone_av = False + _accumulated_audio = _audio_tile samples = latents["samples"] + if isinstance(samples, NestedTensor) and len(samples.tensors) == 2: + if _accumulated_audio is None: + _accumulated_audio = samples.tensors[1] + _standalone_av = True + latents = latents.copy() + latents["samples"] = samples.tensors[0] + if "noise_mask" in latents and isinstance(latents["noise_mask"], NestedTensor): + latents["noise_mask"] = latents["noise_mask"].tensors[0] + samples = latents["samples"] + batch, channels, frames, height, width = samples.shape time_scale_factor, width_scale_factor, height_scale_factor = ( vae.downscale_index_formula @@ -428,6 +495,52 @@ def sample( latents, -overlap, -1 ) + # Set up audio extend tile if audio is available + _audio_extend_tile = None + _audio_noise_mask = None + _audio_overlap = 0 + if _accumulated_audio is not None: + audio_T = _accumulated_audio.shape[2] + video_T = frames + audio_ratio = audio_T / max(video_T, 1) + _audio_overlap = max(1, round(overlap * audio_ratio)) + video_new_latent_frames = num_new_frames // time_scale_factor + audio_new_frames = max(1, round(video_new_latent_frames * audio_ratio)) + + # Build audio tile: overlap (already denoised) + new frames. + # If _audio_new_init is provided (stage-2 refinement), use it + # as initialization for the new frames instead of zeros. 
+ audio_overlap_data = _accumulated_audio[:, :, -_audio_overlap:] + if _audio_new_init is not None: + available = min(audio_new_frames, _audio_new_init.shape[2]) + audio_new_data = _audio_new_init[:, :, :available].clone() + if available < audio_new_frames: + pad = torch.zeros( + _accumulated_audio.shape[0], _accumulated_audio.shape[1], + audio_new_frames - available, _accumulated_audio.shape[3], + device=_accumulated_audio.device, dtype=_accumulated_audio.dtype, + ) + audio_new_data = torch.cat([audio_new_data, pad], dim=2) + else: + audio_new_data = torch.zeros( + _accumulated_audio.shape[0], _accumulated_audio.shape[1], + audio_new_frames, _accumulated_audio.shape[3], + device=_accumulated_audio.device, dtype=_accumulated_audio.dtype, + ) + _audio_extend_tile = torch.cat([audio_overlap_data, audio_new_data], dim=2) + + # Audio noise mask: preserve overlap, denoise new + _audio_noise_mask = torch.ones( + _audio_extend_tile.shape[0], 1, + _audio_extend_tile.shape[2], _audio_extend_tile.shape[3], + device=_audio_extend_tile.device, dtype=_audio_extend_tile.dtype, + ) + _audio_noise_mask[:, :, :_audio_overlap] = 1.0 - strength + print( + f"[ExtendSampler] Audio extend tile: overlap={_audio_overlap}, " + f"new={audio_new_frames}, total={_audio_extend_tile.shape[2]}" + ) + if optional_initialization_latents is None: new_latents = EmptyLTXVLatentVideo.execute( width=width * width_scale_factor, @@ -488,13 +601,15 @@ def sample( if len(high_sigmas) > 1: guider.set_conds(positive, negative) print("Denoising with overlap conditioning only on sigmas: ", high_sigmas) + _av = _make_av_latent_dict(new_latents, _audio_extend_tile, _audio_noise_mask) (_, new_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=high_sigmas, - latent_image=new_latents, + latent_image=_av, ) + new_latents, _audio_extend_tile = _split_av_latent_dict(new_latents) if optional_guiding_latents is not None: optional_guiding_latents = 
LTXVSelectLatents().select_latents( @@ -533,13 +648,15 @@ def sample( # Denoise the latent video print("Denoising with full conditioning on sigmas: ", middle_sigmas) + _av = _make_av_latent_dict(new_latents, _audio_extend_tile, _audio_noise_mask) (output_latents, denoised_output_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=middle_sigmas, - latent_image=new_latents, + latent_image=_av, ) + denoised_output_latents, _audio_extend_tile = _split_av_latent_dict(denoised_output_latents) positive, negative, denoised_output_latents = LTXVCropGuides.execute( positive=positive, @@ -591,13 +708,15 @@ def sample( "Denoising with overlap + keyframes conditioning only on sigmas: ", low_sigmas, ) + _av = _make_av_latent_dict(denoised_output_latents, _audio_extend_tile, _audio_noise_mask) (_, denoised_output_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=low_sigmas, - latent_image=denoised_output_latents, + latent_image=_av, ) + denoised_output_latents, _audio_extend_tile = _split_av_latent_dict(denoised_output_latents) positive, negative, denoised_output_latents = LTXVCropGuides.execute( positive=positive, negative=negative, @@ -621,6 +740,16 @@ def sample( (latents,) = LinearOverlapLatentTransition().process( latents, truncated_denoised_output_latents, overlap - 1, axis=2 ) + + # Accumulate audio: append new (non-overlap) audio frames + if _accumulated_audio is not None and _audio_extend_tile is not None: + new_audio = _audio_extend_tile[:, :, _audio_overlap:] + accumulated_audio_out = torch.cat([_accumulated_audio, new_audio], dim=2) + if _standalone_av: + latents["samples"] = NestedTensor([latents["samples"], accumulated_audio_out]) + else: + latents["_audio"] = accumulated_audio_out + return (latents, positive, negative) @@ -692,6 +821,7 @@ def sample( guiding_strength=1.0, guiding_start_step=0, guiding_end_step=1000, + _audio_tile=None, ): guider = copy.copy(guider) 
guider.original_conds = copy.deepcopy(guider.original_conds) @@ -735,13 +865,15 @@ def sample( "Denoising with keyframes only [if available] on sigmas: ", high_sigmas, ) + _av = _make_av_latent_dict(new_latents, _audio_tile) (_, new_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=high_sigmas, - latent_image=new_latents, + latent_image=_av, ) + new_latents, _audio_tile = _split_av_latent_dict(new_latents) if optional_cond_indices is not None and 0 in optional_cond_indices: guiding_latents = LTXVSelectLatents().select_latents( @@ -806,13 +938,15 @@ def sample( # Denoise the latent video print("Denoising with full conditioning on sigmas: ", middle_sigmas) + _av = _make_av_latent_dict(new_latents, _audio_tile) (_, denoised_output_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=middle_sigmas, - latent_image=new_latents, + latent_image=_av, ) + denoised_output_latents, _audio_tile = _split_av_latent_dict(denoised_output_latents) # Clean up guides if image conditioning was used positive, negative, denoised_output_latents = LTXVCropGuides.execute( @@ -827,19 +961,24 @@ def sample( "Denoising with keyframes only [if available] conditioning on sigmas: ", low_sigmas, ) + _av = _make_av_latent_dict(denoised_output_latents, _audio_tile) (_, denoised_output_latents) = SamplerCustomAdvanced().sample( noise=noise, guider=guider, sampler=sampler, sigmas=low_sigmas, - latent_image=denoised_output_latents, + latent_image=_av, ) + denoised_output_latents, _audio_tile = _split_av_latent_dict(denoised_output_latents) positive, negative, denoised_output_latents = LTXVCropGuides.execute( positive=positive, negative=negative, latent=denoised_output_latents, ) + if _audio_tile is not None: + denoised_output_latents["_audio"] = _audio_tile + return (denoised_output_latents, positive, negative) diff --git a/example_workflows/LTX-2.3_Two_Pass_I2V_Looping.json 
b/example_workflows/LTX-2.3_Two_Pass_I2V_Looping.json new file mode 100644 index 0000000..6ef2f4e --- /dev/null +++ b/example_workflows/LTX-2.3_Two_Pass_I2V_Looping.json @@ -0,0 +1,2048 @@ +{ + "id": "6442f6ec-19f9-4ded-93a2-00c286be6dab", + "revision": 0, + "last_node_id": 75, + "last_link_id": 65, + "nodes": [ + { + "id": 1, + "type": "LoadImage", + "pos": [0, 0], + "size": [300, 300], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "COMBO", + "widget": { "name": "image" }, + "link": null + }, + { + "localized_name": "choose file to upload", + "name": "upload", + "type": "IMAGEUPLOAD", + "widget": { "name": "upload" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [1, 8] + }, + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "slot_index": 1, + "links": [] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LoadImage" + }, + "widgets_values": ["reference_image.png", "image"] + }, + { + "id": 3, + "type": "PrimitiveInt", + "pos": [0, 800], + "size": [210, 100], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { "name": "value" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "slot_index": 0, + "links": [9, 11] + } + ], + "title": "Frame Count", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "PrimitiveInt" + }, + "widgets_values": [241, "fixed"] + }, + { + "id": 20, + "type": "CLIPTextEncode", + "pos": [900, 0], + "size": [400, 180], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { "localized_name": "clip", "name": "clip", "type": "CLIP", "link": 3 }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { "name": 
"text" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [5] + } + ], + "title": "Positive Prompt", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "A woman walks through a sunlit meadow. Warm breeze rustles the tall grass. Birds sing in the distance. She pauses to admire wildflowers." + ] + }, + { + "id": 21, + "type": "CLIPTextEncode", + "pos": [900, 220], + "size": [400, 120], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { "localized_name": "clip", "name": "clip", "type": "CLIP", "link": 4 }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { "name": "text" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [6] + } + ], + "title": "Negative Prompt", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "pc game, console game, video game, cartoon, childish, ugly, blurry" + ] + }, + { + "id": 40, + "type": "RandomNoise", + "pos": [1950, -80], + "size": [210, 100], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { "name": "noise_seed" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "slot_index": 0, + "links": [27] + } + ], + "title": "Stage 1 Noise", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "RandomNoise" + }, + "widgets_values": [42, "fixed"] + }, + { + "id": 41, + "type": "KSamplerSelect", + "pos": [1950, 40], + "size": [250, 80], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "sampler_name", + 
"name": "sampler_name", + "type": "COMBO", + "widget": { "name": "sampler_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "slot_index": 0, + "links": [28] + } + ], + "title": "Stage 1 Sampler", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "KSamplerSelect" + }, + "widgets_values": ["euler_ancestral_cfg_pp"] + }, + { + "id": 42, + "type": "ManualSigmas", + "pos": [1950, 140], + "size": [350, 80], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "STRING", + "widget": { "name": "sigmas" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "slot_index": 0, + "links": [29] + } + ], + "title": "Stage 1 Sigmas", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "ManualSigmas" + }, + "widgets_values": [ + "1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0" + ] + }, + { + "id": 44, + "type": "LTXVLoopingSampler", + "pos": [1950, 400], + "size": [400, 580], + "flags": {}, + "order": 28, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 25 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 26 }, + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 27 + }, + { + "localized_name": "sampler", + "name": "sampler", + "type": "SAMPLER", + "link": 28 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 29 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 30 + }, + { + "localized_name": "latents", + "name": "latents", + "type": "LATENT", + "link": 31 + }, + { + "localized_name": "optional_cond_images", + "name": "optional_cond_images", + "shape": 7, + "type": "IMAGE", + "link": 32 + }, + { + 
"localized_name": "optional_guiding_latents", + "name": "optional_guiding_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "optional_positive_conditionings", + "name": "optional_positive_conditionings", + "shape": 7, + "type": "CONDITIONING", + "link": null + }, + { + "localized_name": "optional_negative_index_latents", + "name": "optional_negative_index_latents", + "shape": 7, + "type": "LATENT", + "link": 33 + }, + { + "localized_name": "optional_normalizing_latents", + "name": "optional_normalizing_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "temporal_tile_size", + "name": "temporal_tile_size", + "type": "INT", + "widget": { "name": "temporal_tile_size" }, + "link": null + }, + { + "localized_name": "temporal_overlap", + "name": "temporal_overlap", + "type": "INT", + "widget": { "name": "temporal_overlap" }, + "link": null + }, + { + "localized_name": "guiding_strength", + "name": "guiding_strength", + "type": "FLOAT", + "widget": { "name": "guiding_strength" }, + "link": null + }, + { + "localized_name": "temporal_overlap_cond_strength", + "name": "temporal_overlap_cond_strength", + "type": "FLOAT", + "widget": { "name": "temporal_overlap_cond_strength" }, + "link": null + }, + { + "localized_name": "cond_image_strength", + "name": "cond_image_strength", + "type": "FLOAT", + "widget": { "name": "cond_image_strength" }, + "link": null + }, + { + "localized_name": "horizontal_tiles", + "name": "horizontal_tiles", + "type": "INT", + "widget": { "name": "horizontal_tiles" }, + "link": null + }, + { + "localized_name": "vertical_tiles", + "name": "vertical_tiles", + "type": "INT", + "widget": { "name": "vertical_tiles" }, + "link": null + }, + { + "localized_name": "spatial_overlap", + "name": "spatial_overlap", + "type": "INT", + "widget": { "name": "spatial_overlap" }, + "link": null + }, + { + "localized_name": "adain_factor", + "name": "adain_factor", + "shape": 7, + "type": 
"FLOAT", + "widget": { "name": "adain_factor" }, + "link": null + }, + { + "localized_name": "guiding_start_step", + "name": "guiding_start_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_start_step" }, + "link": null + }, + { + "localized_name": "guiding_end_step", + "name": "guiding_end_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_end_step" }, + "link": null + }, + { + "localized_name": "optional_cond_image_indices", + "name": "optional_cond_image_indices", + "shape": 7, + "type": "STRING", + "widget": { "name": "optional_cond_image_indices" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "slot_index": 0, + "links": [34] + } + ], + "title": "Stage 1 — Generate", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVLoopingSampler" + }, + "widgets_values": [128, 24, 1, 0.5, 1, 1, 1, 1, 0.15, 0, 1000, "0"], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 4, + "type": "PrimitiveFloat", + "pos": [0, 930], + "size": [210, 100], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { "name": "value" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "slot_index": 0, + "links": [7, 62, 64] + } + ], + "title": "Frame Rate", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "PrimitiveFloat" + }, + "widgets_values": [24] + }, + { + "id": 75, + "type": "FloatToInt", + "pos": [991.6666666666669, 928.3333333333287], + "size": [270, 82], + "flags": { "collapsed": true }, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "float_value", + "name": "float_value", + "type": "FLOAT", + "widget": { "name": "float_value" }, + "link": 64 + }, + { + "localized_name": 
"rounding_mode", + "name": "rounding_mode", + "type": "COMBO", + "widget": { "name": "rounding_mode" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "int_value", + "name": "int_value", + "type": "INT", + "links": [65] + } + ], + "properties": { + "aux_id": "danTheMonk/comfyui-int-and-float", + "ver": "a8b5a383ec6b5cff43c2f81a9a3aa24b87c4c720", + "Node name for S&R": "FloatToInt" + }, + "widgets_values": [0, "down (floor)"] + }, + { + "id": 2, + "type": "LTXVPreprocess", + "pos": [11.666666666666666, 355.00000000000017], + "size": [220, 58], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1 + }, + { + "localized_name": "img_compression", + "name": "img_compression", + "type": "INT", + "widget": { "name": "img_compression" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "output_image", + "name": "output_image", + "type": "IMAGE", + "slot_index": 0, + "links": [15, 20, 32] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVPreprocess" + }, + "widgets_values": [18] + }, + { + "id": 23, + "type": "ResizeImageMaskNode", + "pos": [8.333333333333284, 505.00000000000085], + "size": [300, 106], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 8 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { "name": "resize_type" }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { "name": "resize_type.longer_size" }, + "link": null + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { "name": "scale_method" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "IMAGE", 
+ "slot_index": 0, + "links": [39, 54] + } + ], + "title": "Resize Reference", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "ResizeImageMaskNode" + }, + "widgets_values": ["scale longer dimension", 1536, "lanczos"] + }, + { + "id": 14, + "type": "LatentUpscaleModelLoader", + "pos": [452.82236965026885, 683.519747085575], + "size": [376.2368404663082, 58], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { "name": "model_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT_UPSCALE_MODEL", + "name": "LATENT_UPSCALE_MODEL", + "type": "LATENT_UPSCALE_MODEL", + "slot_index": 0, + "links": [36] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LatentUpscaleModelLoader" + }, + "widgets_values": ["ltx-2.3-spatial-upscaler-x2-1.1.safetensors"] + }, + { + "id": 13, + "type": "LoraLoaderModelOnly", + "pos": [451.8815797668459, 542.2302684844991], + "size": [373.4144708160393, 82], + "flags": {}, + "order": 20, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 2 + }, + { + "localized_name": "lora_name", + "name": "lora_name", + "type": "COMBO", + "widget": { "name": "lora_name" }, + "link": null + }, + { + "localized_name": "strength_model", + "name": "strength_model", + "type": "FLOAT", + "widget": { "name": "strength_model" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [22, 25, 44, 47] + } + ], + "title": "Distilled LoRA (both stages)", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LoraLoaderModelOnly" + }, + "widgets_values": ["LTX/ltx-2.3-22b-distilled-lora-384.safetensors", 0.5] + }, + { + "id": 12, + "type": "LTXVAudioVAELoader", + "pos": [450, 400], + "size": 
[369.75658755188215, 58], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { "name": "ckpt_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "Audio VAE", + "name": "Audio VAE", + "type": "VAE", + "slot_index": 0, + "links": [10, 59] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVAudioVAELoader" + }, + "widgets_values": ["ltx-2.3-22b-dev.safetensors"] + }, + { + "id": 11, + "type": "LTXAVTextEncoderLoader", + "pos": [448.1184202331541, 213.86843580322656], + "size": [373.41447081603906, 106], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "text_encoder", + "name": "text_encoder", + "type": "COMBO", + "widget": { "name": "text_encoder" }, + "link": null + }, + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { "name": "ckpt_name" }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "type": "COMBO", + "widget": { "name": "device" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 0, + "links": [3, 4] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXAVTextEncoderLoader" + }, + "widgets_values": [ + "gemma_3_12B_it.safetensors", + "ltx-2.3-22b-dev.safetensors", + "default" + ] + }, + { + "id": 10, + "type": "CheckpointLoaderSimple", + "pos": [445.29605058288524, 31.046066152957746], + "size": [371.63816731872794, 98], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { "name": "ckpt_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [2] + }, + { + "localized_name": "CLIP", + "name": 
"CLIP", + "type": "CLIP", + "slot_index": 1, + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "slot_index": 2, + "links": [14, 21, 26, 37, 38, 48, 57] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CheckpointLoaderSimple" + }, + "widgets_values": ["ltx-2.3-22b-dev.safetensors"] + }, + { + "id": 22, + "type": "LTXVConditioning", + "pos": [999.7237276428352, 409.4078988342295], + "size": [210, 78], + "flags": {}, + "order": 24, + "mode": 0, + "inputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 5 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + "localized_name": "frame_rate", + "name": "frame_rate", + "type": "FLOAT", + "widget": { "name": "frame_rate" }, + "link": 7 + } + ], + "outputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "slot_index": 0, + "links": [23, 45] + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "slot_index": 1, + "links": [24, 46] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVConditioning" + }, + "widgets_values": [24] + }, + { + "id": 31, + "type": "LTXVEmptyLatentAudio", + "pos": [1400.940789883423, 211.9868560363806], + "size": [252.82236965026914, 106], + "flags": {}, + "order": 23, + "mode": 0, + "inputs": [ + { + "localized_name": "audio_vae", + "name": "audio_vae", + "type": "VAE", + "link": 10 + }, + { + "localized_name": "frames_number", + "name": "frames_number", + "type": "INT", + "widget": { "name": "frames_number" }, + "link": 11 + }, + { + "localized_name": "frame_rate", + "name": "frame_rate", + "type": "INT", + "widget": { "name": "frame_rate" }, + "link": 65 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { "name": "batch_size" }, + "link": null 
+ } + ], + "outputs": [ + { + "localized_name": "Latent", + "name": "Latent", + "type": "LATENT", + "slot_index": 0, + "links": [19] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVEmptyLatentAudio" + }, + "widgets_values": [97, 25, 1] + }, + { + "id": 30, + "type": "EmptyLTXVLatentVideo", + "pos": [1400, 0], + "size": [252.82236965026868, 130], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { "name": "width" }, + "link": null + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { "name": "height" }, + "link": null + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { "name": "length" }, + "link": 9 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { "name": "batch_size" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [16] + } + ], + "title": "Stage 1 Empty Latent", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "EmptyLTXVLatentVideo" + }, + "widgets_values": [960, 544, 241, 1] + }, + { + "id": 43, + "type": "CFGGuider", + "pos": [1960.3486887176518, 256.9342179016131], + "size": [250, 98], + "flags": {}, + "order": 26, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 22 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 23 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 24 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { "name": "cfg" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "slot_index": 0, + "links": [30] 
+ } + ], + "title": "Stage 1 Guider", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CFGGuider" + }, + "widgets_values": [1], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 63, + "type": "CFGGuider", + "pos": [2563.9220909318396, 255.9369806251849], + "size": [235.2013751337572, 98], + "flags": {}, + "order": 27, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 44 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 45 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 46 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { "name": "cfg" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "slot_index": 0, + "links": [52] + } + ], + "title": "Stage 2 Guider", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CFGGuider" + }, + "widgets_values": [1], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 62, + "type": "ManualSigmas", + "pos": [3047.723288482116, 178.70409580402023], + "size": [277.23288482116413, 58], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "STRING", + "widget": { "name": "sigmas" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "slot_index": 0, + "links": [51] + } + ], + "title": "Stage 2 Sigmas", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "ManualSigmas" + }, + "widgets_values": ["0.85, 0.7250, 0.4219, 0.0"] + }, + { + "id": 61, + "type": "KSamplerSelect", + "pos": [3050, 69.5972497324864], + "size": [250, 58], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "sampler_name", + "name": 
"sampler_name", + "type": "COMBO", + "widget": { "name": "sampler_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "slot_index": 0, + "links": [50] + } + ], + "title": "Stage 2 Sampler", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "KSamplerSelect" + }, + "widgets_values": ["euler_cfg_pp"] + }, + { + "id": 60, + "type": "RandomNoise", + "pos": [3050, -80], + "size": [210, 82], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { "name": "noise_seed" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "slot_index": 0, + "links": [49] + } + ], + "title": "Stage 2 Noise", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "RandomNoise" + }, + "widgets_values": [43, "fixed"] + }, + { + "id": 74, + "type": "SaveVideo", + "pos": [3975.7575757575723, 1040.9090909090924], + "size": [250, 106], + "flags": {}, + "order": 38, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 63 + }, + { + "localized_name": "filename_prefix", + "name": "filename_prefix", + "type": "STRING", + "widget": { "name": "filename_prefix" }, + "link": null + }, + { + "localized_name": "format", + "name": "format", + "type": "COMBO", + "widget": { "name": "format" }, + "link": null + }, + { + "localized_name": "codec", + "name": "codec", + "type": "COMBO", + "widget": { "name": "codec" }, + "link": null + } + ], + "outputs": [], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "SaveVideo" + }, + "widgets_values": ["LTX-2.3/Looping", "auto", "auto"] + }, + { + "id": 73, + "type": "CreateVideo", + "pos": [3649.9999999999995, 1049.9999999999995], + "size": [243.939393939394, 78], + "flags": {}, + "order": 
37, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 60 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 61 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { "name": "fps" }, + "link": 62 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "slot_index": 0, + "links": [63] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CreateVideo" + }, + "widgets_values": [30] + }, + { + "id": 72, + "type": "LTXVAudioVAEDecode", + "pos": [3643.9393939393935, 899.3939393939382], + "size": [203.00000610351563, 46], + "flags": {}, + "order": 36, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 58 + }, + { + "localized_name": "audio_vae", + "name": "audio_vae", + "type": "VAE", + "link": 59 + } + ], + "outputs": [ + { + "localized_name": "Audio", + "name": "Audio", + "type": "AUDIO", + "slot_index": 0, + "links": [61] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVAudioVAEDecode" + }, + "widgets_values": [] + }, + { + "id": 71, + "type": "LTXVSpatioTemporalTiledVAEDecode", + "pos": [3615.151515151515, 569.393939393939], + "size": [350, 242], + "flags": {}, + "order": 35, + "mode": 0, + "inputs": [ + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 57 }, + { + "localized_name": "latents", + "name": "latents", + "type": "LATENT", + "link": null + }, + { + "localized_name": "spatial_tiles", + "name": "spatial_tiles", + "type": "INT", + "widget": { "name": "spatial_tiles" }, + "link": null + }, + { + "localized_name": "spatial_overlap", + "name": "spatial_overlap", + "type": "INT", + "widget": { "name": "spatial_overlap" }, + "link": null + }, + { + "localized_name": "temporal_tile_length", + "name": 
"temporal_tile_length", + "type": "INT", + "widget": { "name": "temporal_tile_length" }, + "link": null + }, + { + "localized_name": "temporal_overlap", + "name": "temporal_overlap", + "type": "INT", + "widget": { "name": "temporal_overlap" }, + "link": null + }, + { + "localized_name": "last_frame_fix", + "name": "last_frame_fix", + "type": "BOOLEAN", + "widget": { "name": "last_frame_fix" }, + "link": null + }, + { + "localized_name": "working_device", + "name": "working_device", + "type": "COMBO", + "widget": { "name": "working_device" }, + "link": null + }, + { + "localized_name": "working_dtype", + "name": "working_dtype", + "type": "COMBO", + "widget": { "name": "working_dtype" }, + "link": null + }, + { "name": "samples", "type": "LATENT", "link": 56 } + ], + "outputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "slot_index": 0, + "links": [60] + } + ], + "title": "Decode Video (Tiled)", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVSpatioTemporalTiledVAEDecode" + }, + "widgets_values": [6, 4, 16, 4, false, "auto", "auto"] + }, + { + "id": 70, + "type": "LTXVSeparateAVLatent", + "pos": [3600, 400], + "size": [233.33333333333348, 46], + "flags": {}, + "order": 34, + "mode": 0, + "inputs": [ + { + "localized_name": "av_latent", + "name": "av_latent", + "type": "LATENT", + "link": 55 + } + ], + "outputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "slot_index": 0, + "links": [56] + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "slot_index": 1, + "links": [58] + } + ], + "title": "Split Final AV", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVSeparateAVLatent" + }, + "widgets_values": [] + }, + { + "id": 64, + "type": "LTXVLoopingSampler", + "pos": [3073.801984050594, 392.75591789764337], + "size": [400, 580], + 
"flags": {}, + "order": 33, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 47 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 48 }, + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 49 + }, + { + "localized_name": "sampler", + "name": "sampler", + "type": "SAMPLER", + "link": 50 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 51 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 52 + }, + { + "localized_name": "latents", + "name": "latents", + "type": "LATENT", + "link": 53 + }, + { + "localized_name": "optional_cond_images", + "name": "optional_cond_images", + "shape": 7, + "type": "IMAGE", + "link": 54 + }, + { + "localized_name": "optional_guiding_latents", + "name": "optional_guiding_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "optional_positive_conditionings", + "name": "optional_positive_conditionings", + "shape": 7, + "type": "CONDITIONING", + "link": null + }, + { + "localized_name": "optional_negative_index_latents", + "name": "optional_negative_index_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "optional_normalizing_latents", + "name": "optional_normalizing_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "temporal_tile_size", + "name": "temporal_tile_size", + "type": "INT", + "widget": { "name": "temporal_tile_size" }, + "link": null + }, + { + "localized_name": "temporal_overlap", + "name": "temporal_overlap", + "type": "INT", + "widget": { "name": "temporal_overlap" }, + "link": null + }, + { + "localized_name": "guiding_strength", + "name": "guiding_strength", + "type": "FLOAT", + "widget": { "name": "guiding_strength" }, + "link": null + }, + { + "localized_name": "temporal_overlap_cond_strength", + "name": 
"temporal_overlap_cond_strength", + "type": "FLOAT", + "widget": { "name": "temporal_overlap_cond_strength" }, + "link": null + }, + { + "localized_name": "cond_image_strength", + "name": "cond_image_strength", + "type": "FLOAT", + "widget": { "name": "cond_image_strength" }, + "link": null + }, + { + "localized_name": "horizontal_tiles", + "name": "horizontal_tiles", + "type": "INT", + "widget": { "name": "horizontal_tiles" }, + "link": null + }, + { + "localized_name": "vertical_tiles", + "name": "vertical_tiles", + "type": "INT", + "widget": { "name": "vertical_tiles" }, + "link": null + }, + { + "localized_name": "spatial_overlap", + "name": "spatial_overlap", + "type": "INT", + "widget": { "name": "spatial_overlap" }, + "link": null + }, + { + "localized_name": "adain_factor", + "name": "adain_factor", + "shape": 7, + "type": "FLOAT", + "widget": { "name": "adain_factor" }, + "link": null + }, + { + "localized_name": "guiding_start_step", + "name": "guiding_start_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_start_step" }, + "link": null + }, + { + "localized_name": "guiding_end_step", + "name": "guiding_end_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_end_step" }, + "link": null + }, + { + "localized_name": "optional_cond_image_indices", + "name": "optional_cond_image_indices", + "shape": 7, + "type": "STRING", + "widget": { "name": "optional_cond_image_indices" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "slot_index": 0, + "links": [55] + } + ], + "title": "Stage 2 — Refine", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVLoopingSampler" + }, + "widgets_values": [128, 24, 1, 0.5, 1, 2, 1, 1, 0, 0, 1000, "0"], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 53, + "type": "LTXVConcatAVLatent", + "pos": [2807.97697623735, 
524.995187859747], + "size": [190.80550053502748, 46], + "flags": {}, + "order": 32, + "mode": 0, + "inputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "link": 42 + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "link": 43 + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [53] + } + ], + "title": "Stage 2 AV Concat", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVConcatAVLatent" + }, + "widgets_values": [], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 51, + "type": "LTXVLatentUpsampler", + "pos": [2494.204734318114, 557.0100775530725], + "size": [249.9123466065612, 66], + "flags": {}, + "order": 30, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 35 + }, + { + "localized_name": "upscale_model", + "name": "upscale_model", + "type": "LATENT_UPSCALE_MODEL", + "link": 36 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 37 } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [40] + } + ], + "title": "Spatial Upscale 2x", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVLatentUpsampler" + }, + "widgets_values": [] + }, + { + "id": 50, + "type": "LTXVSeparateAVLatent", + "pos": [2429.214969171252, 400.10348688717687], + "size": [172.5918083919587, 46], + "flags": {}, + "order": 29, + "mode": 0, + "inputs": [ + { + "localized_name": "av_latent", + "name": "av_latent", + "type": "LATENT", + "link": 34 + } + ], + "outputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "slot_index": 0, + "links": [35] + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "slot_index": 
1, + "links": [43] + } + ], + "title": "Split Stage 1 AV", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVSeparateAVLatent" + }, + "widgets_values": [] + }, + { + "id": 33, + "type": "LTXVConcatAVLatent", + "pos": [1435.7500155700718, 795.296050582885], + "size": [174.92496730284756, 46], + "flags": {}, + "order": 25, + "mode": 0, + "inputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "link": 18 + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "link": 19 + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [31] + } + ], + "title": "Stage 1 AV Concat", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVConcatAVLatent" + }, + "widgets_values": [], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 35, + "type": "VAEEncode", + "pos": [1383.3333333333328, 1164.9999999999995], + "size": [206.36665954589844, 46], + "flags": {}, + "order": 21, + "mode": 0, + "inputs": [ + { + "localized_name": "pixels", + "name": "pixels", + "type": "IMAGE", + "link": 20 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 21 } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [33] + } + ], + "title": "Encode Reference Latent", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "VAEEncode" + }, + "widgets_values": [] + }, + { + "id": 32, + "type": "LTXVImgToVideoConditionOnly", + "pos": [1399.999999999999, 604.9999999999992], + "size": [210, 122], + "flags": {}, + "order": 22, + "mode": 0, + "inputs": [ + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 14 }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 15 + }, + { + "localized_name": "latent", + "name": 
"latent", + "type": "LATENT", + "link": 16 + }, + { + "localized_name": "strength", + "name": "strength", + "type": "FLOAT", + "widget": { "name": "strength" }, + "link": null + }, + { + "localized_name": "bypass", + "name": "bypass", + "shape": 7, + "type": "BOOLEAN", + "widget": { "name": "bypass" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [18] + } + ], + "title": "Stage 1 I2V Cond", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVImgToVideoConditionOnly" + }, + "widgets_values": [0.7, false], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 52, + "type": "LTXVImgToVideoConditionOnly", + "pos": [2492.238483461759, 791.1178860526603], + "size": [210, 122], + "flags": {}, + "order": 31, + "mode": 0, + "inputs": [ + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 38 }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 39 + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "link": 40 + }, + { + "localized_name": "strength", + "name": "strength", + "type": "FLOAT", + "widget": { "name": "strength" }, + "link": null + }, + { + "localized_name": "bypass", + "name": "bypass", + "shape": 7, + "type": "BOOLEAN", + "widget": { "name": "bypass" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [42] + } + ], + "title": "Stage 2 I2V Cond", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVImgToVideoConditionOnly" + }, + "widgets_values": [1, false], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 6, + "type": "Note", + "pos": [281.1738724586202, 1016.7247228103745], + "size": [631.0862190651818, 273.1698654463494], + 
"flags": {}, + "order": 13, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { "Node name for S&R": "Note" }, + "widgets_values": [ + "## Guiding Image Indices\n\nSet `optional_cond_image_indices` on the Stage 1 Looping Sampler.\nDefault: \"0\" (reference image at first frame only).\n\nFor multi-tile conditioning, set indices at tile boundaries.\nWith tile_size=128, overlap=24, new content starts every 104 frames:\n \"0, 104, 208\" for a 241-frame (3-tile) clip.\n\nThe number of images in the guiding batch must match the indices.\nUse LatentBatch or ImageBatch to provide multiple images.\nBy default, a single reference image at index 0 is used." + ], + "color": "#432", + "bgcolor": "#653" + } + ], + "links": [ + [1, 1, 0, 2, 0, "IMAGE"], + [2, 10, 0, 13, 0, "MODEL"], + [3, 11, 0, 20, 0, "CLIP"], + [4, 11, 0, 21, 0, "CLIP"], + [5, 20, 0, 22, 0, "CONDITIONING"], + [6, 21, 0, 22, 1, "CONDITIONING"], + [7, 4, 0, 22, 2, "FLOAT"], + [8, 1, 0, 23, 0, "IMAGE"], + [9, 3, 0, 30, 2, "INT"], + [10, 12, 0, 31, 0, "VAE"], + [11, 3, 0, 31, 1, "INT"], + [14, 10, 2, 32, 0, "VAE"], + [15, 2, 0, 32, 1, "IMAGE"], + [16, 30, 0, 32, 2, "LATENT"], + [18, 32, 0, 33, 0, "LATENT"], + [19, 31, 0, 33, 1, "LATENT"], + [20, 2, 0, 35, 0, "IMAGE"], + [21, 10, 2, 35, 1, "VAE"], + [22, 13, 0, 43, 0, "MODEL"], + [23, 22, 0, 43, 1, "CONDITIONING"], + [24, 22, 1, 43, 2, "CONDITIONING"], + [25, 13, 0, 44, 0, "MODEL"], + [26, 10, 2, 44, 1, "VAE"], + [27, 40, 0, 44, 2, "NOISE"], + [28, 41, 0, 44, 3, "SAMPLER"], + [29, 42, 0, 44, 4, "SIGMAS"], + [30, 43, 0, 44, 5, "GUIDER"], + [31, 33, 0, 44, 6, "LATENT"], + [32, 2, 0, 44, 7, "IMAGE"], + [33, 35, 0, 44, 10, "LATENT"], + [34, 44, 0, 50, 0, "LATENT"], + [35, 50, 0, 51, 0, "LATENT"], + [36, 14, 0, 51, 1, "LATENT_UPSCALE_MODEL"], + [37, 10, 2, 51, 2, "VAE"], + [38, 10, 2, 52, 0, "VAE"], + [39, 23, 0, 52, 1, "IMAGE"], + [40, 51, 0, 52, 2, "LATENT"], + [42, 52, 0, 53, 0, "LATENT"], + [43, 50, 1, 53, 1, "LATENT"], + [44, 13, 0, 63, 0, "MODEL"], + 
[45, 22, 0, 63, 1, "CONDITIONING"], + [46, 22, 1, 63, 2, "CONDITIONING"], + [47, 13, 0, 64, 0, "MODEL"], + [48, 10, 2, 64, 1, "VAE"], + [49, 60, 0, 64, 2, "NOISE"], + [50, 61, 0, 64, 3, "SAMPLER"], + [51, 62, 0, 64, 4, "SIGMAS"], + [52, 63, 0, 64, 5, "GUIDER"], + [53, 53, 0, 64, 6, "LATENT"], + [54, 23, 0, 64, 7, "IMAGE"], + [55, 64, 0, 70, 0, "LATENT"], + [56, 70, 0, 71, 9, "LATENT"], + [57, 10, 2, 71, 0, "VAE"], + [58, 70, 1, 72, 0, "LATENT"], + [59, 12, 0, 72, 1, "VAE"], + [60, 71, 0, 73, 0, "IMAGE"], + [61, 72, 0, 73, 1, "AUDIO"], + [62, 4, 0, 73, 2, "FLOAT"], + [63, 73, 0, 74, 0, "VIDEO"], + [64, 4, 0, 75, 0, "FLOAT"], + [65, 75, 0, 31, 2, "INT"] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.878460000000002, + "offset": [-147.93391628437732, 99.35113851569274] + }, + "info": { + "name": "LTX-2.3 Two-Pass I2V Looping", + "description": "Two-pass I2V workflow for arbitrary-length video. Stage 1 generates at base resolution with temporal tiling. Stage 2 spatially upscales and refines. Soft guiding images at tile boundaries maintain subject continuity." + } + }, + "version": 0.4 +} diff --git a/example_workflows/LTX-2.3_Two_Pass_I2V_Looping.md b/example_workflows/LTX-2.3_Two_Pass_I2V_Looping.md new file mode 100644 index 0000000..b777fe1 --- /dev/null +++ b/example_workflows/LTX-2.3_Two_Pass_I2V_Looping.md @@ -0,0 +1,180 @@ +# LTX-2.3 Two-Pass I2V Looping — Arbitrary-Length Video + +## Overview + +Two-pass image-to-video workflow for generating videos of any duration using +LTX-2.3 (22B) with `LTXVLoopingSampler`. A batch of soft guiding images at +tile boundaries maintains subject and scene continuity across temporal tiles. + +**Stage 1** generates video+audio at base resolution (~544p) with temporal +tiling. +**Stage 2** spatially upscales 2x and refines at high resolution with the +same prompts. 
+ +**Model:** `ltx-2.3-22b-dev.safetensors` + distilled LoRA (0.5 strength) +**Text encoder:** Gemma 3 12B +**No detailer LoRA required** (none exists for 2.3) + +--- + +## Data Flow + +``` +LoadImage (reference) + | + +-- LTXVPreprocess --> Stage 1 I2V Cond --> Stage 1 AV Concat + | | + +-- VAEEncode (negative_index_latents) LTXVLoopingSampler (Stage 1) + | | + +-- ResizeImage (for stage 2) LTXVSeparateAVLatent + | | + LTXVLatentUpsampler | + | | + Stage 2 I2V Cond | + | | + Stage 2 AV Concat------+ + | + LTXVLoopingSampler (Stage 2) + (AV refinement) + | + LTXVSeparateAVLatent + | | + VAEDecodeTiled AudioVAEDecode + | | + CreateVideo------+ + | + SaveVideo +``` + +**Audio refinement:** Both stages process AV latents jointly. Stage 1 +generates video and audio from scratch. Stage 2 receives the upscaled video ++ stage 1 audio as an AV latent and refines both together — the looping +sampler initializes each tile's audio from the input audio data (not zeros), +so the model refines lipsync and audio-visual coherence at the higher +resolution. This matches the behaviour of the standard two-stage workflow +using `SamplerCustomAdvanced`. 
+ +--- + +## Key Parameters + +### Stage 1 — Generate + +| Parameter | Value | Notes | +|---|---|---| +| Resolution | 960x544 | Base res; 2x upscale yields ~1920x1088 | +| temporal_tile_size | 128 | Pixel frames per tile | +| temporal_overlap | 24 | Overlap between tiles | +| temporal_overlap_cond_strength | 0.5 | How strongly previous tile conditions next | +| cond_image_strength | 1.0 | Guiding image influence | +| adain_factor | 0.15 | Prevents color drift across tiles | +| horizontal_tiles / vertical_tiles | 1 / 1 | No spatial tiling at 544p | +| Sigmas | `1.0, 0.994, 0.988, 0.981, 0.975, 0.909, 0.725, 0.422, 0.0` | Distilled schedule | +| Sampler | euler_ancestral_cfg_pp | Good for generation | +| CFG | 1 | With distilled LoRA | + +### Stage 2 — Refine + +| Parameter | Value | Notes | +|---|---|---| +| Resolution | ~1920x1088 | 2x from stage 1 | +| temporal_tile_size | 128 | Same as stage 1 | +| temporal_overlap | 24 | Same | +| horizontal_tiles / vertical_tiles | 2 / 1 | Spatial tiling for memory | +| adain_factor | 0.0 | Not needed for refinement | +| Sigmas | `0.85, 0.725, 0.422, 0.0` | Low — refinement only | +| Sampler | euler_cfg_pp | Deterministic for refinement | +| CFG | 1 | With distilled LoRA | + +--- + +## Guiding Images + +### Default: Reference image at frame 0 + +By default, `optional_cond_images` is connected to the preprocessed reference +image and `optional_cond_image_indices` is set to `"0"`. This provides soft +I2V conditioning at the first frame only. + +For global subject anchoring across ALL tiles, `optional_negative_index_latents` +is connected to the VAE-encoded reference image. This attaches the reference +with negative positional embeddings to every tile, providing identity context +without pinning a specific frame position. + +### Transition images at tile boundaries + +To guide content at specific points in the video: + +1. Batch multiple images using `ImageBatch` (or any node that produces an + IMAGE batch). +2. 
Set `optional_cond_image_indices` to the pixel frame positions where each + image should appear, e.g. `"0, 104, 208"`. +3. Connect the batch to `optional_cond_images`. + +**The number of images must match the number of indices.** + +Frame positions for tile boundaries with `tile_size=128, overlap=24`: + +| Tiles | Total frames | Indices | +|---|---|---| +| 2 | 233 | `0, 104` | +| 3 | 337 | `0, 104, 208` | +| 4 | 441 | `0, 104, 208, 312` | +| N | 128 + (N-1)*104 + 1 | `0, 104, 208, ..., (N-1)*104` | + +Frame indices must be divisible by 8 (except 0). The formula for new content +start per tile is: `tile_size - overlap = 128 - 24 = 104`. + +### Per-tile prompts + +Connect `LTXVMultiPromptProvider` to `optional_positive_conditionings` on the +Stage 1 looping sampler. Prompts are separated by `|`: + +``` +A woman walks through a meadow | She reaches a stream | She crosses a bridge +``` + +Each prompt maps to one temporal tile. If more tiles than prompts, the last +prompt repeats. Use the same provider for Stage 2 to keep prompts aligned. + +--- + +## Duration and Tile Count + +| Duration (24fps) | Pixel frames | Tiles (128/24) | Est. time (Strix Halo) | +|---|---|---|---| +| 5 sec | 121 | 1 | ~5 min | +| 10 sec | 241 | 3 | ~15 min | +| 30 sec | 721 | 7 | ~45 min | +| 1 min | 1441 | 14 | ~1.5 hr | +| 5 min | 7201 | 69 | ~7 hr | + +Frame count must satisfy `8n+1` (e.g. 121, 241, 361...). +Times are rough estimates for both stages combined on Strix Halo 128GB. + +--- + +## Strix Halo / Unified Memory Notes + +See `LTX-2_V2V_Detailer.md` for full Strix Halo tuning. + +- Stage 1 at 544p with 1x1 spatial tiles fits easily. +- Stage 2 at ~1088p needs 2x1 spatial tiling (set in the workflow). + Increase to 2x2 if OOM occurs. +- Add `LTXVChunkFeedForward` (from KJNodes) between the LoRA loader and + the guiders if stage 2 still OOMs. Set `chunks=2, dim_threshold=4096`. +- VAE decode uses `LTXVSpatioTemporalTiledVAEDecode` with `spatial_tiles=6`. + Increase to 8 if needed. 
+ +--- + +## Regenerating the Workflow + +The workflow JSON is generated by the companion script: + +```bash +cd custom_nodes/ComfyUI-LTXVideo/example_workflows +python generate_two_pass_i2v_looping.py +``` + +Edit the script to change default parameters, add nodes, or adjust layout. diff --git a/example_workflows/LTX-2.3_Two_Pass_I2V_Looping_30s.json b/example_workflows/LTX-2.3_Two_Pass_I2V_Looping_30s.json new file mode 100644 index 0000000..551d47b --- /dev/null +++ b/example_workflows/LTX-2.3_Two_Pass_I2V_Looping_30s.json @@ -0,0 +1,2170 @@ +{ + "id": "6442f6ec-19f9-4ded-93a2-00c286be6dab", + "revision": 0, + "last_node_id": 82, + "last_link_id": 72, + "nodes": [ + { + "id": 1, + "type": "LoadImage", + "pos": [0, 0], + "size": [300, 300], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "COMBO", + "widget": { "name": "image" }, + "link": null + }, + { + "localized_name": "choose file to upload", + "name": "upload", + "type": "IMAGEUPLOAD", + "widget": { "name": "upload" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [1, 8] + }, + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "slot_index": 1, + "links": [] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LoadImage" + }, + "widgets_values": ["reference_image.png", "image"] + }, + { + "id": 3, + "type": "PrimitiveInt", + "pos": [0, 800], + "size": [210, 100], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { "name": "value" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "slot_index": 0, + "links": [9, 11] + } + ], + "title": "Frame Count", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": 
"PrimitiveInt" + }, + "widgets_values": [713, "fixed"] + }, + { + "id": 20, + "type": "CLIPTextEncode", + "pos": [900, 0], + "size": [400, 180], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { "localized_name": "clip", "name": "clip", "type": "CLIP", "link": 3 }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { "name": "text" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [5] + } + ], + "title": "Positive Prompt", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "A beautiful woman in a flowing dress walks through a sunlit garden on a warm summer day. Soft natural lighting, cinematic composition, gentle breeze." + ] + }, + { + "id": 21, + "type": "CLIPTextEncode", + "pos": [900, 220], + "size": [400, 120], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { "localized_name": "clip", "name": "clip", "type": "CLIP", "link": 4 }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { "name": "text" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [6] + } + ], + "title": "Negative Prompt", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "pc game, console game, video game, cartoon, childish, ugly, blurry" + ] + }, + { + "id": 40, + "type": "RandomNoise", + "pos": [1950, -80], + "size": [210, 100], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { "name": "noise_seed" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "slot_index": 0, + 
"links": [27] + } + ], + "title": "Stage 1 Noise", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "RandomNoise" + }, + "widgets_values": [42, "fixed"] + }, + { + "id": 41, + "type": "KSamplerSelect", + "pos": [1950, 40], + "size": [250, 80], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { "name": "sampler_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "slot_index": 0, + "links": [28] + } + ], + "title": "Stage 1 Sampler", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "KSamplerSelect" + }, + "widgets_values": ["euler_ancestral_cfg_pp"] + }, + { + "id": 42, + "type": "ManualSigmas", + "pos": [1950, 140], + "size": [350, 80], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "STRING", + "widget": { "name": "sigmas" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "slot_index": 0, + "links": [29] + } + ], + "title": "Stage 1 Sigmas", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "ManualSigmas" + }, + "widgets_values": [ + "1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0" + ] + }, + { + "id": 44, + "type": "LTXVLoopingSampler", + "pos": [1950, 400], + "size": [400, 580], + "flags": {}, + "order": 28, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 25 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 26 }, + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 27 + }, + { + "localized_name": "sampler", + "name": "sampler", + "type": "SAMPLER", + "link": 28 + }, + { + "localized_name": "sigmas", + "name": 
"sigmas", + "type": "SIGMAS", + "link": 29 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 30 + }, + { + "localized_name": "latents", + "name": "latents", + "type": "LATENT", + "link": 31 + }, + { + "localized_name": "optional_cond_images", + "name": "optional_cond_images", + "shape": 7, + "type": "IMAGE", + "link": 67 + }, + { + "localized_name": "optional_guiding_latents", + "name": "optional_guiding_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "optional_positive_conditionings", + "name": "optional_positive_conditionings", + "shape": 7, + "type": "CONDITIONING", + "link": 71 + }, + { + "localized_name": "optional_negative_index_latents", + "name": "optional_negative_index_latents", + "shape": 7, + "type": "LATENT", + "link": 33 + }, + { + "localized_name": "optional_normalizing_latents", + "name": "optional_normalizing_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "temporal_tile_size", + "name": "temporal_tile_size", + "type": "INT", + "widget": { "name": "temporal_tile_size" }, + "link": null + }, + { + "localized_name": "temporal_overlap", + "name": "temporal_overlap", + "type": "INT", + "widget": { "name": "temporal_overlap" }, + "link": null + }, + { + "localized_name": "guiding_strength", + "name": "guiding_strength", + "type": "FLOAT", + "widget": { "name": "guiding_strength" }, + "link": null + }, + { + "localized_name": "temporal_overlap_cond_strength", + "name": "temporal_overlap_cond_strength", + "type": "FLOAT", + "widget": { "name": "temporal_overlap_cond_strength" }, + "link": null + }, + { + "localized_name": "cond_image_strength", + "name": "cond_image_strength", + "type": "FLOAT", + "widget": { "name": "cond_image_strength" }, + "link": null + }, + { + "localized_name": "horizontal_tiles", + "name": "horizontal_tiles", + "type": "INT", + "widget": { "name": "horizontal_tiles" }, + "link": null + }, + { + "localized_name": 
"vertical_tiles", + "name": "vertical_tiles", + "type": "INT", + "widget": { "name": "vertical_tiles" }, + "link": null + }, + { + "localized_name": "spatial_overlap", + "name": "spatial_overlap", + "type": "INT", + "widget": { "name": "spatial_overlap" }, + "link": null + }, + { + "localized_name": "adain_factor", + "name": "adain_factor", + "shape": 7, + "type": "FLOAT", + "widget": { "name": "adain_factor" }, + "link": null + }, + { + "localized_name": "guiding_start_step", + "name": "guiding_start_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_start_step" }, + "link": null + }, + { + "localized_name": "guiding_end_step", + "name": "guiding_end_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_end_step" }, + "link": null + }, + { + "localized_name": "optional_cond_image_indices", + "name": "optional_cond_image_indices", + "shape": 7, + "type": "STRING", + "widget": { "name": "optional_cond_image_indices" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "slot_index": 0, + "links": [34] + } + ], + "title": "Stage 1 \u2014 Generate", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVLoopingSampler" + }, + "widgets_values": [ + 264, + 24, + 1, + 0.5, + 1, + 1, + 1, + 1, + 0.15, + 0, + 1000, + "0, 240, 480" + ], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 4, + "type": "PrimitiveFloat", + "pos": [0, 930], + "size": [210, 100], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { "name": "value" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "slot_index": 0, + "links": [7, 62, 64] + } + ], + "title": "Frame Rate", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for 
S&R": "PrimitiveFloat" + }, + "widgets_values": [24] + }, + { + "id": 75, + "type": "FloatToInt", + "pos": [991.6666666666669, 928.3333333333287], + "size": [270, 82], + "flags": { "collapsed": true }, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "float_value", + "name": "float_value", + "type": "FLOAT", + "widget": { "name": "float_value" }, + "link": 64 + }, + { + "localized_name": "rounding_mode", + "name": "rounding_mode", + "type": "COMBO", + "widget": { "name": "rounding_mode" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "int_value", + "name": "int_value", + "type": "INT", + "links": [65] + } + ], + "properties": { + "aux_id": "danTheMonk/comfyui-int-and-float", + "ver": "a8b5a383ec6b5cff43c2f81a9a3aa24b87c4c720", + "Node name for S&R": "FloatToInt" + }, + "widgets_values": [0, "down (floor)"] + }, + { + "id": 2, + "type": "LTXVPreprocess", + "pos": [11.666666666666666, 355.00000000000017], + "size": [220, 58], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1 + }, + { + "localized_name": "img_compression", + "name": "img_compression", + "type": "INT", + "widget": { "name": "img_compression" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "output_image", + "name": "output_image", + "type": "IMAGE", + "slot_index": 0, + "links": [15, 20, 66] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVPreprocess" + }, + "widgets_values": [18] + }, + { + "id": 23, + "type": "ResizeImageMaskNode", + "pos": [8.333333333333284, 505.00000000000085], + "size": [300, 106], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 8 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { "name": "resize_type" }, + "link": null + }, + { + 
"localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { "name": "resize_type.longer_size" }, + "link": null + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { "name": "scale_method" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "IMAGE", + "slot_index": 0, + "links": [39, 68] + } + ], + "title": "Resize Reference", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "ResizeImageMaskNode" + }, + "widgets_values": ["scale longer dimension", 1536, "lanczos"] + }, + { + "id": 14, + "type": "LatentUpscaleModelLoader", + "pos": [452.82236965026885, 683.519747085575], + "size": [376.2368404663082, 58], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { "name": "model_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT_UPSCALE_MODEL", + "name": "LATENT_UPSCALE_MODEL", + "type": "LATENT_UPSCALE_MODEL", + "slot_index": 0, + "links": [36] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LatentUpscaleModelLoader" + }, + "widgets_values": ["ltx-2.3-spatial-upscaler-x2-1.1.safetensors"] + }, + { + "id": 13, + "type": "LoraLoaderModelOnly", + "pos": [451.8815797668459, 542.2302684844991], + "size": [373.4144708160393, 82], + "flags": {}, + "order": 20, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 2 + }, + { + "localized_name": "lora_name", + "name": "lora_name", + "type": "COMBO", + "widget": { "name": "lora_name" }, + "link": null + }, + { + "localized_name": "strength_model", + "name": "strength_model", + "type": "FLOAT", + "widget": { "name": "strength_model" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": 
"MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [22, 25, 44, 47] + } + ], + "title": "Distilled LoRA (both stages)", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LoraLoaderModelOnly" + }, + "widgets_values": ["LTX/ltx-2.3-22b-distilled-lora-384.safetensors", 0.5] + }, + { + "id": 12, + "type": "LTXVAudioVAELoader", + "pos": [450, 400], + "size": [369.75658755188215, 58], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { "name": "ckpt_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "Audio VAE", + "name": "Audio VAE", + "type": "VAE", + "slot_index": 0, + "links": [10, 59] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVAudioVAELoader" + }, + "widgets_values": ["ltx-2.3-22b-dev.safetensors"] + }, + { + "id": 11, + "type": "LTXAVTextEncoderLoader", + "pos": [448.1184202331541, 213.86843580322656], + "size": [373.41447081603906, 106], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "text_encoder", + "name": "text_encoder", + "type": "COMBO", + "widget": { "name": "text_encoder" }, + "link": null + }, + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { "name": "ckpt_name" }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "type": "COMBO", + "widget": { "name": "device" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 0, + "links": [3, 4, 70] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXAVTextEncoderLoader" + }, + "widgets_values": [ + "gemma_3_12B_it.safetensors", + "ltx-2.3-22b-dev.safetensors", + "default" + ] + }, + { + "id": 10, + "type": "CheckpointLoaderSimple", + "pos": [445.29605058288524, 31.046066152957746], + 
"size": [371.63816731872794, 98], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { "name": "ckpt_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [2] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 1, + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "slot_index": 2, + "links": [14, 21, 26, 37, 38, 48, 57] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CheckpointLoaderSimple" + }, + "widgets_values": ["ltx-2.3-22b-dev.safetensors"] + }, + { + "id": 22, + "type": "LTXVConditioning", + "pos": [999.7237276428352, 409.4078988342295], + "size": [210, 78], + "flags": {}, + "order": 24, + "mode": 0, + "inputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 5 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + "localized_name": "frame_rate", + "name": "frame_rate", + "type": "FLOAT", + "widget": { "name": "frame_rate" }, + "link": 7 + } + ], + "outputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "slot_index": 0, + "links": [23, 45] + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "slot_index": 1, + "links": [24, 46] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVConditioning" + }, + "widgets_values": [24] + }, + { + "id": 31, + "type": "LTXVEmptyLatentAudio", + "pos": [1400.940789883423, 211.9868560363806], + "size": [252.82236965026914, 106], + "flags": {}, + "order": 23, + "mode": 0, + "inputs": [ + { + "localized_name": "audio_vae", + "name": "audio_vae", + "type": "VAE", + "link": 10 + }, + { + 
"localized_name": "frames_number", + "name": "frames_number", + "type": "INT", + "widget": { "name": "frames_number" }, + "link": 11 + }, + { + "localized_name": "frame_rate", + "name": "frame_rate", + "type": "INT", + "widget": { "name": "frame_rate" }, + "link": 65 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { "name": "batch_size" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "Latent", + "name": "Latent", + "type": "LATENT", + "slot_index": 0, + "links": [19] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVEmptyLatentAudio" + }, + "widgets_values": [97, 25, 1] + }, + { + "id": 30, + "type": "EmptyLTXVLatentVideo", + "pos": [1400, 0], + "size": [252.82236965026868, 130], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { "name": "width" }, + "link": null + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { "name": "height" }, + "link": null + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { "name": "length" }, + "link": 9 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { "name": "batch_size" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [16] + } + ], + "title": "Stage 1 Empty Latent", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "EmptyLTXVLatentVideo" + }, + "widgets_values": [960, 544, 713, 1] + }, + { + "id": 43, + "type": "CFGGuider", + "pos": [1960.3486887176518, 256.9342179016131], + "size": [250, 98], + "flags": {}, + "order": 26, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 22 + }, + { + "localized_name": "positive", + "name": 
"positive", + "type": "CONDITIONING", + "link": 23 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 24 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { "name": "cfg" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "slot_index": 0, + "links": [30] + } + ], + "title": "Stage 1 Guider", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CFGGuider" + }, + "widgets_values": [1], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 63, + "type": "CFGGuider", + "pos": [2563.9220909318396, 255.9369806251849], + "size": [235.2013751337572, 98], + "flags": {}, + "order": 27, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 44 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 45 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 46 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { "name": "cfg" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "slot_index": 0, + "links": [52] + } + ], + "title": "Stage 2 Guider", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CFGGuider" + }, + "widgets_values": [1], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 62, + "type": "ManualSigmas", + "pos": [3047.723288482116, 178.70409580402023], + "size": [277.23288482116413, 58], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "STRING", + "widget": { "name": "sigmas" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "slot_index": 0, + "links": [51] + 
} + ], + "title": "Stage 2 Sigmas", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "ManualSigmas" + }, + "widgets_values": ["0.85, 0.7250, 0.4219, 0.0"] + }, + { + "id": 61, + "type": "KSamplerSelect", + "pos": [3050, 69.5972497324864], + "size": [250, 58], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { "name": "sampler_name" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "slot_index": 0, + "links": [50] + } + ], + "title": "Stage 2 Sampler", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "KSamplerSelect" + }, + "widgets_values": ["euler_cfg_pp"] + }, + { + "id": 60, + "type": "RandomNoise", + "pos": [3050, -80], + "size": [210, 82], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { "name": "noise_seed" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "slot_index": 0, + "links": [49] + } + ], + "title": "Stage 2 Noise", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "RandomNoise" + }, + "widgets_values": [43, "fixed"] + }, + { + "id": 74, + "type": "SaveVideo", + "pos": [3975.7575757575723, 1040.9090909090924], + "size": [250, 106], + "flags": {}, + "order": 38, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 63 + }, + { + "localized_name": "filename_prefix", + "name": "filename_prefix", + "type": "STRING", + "widget": { "name": "filename_prefix" }, + "link": null + }, + { + "localized_name": "format", + "name": "format", + "type": "COMBO", + "widget": { "name": "format" }, + "link": null + }, + { + "localized_name": "codec", + "name": "codec", + 
"type": "COMBO", + "widget": { "name": "codec" }, + "link": null + } + ], + "outputs": [], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "SaveVideo" + }, + "widgets_values": ["LTX-2.3/Looping", "auto", "auto"] + }, + { + "id": 73, + "type": "CreateVideo", + "pos": [3649.9999999999995, 1049.9999999999995], + "size": [243.939393939394, 78], + "flags": {}, + "order": 37, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 60 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 61 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { "name": "fps" }, + "link": 62 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "slot_index": 0, + "links": [63] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "CreateVideo" + }, + "widgets_values": [30] + }, + { + "id": 72, + "type": "LTXVAudioVAEDecode", + "pos": [3643.9393939393935, 899.3939393939382], + "size": [203.00000610351563, 46], + "flags": {}, + "order": 36, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 58 + }, + { + "localized_name": "audio_vae", + "name": "audio_vae", + "type": "VAE", + "link": 59 + } + ], + "outputs": [ + { + "localized_name": "Audio", + "name": "Audio", + "type": "AUDIO", + "slot_index": 0, + "links": [61] + } + ], + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVAudioVAEDecode" + }, + "widgets_values": [] + }, + { + "id": 71, + "type": "LTXVSpatioTemporalTiledVAEDecode", + "pos": [3615.151515151515, 569.393939393939], + "size": [350, 242], + "flags": {}, + "order": 35, + "mode": 0, + "inputs": [ + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 57 }, + { + "localized_name": "latents", + "name": "latents", + "type": 
"LATENT", + "link": null + }, + { + "localized_name": "spatial_tiles", + "name": "spatial_tiles", + "type": "INT", + "widget": { "name": "spatial_tiles" }, + "link": null + }, + { + "localized_name": "spatial_overlap", + "name": "spatial_overlap", + "type": "INT", + "widget": { "name": "spatial_overlap" }, + "link": null + }, + { + "localized_name": "temporal_tile_length", + "name": "temporal_tile_length", + "type": "INT", + "widget": { "name": "temporal_tile_length" }, + "link": null + }, + { + "localized_name": "temporal_overlap", + "name": "temporal_overlap", + "type": "INT", + "widget": { "name": "temporal_overlap" }, + "link": null + }, + { + "localized_name": "last_frame_fix", + "name": "last_frame_fix", + "type": "BOOLEAN", + "widget": { "name": "last_frame_fix" }, + "link": null + }, + { + "localized_name": "working_device", + "name": "working_device", + "type": "COMBO", + "widget": { "name": "working_device" }, + "link": null + }, + { + "localized_name": "working_dtype", + "name": "working_dtype", + "type": "COMBO", + "widget": { "name": "working_dtype" }, + "link": null + }, + { "name": "samples", "type": "LATENT", "link": 56 } + ], + "outputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "slot_index": 0, + "links": [60] + } + ], + "title": "Decode Video (Tiled)", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVSpatioTemporalTiledVAEDecode" + }, + "widgets_values": [6, 4, 16, 4, false, "auto", "auto"] + }, + { + "id": 70, + "type": "LTXVSeparateAVLatent", + "pos": [3600, 400], + "size": [233.33333333333348, 46], + "flags": {}, + "order": 34, + "mode": 0, + "inputs": [ + { + "localized_name": "av_latent", + "name": "av_latent", + "type": "LATENT", + "link": 55 + } + ], + "outputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "slot_index": 0, + "links": [56] + }, + { + "localized_name": 
"audio_latent", + "name": "audio_latent", + "type": "LATENT", + "slot_index": 1, + "links": [58] + } + ], + "title": "Split Final AV", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVSeparateAVLatent" + }, + "widgets_values": [] + }, + { + "id": 64, + "type": "LTXVLoopingSampler", + "pos": [3073.801984050594, 392.75591789764337], + "size": [400, 580], + "flags": {}, + "order": 33, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 47 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 48 }, + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 49 + }, + { + "localized_name": "sampler", + "name": "sampler", + "type": "SAMPLER", + "link": 50 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 51 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 52 + }, + { + "localized_name": "latents", + "name": "latents", + "type": "LATENT", + "link": 53 + }, + { + "localized_name": "optional_cond_images", + "name": "optional_cond_images", + "shape": 7, + "type": "IMAGE", + "link": 69 + }, + { + "localized_name": "optional_guiding_latents", + "name": "optional_guiding_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "optional_positive_conditionings", + "name": "optional_positive_conditionings", + "shape": 7, + "type": "CONDITIONING", + "link": 72 + }, + { + "localized_name": "optional_negative_index_latents", + "name": "optional_negative_index_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "optional_normalizing_latents", + "name": "optional_normalizing_latents", + "shape": 7, + "type": "LATENT", + "link": null + }, + { + "localized_name": "temporal_tile_size", + "name": "temporal_tile_size", + "type": "INT", + "widget": { "name": "temporal_tile_size" }, + "link": null + }, + { + 
"localized_name": "temporal_overlap", + "name": "temporal_overlap", + "type": "INT", + "widget": { "name": "temporal_overlap" }, + "link": null + }, + { + "localized_name": "guiding_strength", + "name": "guiding_strength", + "type": "FLOAT", + "widget": { "name": "guiding_strength" }, + "link": null + }, + { + "localized_name": "temporal_overlap_cond_strength", + "name": "temporal_overlap_cond_strength", + "type": "FLOAT", + "widget": { "name": "temporal_overlap_cond_strength" }, + "link": null + }, + { + "localized_name": "cond_image_strength", + "name": "cond_image_strength", + "type": "FLOAT", + "widget": { "name": "cond_image_strength" }, + "link": null + }, + { + "localized_name": "horizontal_tiles", + "name": "horizontal_tiles", + "type": "INT", + "widget": { "name": "horizontal_tiles" }, + "link": null + }, + { + "localized_name": "vertical_tiles", + "name": "vertical_tiles", + "type": "INT", + "widget": { "name": "vertical_tiles" }, + "link": null + }, + { + "localized_name": "spatial_overlap", + "name": "spatial_overlap", + "type": "INT", + "widget": { "name": "spatial_overlap" }, + "link": null + }, + { + "localized_name": "adain_factor", + "name": "adain_factor", + "shape": 7, + "type": "FLOAT", + "widget": { "name": "adain_factor" }, + "link": null + }, + { + "localized_name": "guiding_start_step", + "name": "guiding_start_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_start_step" }, + "link": null + }, + { + "localized_name": "guiding_end_step", + "name": "guiding_end_step", + "shape": 7, + "type": "INT", + "widget": { "name": "guiding_end_step" }, + "link": null + }, + { + "localized_name": "optional_cond_image_indices", + "name": "optional_cond_image_indices", + "shape": 7, + "type": "STRING", + "widget": { "name": "optional_cond_image_indices" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "slot_index": 0, + "links": [55] + } + ], + 
"title": "Stage 2 \u2014 Refine", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVLoopingSampler" + }, + "widgets_values": [ + 264, + 24, + 1, + 0.5, + 1, + 2, + 1, + 1, + 0, + 0, + 1000, + "0, 240, 480" + ], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 53, + "type": "LTXVConcatAVLatent", + "pos": [2807.97697623735, 524.995187859747], + "size": [190.80550053502748, 46], + "flags": {}, + "order": 32, + "mode": 0, + "inputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "link": 42 + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "link": 43 + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [53] + } + ], + "title": "Stage 2 AV Concat", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVConcatAVLatent" + }, + "widgets_values": [], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 51, + "type": "LTXVLatentUpsampler", + "pos": [2494.204734318114, 557.0100775530725], + "size": [249.9123466065612, 66], + "flags": {}, + "order": 30, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 35 + }, + { + "localized_name": "upscale_model", + "name": "upscale_model", + "type": "LATENT_UPSCALE_MODEL", + "link": 36 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 37 } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [40] + } + ], + "title": "Spatial Upscale 2x", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVLatentUpsampler" + }, + "widgets_values": [] + }, + { + "id": 50, + "type": "LTXVSeparateAVLatent", + "pos": [2429.214969171252, 400.10348688717687], + "size": 
[172.5918083919587, 46], + "flags": {}, + "order": 29, + "mode": 0, + "inputs": [ + { + "localized_name": "av_latent", + "name": "av_latent", + "type": "LATENT", + "link": 34 + } + ], + "outputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "slot_index": 0, + "links": [35] + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "slot_index": 1, + "links": [43] + } + ], + "title": "Split Stage 1 AV", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVSeparateAVLatent" + }, + "widgets_values": [] + }, + { + "id": 33, + "type": "LTXVConcatAVLatent", + "pos": [1435.7500155700718, 795.296050582885], + "size": [174.92496730284756, 46], + "flags": {}, + "order": 25, + "mode": 0, + "inputs": [ + { + "localized_name": "video_latent", + "name": "video_latent", + "type": "LATENT", + "link": 18 + }, + { + "localized_name": "audio_latent", + "name": "audio_latent", + "type": "LATENT", + "link": 19 + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [31] + } + ], + "title": "Stage 1 AV Concat", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "LTXVConcatAVLatent" + }, + "widgets_values": [], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 35, + "type": "VAEEncode", + "pos": [1383.3333333333328, 1164.9999999999995], + "size": [206.36665954589844, 46], + "flags": {}, + "order": 21, + "mode": 0, + "inputs": [ + { + "localized_name": "pixels", + "name": "pixels", + "type": "IMAGE", + "link": 20 + }, + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 21 } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [33] + } + ], + "title": "Encode Reference Latent", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": 
"VAEEncode" + }, + "widgets_values": [] + }, + { + "id": 32, + "type": "LTXVImgToVideoConditionOnly", + "pos": [1399.999999999999, 604.9999999999992], + "size": [210, 122], + "flags": {}, + "order": 22, + "mode": 0, + "inputs": [ + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 14 }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 15 + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "link": 16 + }, + { + "localized_name": "strength", + "name": "strength", + "type": "FLOAT", + "widget": { "name": "strength" }, + "link": null + }, + { + "localized_name": "bypass", + "name": "bypass", + "shape": 7, + "type": "BOOLEAN", + "widget": { "name": "bypass" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, + "links": [18] + } + ], + "title": "Stage 1 I2V Cond", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVImgToVideoConditionOnly" + }, + "widgets_values": [0.7, false], + "color": "#335533", + "bgcolor": "#223322" + }, + { + "id": 52, + "type": "LTXVImgToVideoConditionOnly", + "pos": [2492.238483461759, 791.1178860526603], + "size": [210, 122], + "flags": {}, + "order": 31, + "mode": 0, + "inputs": [ + { "localized_name": "vae", "name": "vae", "type": "VAE", "link": 38 }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 39 + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "link": 40 + }, + { + "localized_name": "strength", + "name": "strength", + "type": "FLOAT", + "widget": { "name": "strength" }, + "link": null + }, + { + "localized_name": "bypass", + "name": "bypass", + "shape": 7, + "type": "BOOLEAN", + "widget": { "name": "bypass" }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "slot_index": 0, 
+ "links": [42] + } + ], + "title": "Stage 2 I2V Cond", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "LTXVImgToVideoConditionOnly" + }, + "widgets_values": [1, false], + "color": "#333355", + "bgcolor": "#222233" + }, + { + "id": 6, + "type": "Note", + "pos": [281.1738724586202, 1016.7247228103745], + "size": [631.0862190651818, 273.1698654463494], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { "Node name for S&R": "Note" }, + "widgets_values": [ + "## Three 10-Second Tiles \u2014 30s Video\n\n**Frame count:** 713 (29.7s at 24fps)\n**Tile size:** 264 (10.7s context per tile), Overlap: 24 (1s)\n**Tiles:** 3 temporal tiles, each ~10 seconds\n\n### Tile Prompts (MultiPromptProvider)\nPipe-separated prompts, one per tile. Edit to change per-tile narration.\nIf fewer prompts than tiles, the last prompt is reused.\n\n### Guiding Images\nThe reference image is repeated 3x and placed at tile boundaries:\n indices \"0, 240, 480\" \u2014 start of each tile in pixel frames.\nThis anchors subject identity across tile transitions.\n\n### Conditioning Image Indices\nIndices must be divisible by 8 (except 0).\nWith tile_size=264, overlap=24, pixel-space tile starts are:\n Tile 0: frame 0, Tile 1: frame 240, Tile 2: frame 480." 
+ ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 80, + "type": "RepeatImageBatch", + "pos": [11, 430], + "size": [220, 58], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { "name": "image", "type": "IMAGE", "link": 66 }, + { + "name": "amount", + "type": "INT", + "widget": { "name": "amount" }, + "link": null + } + ], + "outputs": [ + { "name": "IMAGE", "type": "IMAGE", "slot_index": 0, "links": [67] } + ], + "title": "Repeat Ref Image (3x)", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "RepeatImageBatch" + }, + "widgets_values": [3] + }, + { + "id": 81, + "type": "RepeatImageBatch", + "pos": [11, 650], + "size": [220, 58], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { "name": "image", "type": "IMAGE", "link": 68 }, + { + "name": "amount", + "type": "INT", + "widget": { "name": "amount" }, + "link": null + } + ], + "outputs": [ + { "name": "IMAGE", "type": "IMAGE", "slot_index": 0, "links": [69] } + ], + "title": "Repeat Resized Ref (3x)", + "properties": { + "cnr_id": "comfy-core", + "ver": "0.18.5", + "Node name for S&R": "RepeatImageBatch" + }, + "widgets_values": [3] + }, + { + "id": 82, + "type": "MultiPromptProvider", + "pos": [900, 440], + "size": [400, 220], + "flags": {}, + "order": 20, + "mode": 0, + "inputs": [ + { + "name": "prompts", + "type": "STRING", + "widget": { "name": "prompts" }, + "link": null + }, + { "name": "clip", "type": "CLIP", "link": 70 } + ], + "outputs": [ + { + "name": "conditionings", + "type": "CONDITIONING", + "slot_index": 0, + "links": [71, 72] + } + ], + "title": "Per-Tile Prompts (3 tiles)", + "properties": { + "cnr_id": "ComfyUI-LTXVideo", + "ver": "531512f7286963dc7aff1fd8bf5556e95eae03af", + "Node name for S&R": "MultiPromptProvider" + }, + "widgets_values": [ + "A woman walks through a sunlit garden, birds singing overhead. She smiles as petals fall gently around her. | She pauses by a fountain, trailing her fingers through the water. 
The camera slowly orbits around her as light plays on the surface. | She walks along a tree-lined path toward a distant gate. Leaves drift in the warm breeze as she disappears into golden light." + ] + } + ], + "links": [ + [1, 1, 0, 2, 0, "IMAGE"], + [2, 10, 0, 13, 0, "MODEL"], + [3, 11, 0, 20, 0, "CLIP"], + [4, 11, 0, 21, 0, "CLIP"], + [5, 20, 0, 22, 0, "CONDITIONING"], + [6, 21, 0, 22, 1, "CONDITIONING"], + [7, 4, 0, 22, 2, "FLOAT"], + [8, 1, 0, 23, 0, "IMAGE"], + [9, 3, 0, 30, 2, "INT"], + [10, 12, 0, 31, 0, "VAE"], + [11, 3, 0, 31, 1, "INT"], + [14, 10, 2, 32, 0, "VAE"], + [15, 2, 0, 32, 1, "IMAGE"], + [16, 30, 0, 32, 2, "LATENT"], + [18, 32, 0, 33, 0, "LATENT"], + [19, 31, 0, 33, 1, "LATENT"], + [20, 2, 0, 35, 0, "IMAGE"], + [21, 10, 2, 35, 1, "VAE"], + [22, 13, 0, 43, 0, "MODEL"], + [23, 22, 0, 43, 1, "CONDITIONING"], + [24, 22, 1, 43, 2, "CONDITIONING"], + [25, 13, 0, 44, 0, "MODEL"], + [26, 10, 2, 44, 1, "VAE"], + [27, 40, 0, 44, 2, "NOISE"], + [28, 41, 0, 44, 3, "SAMPLER"], + [29, 42, 0, 44, 4, "SIGMAS"], + [30, 43, 0, 44, 5, "GUIDER"], + [31, 33, 0, 44, 6, "LATENT"], + [33, 35, 0, 44, 10, "LATENT"], + [34, 44, 0, 50, 0, "LATENT"], + [35, 50, 0, 51, 0, "LATENT"], + [36, 14, 0, 51, 1, "LATENT_UPSCALE_MODEL"], + [37, 10, 2, 51, 2, "VAE"], + [38, 10, 2, 52, 0, "VAE"], + [39, 23, 0, 52, 1, "IMAGE"], + [40, 51, 0, 52, 2, "LATENT"], + [42, 52, 0, 53, 0, "LATENT"], + [43, 50, 1, 53, 1, "LATENT"], + [44, 13, 0, 63, 0, "MODEL"], + [45, 22, 0, 63, 1, "CONDITIONING"], + [46, 22, 1, 63, 2, "CONDITIONING"], + [47, 13, 0, 64, 0, "MODEL"], + [48, 10, 2, 64, 1, "VAE"], + [49, 60, 0, 64, 2, "NOISE"], + [50, 61, 0, 64, 3, "SAMPLER"], + [51, 62, 0, 64, 4, "SIGMAS"], + [52, 63, 0, 64, 5, "GUIDER"], + [53, 53, 0, 64, 6, "LATENT"], + [55, 64, 0, 70, 0, "LATENT"], + [56, 70, 0, 71, 9, "LATENT"], + [57, 10, 2, 71, 0, "VAE"], + [58, 70, 1, 72, 0, "LATENT"], + [59, 12, 0, 72, 1, "VAE"], + [60, 71, 0, 73, 0, "IMAGE"], + [61, 72, 0, 73, 1, "AUDIO"], + [62, 4, 0, 73, 2, "FLOAT"], + 
[63, 73, 0, 74, 0, "VIDEO"], + [64, 4, 0, 75, 0, "FLOAT"], + [65, 75, 0, 31, 2, "INT"], + [66, 2, 0, 80, 0, "IMAGE"], + [67, 80, 0, 44, 7, "IMAGE"], + [68, 23, 0, 81, 0, "IMAGE"], + [69, 81, 0, 64, 7, "IMAGE"], + [70, 11, 0, 82, 1, "CLIP"], + [71, 82, 0, 44, 9, "CONDITIONING"], + [72, 82, 0, 64, 9, "CONDITIONING"] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.878460000000002, + "offset": [-147.93391628437732, 99.35113851569274] + }, + "info": { + "name": "LTX-2.3 Two-Pass I2V Looping", + "description": "Two-pass I2V workflow for arbitrary-length video. Stage 1 generates at base resolution with temporal tiling. Stage 2 spatially upscales and refines. Soft guiding images at tile boundaries maintain subject continuity." + } + }, + "version": 0.4 +} diff --git a/example_workflows/LTX-2_V2V_Detailer.md b/example_workflows/LTX-2_V2V_Detailer.md new file mode 100644 index 0000000..5ec259e --- /dev/null +++ b/example_workflows/LTX-2_V2V_Detailer.md @@ -0,0 +1,191 @@ +# LTX-2 V2V Detailer — Tuning Notes + +## Workflow Overview + +Video-to-video detailer using LTX-2 19B with the IC-LoRA detailer. Upscales and refines +an input video by adding noise and denoising at the target resolution. + +**Default upscale target:** 1920px max dimension (via `ImageScaleToMaxDimension`) +**Sampler:** Euler +**Text encoder:** Gemma 3 12B IT + +--- + +## Known Issues at Large Upscale Ratios (e.g. 544 → 1920) + +A 3.5× upscale in a single pass forces the model to invent ~12× more pixel area than +the source. This causes two symptoms at the default sigma settings: + +- **Oversaturated colors** — model rebuilds rather than refines, drifting from source colors +- **Dithering/noise on fine textures** (hair, fabric) — hallucinated high-frequency detail + +--- + +## Key Parameters & Recommended Values + +### ManualSigmas — most impactful setting + +Controls how aggressively the model re-generates the video. Lower = preserves original more. 
+ +| Scenario | Values | +|---|---| +| Default (too aggressive for large upscales) | `0.909375, 0.725, 0.421875, 0.0` | +| Recommended starting point | `0.5, 0.35, 0.2, 0.0` | +| Conservative (colors still drifting) | `0.35, 0.2, 0.1, 0.0` | + +### LoRA Strength (LoraLoaderModelOnly) + +The detailer LoRA at full strength over-sharpens fine structures. + +| Default | Recommended | +|---|---| +| 1.0 | 0.65 – 0.75 | + +### LTXVLoopingSampler + +| Parameter | Default | Notes | +|---|---|---| +| guiding_strength | 1.0 | Keep at 1.0 — lowering causes drift from source | +| temporal_overlap_cond_strength | 0.5 | Leave as-is | +| horizontal_tiles / vertical_tiles | 1 / 1 | Single spatial tile at 1920px is fine | + +### LTXVSpatioTemporalTiledVAEDecode + +| Parameter | Default | Notes | +|---|---|---| +| spatial_tiles | 4 | Fine for 1920px | +| spatial_overlap | 4 | Fine as-is | +| temporal_tile_length | 16 | Fine as-is | + +--- + +## Recommended Tuning Order + +1. Set ManualSigmas to `0.5, 0.35, 0.2, 0.0` → run and compare +2. If hair/texture still dithers → reduce LoRA strength to 0.7 +3. If colors still saturated → drop sigmas further to `0.35, 0.2, 0.1, 0.0` +4. If quality still insufficient → split into two upscale passes (see below) + +--- + +## Two-Stage Upscaling (Best Quality for Large Ratios) + +Rather than one 3.5× jump, run the workflow twice: + +**Pass 1:** 544 → 1024, sigmas `0.5, 0.35, 0.2, 0.0` +**Pass 2:** 1024 → 1920, sigmas `0.25, 0.15, 0.05, 0.0` + +Pass 2 needs very low sigmas — most detail is already correct, it is only sharpening. + +--- + +## Handling Arbitrary-Length Videos + +The workflow can process videos of any length. `LoadVideo` loads the full clip, +`ImageScaleToMaxDimension` rescales every frame, `VAEEncodeTiled` encodes the +full sequence into a latent, and `LTXVLoopingSampler` tiles along the temporal +axis with overlapping chunks. 
+ +For a video with N latent frames, the sampler produces tiles as: + +``` +Tile 0: frames [0, temporal_tile_size) +Tile 1: frames [temporal_tile_size - temporal_overlap, 2*temporal_tile_size - temporal_overlap) +Tile 2: ... +``` + +Each tile is denoised independently (conditioned on the overlap region from the +previous tile), then the results are stitched. There is no hard upper bound on +video length — the sampler simply produces more temporal tiles. + +**Practical limits** are set by: + +- **VAE encode/decode memory**: the full video must be encoded and decoded. + `VAEEncodeTiled` and `LTXVSpatioTemporalTiledVAEDecode` tile spatially and + temporally, so this scales to long clips. Increase `spatial_tiles` or reduce + `temporal_tile_length` in the VAE decode node if the VAE step OOMs. +- **Latent tensor size**: the full video latent (shape `[1, 128, T, H, W]`) + must fit in memory at once. At 1280px, each latent frame is at most ~0.4MB (128 + channels × 40 × 40 × 2 bytes for bf16). A 10-minute clip at 24fps (14400 frames → + ~1800 latent frames) is at most ~740MB — easily fits. +- **Wall-clock time**: each temporal tile requires a full sampling pass. On + unified memory (~130GB/s bandwidth), a single tile at 1280px takes minutes. + A 10-minute clip with `temporal_tile_size=32, temporal_overlap=16` produces + ~113 tiles, which could take many hours. +- **Quality drift over many tiles**: temporal overlap conditioning keeps + adjacent tiles coherent, but over very long sequences the style can drift + gradually. `optional_normalizing_latents` and `adain_factor` can mitigate + this by anchoring color/contrast statistics. + +In practice, "infinite length" means you can process clips of any duration if +you have the patience. Memory is not the bottleneck — compute time is. + +--- + +## Strix Halo 128GB Unified Memory — OOM Prevention + +The default settings (1920px, single spatial tile, `temporal_tile_size=56`) +are tuned for discrete GPUs with fast HBM. 
On Strix Halo with ~120GB unified +memory allocated via TTM, the peak activation memory during sampling at 1920px +can exceed available GPU memory. + +### Where the memory goes + +| Component | Approximate size | +|---|---| +| LTX-2 19B (BF16) | ~38GB | +| Gemma 3 12B (Q4 quantized) | ~7GB | +| VAE | ~0.5GB | +| Activations during sampling (resolution-dependent) | 40–80GB+ at 1920px | + +With `--highvram` keeping all models resident, ~46GB is consumed before any +activations are allocated. + +### Recommended settings + +| Parameter | Default | Recommended | +|---|---|---| +| `ImageScaleToMaxDimension` | 1920 | **1280** (or 1024) | +| `horizontal_tiles` | 1 | **2** (at 1920px) or **1** (at 1280px) | +| `vertical_tiles` | 1 | **2** (at 1920px) or **1** (at 1280px) | +| `temporal_tile_size` | 56 | **32** | +| `temporal_overlap` | 24 | **16** | +| `ManualSigmas` | `0.909, 0.725, 0.422, 0.0` | `0.5, 0.35, 0.2, 0.0` | +| `LoRA strength` | 1.0 | **0.7** | +| `LTXVSpatioTemporalTiledVAEDecode spatial_tiles` | 4 | **6–8** if VAE OOMs | + +**Spatial tiling** (`horizontal_tiles × vertical_tiles`) is the most impactful +setting. It tiles the spatial dimension during sampling so that attention and +feedforward layers operate on a fraction of the full resolution. 2×2 at 1920px +reduces per-tile activation memory by roughly 4×. + +**Temporal tile size** reduction also helps: fewer frames per tile means a +shorter sequence length for the transformer, reducing both attention (O(n²)) +and feedforward memory. + +### LTXV Chunk FeedForward (KJNodes) + +The `LTXV Chunk FeedForward` node from comfyui-kjnodes can be added between +the model loader and the guider. It patches the feedforward layers in each +transformer block to process the token sequence in chunks rather than all at +once, reducing peak activation memory in the FFN (which expands hidden dim +by 4×). 
+ +| Parameter | Recommended | +|---|---| +| `chunks` | **2** (start here; increase to 3–4 if still tight) | +| `dim_threshold` | **4096** (default — only activates for large sequences) | + +This is a secondary optimization — spatial tiling has more impact because it +reduces memory for both attention and FFN. Use Chunk FeedForward in addition +to spatial tiling, not instead of it. Note the node is marked experimental and +may cause minor numerical differences in output. + +### If 1920px is required + +Use the two-stage approach: + +**Pass 1:** source → 1024, sigmas `0.5, 0.35, 0.2, 0.0`, 1×1 spatial tiles +**Pass 2:** 1024 → 1920, sigmas `0.25, 0.15, 0.05, 0.0`, 2×2 spatial tiles + +Each pass individually fits in memory. diff --git a/example_workflows/generate_two_pass_i2v_looping.py b/example_workflows/generate_two_pass_i2v_looping.py new file mode 100644 index 0000000..7ebaaa8 --- /dev/null +++ b/example_workflows/generate_two_pass_i2v_looping.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python3 +"""Generate a two-pass I2V arbitrary-length workflow for LTX-2.3. + +Stage 1: LTXVLoopingSampler at base resolution (~544p) with soft guiding + images at tile boundaries for subject/scene continuity. +Stage 2: Spatial upscale (2x) → LTXVLoopingSampler refinement at high + resolution with spatial tiling. 
+ +Run: python generate_two_pass_i2v_looping.py +Out: LTX-2.3_Two_Pass_I2V_Looping.json (importable ComfyUI workflow) +""" + +import json +import uuid + +# ─── Workflow builder ──────────────────────────────────────────────── + +_link_counter = 0 +_nodes: list[dict] = [] +_links: list[list] = [] + + +def _next_link_id(): + global _link_counter + _link_counter += 1 + return _link_counter + + +def node( + nid: int, + ntype: str, + pos: tuple[int, int], + widgets: list | None = None, + size: tuple[int, int] = (300, 200), + title: str | None = None, + color: str | None = None, + bgcolor: str | None = None, +): + """Register a node and return its id for wiring.""" + n = { + "id": nid, + "type": ntype, + "pos": list(pos), + "size": list(size), + "flags": {}, + "order": nid, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": {"Node name for S&R": ntype}, + "widgets_values": widgets if widgets is not None else [], + } + if title: + n["title"] = title + if color: + n["color"] = color + if bgcolor: + n["bgcolor"] = bgcolor + _nodes.append(n) + return nid + + +def inp(nid: int, name: str, typ: str): + """Declare an input slot on a node (call in slot order).""" + for n in _nodes: + if n["id"] == nid: + n["inputs"].append({"name": name, "type": typ, "link": None}) + return + raise ValueError(f"node {nid} not found") + + +def out(nid: int, name: str, typ: str): + """Declare an output slot on a node (call in slot order).""" + for n in _nodes: + if n["id"] == nid: + n["outputs"].append( + { + "name": name, + "type": typ, + "links": [], + "slot_index": len(n["outputs"]), + } + ) + return + raise ValueError(f"node {nid} not found") + + +def link(from_id: int, from_slot: int, to_id: int, to_slot: int, typ: str): + """Wire from_id:from_slot → to_id:to_slot.""" + lid = _next_link_id() + _links.append([lid, from_id, from_slot, to_id, to_slot, typ]) + # Update node bookkeeping + for n in _nodes: + if n["id"] == from_id and from_slot < len(n["outputs"]): + 
n["outputs"][from_slot]["links"].append(lid) + if n["id"] == to_id and to_slot < len(n["inputs"]): + n["inputs"][to_slot]["link"] = lid + + +def build(): + return { + "id": str(uuid.uuid4()), + "revision": 0, + "last_node_id": max(n["id"] for n in _nodes), + "last_link_id": _link_counter, + "nodes": _nodes, + "links": _links, + "groups": [], + "config": {}, + "extra": { + "ds": {"scale": 0.6, "offset": [0, 0]}, + "info": { + "name": "LTX-2.3 Two-Pass I2V Looping", + "description": ( + "Two-pass I2V workflow for arbitrary-length video. " + "Stage 1 generates at base resolution with temporal tiling. " + "Stage 2 spatially upscales and refines. " + "Soft guiding images at tile boundaries maintain subject continuity." + ), + }, + }, + "version": 0.4, + } + + +# ─── Layout constants ─────────────────────────────────────────────── + +COL_INPUT = 0 +COL_MODELS = 450 +COL_TEXT = 900 +COL_S1_PREP = 1400 +COL_S1_SAMPLE = 1950 +COL_MID = 2500 +COL_S2_SAMPLE = 3050 +COL_OUTPUT = 3600 + +ROW_TOP = 0 +ROW_MID = 400 +ROW_BOT = 800 +ROW_DEEP = 1200 + +# Group colors +S1_COLOR = "#335533" +S1_BG = "#223322" +S2_COLOR = "#333355" +S2_BG = "#222233" + +# ─── Nodes ─────────────────────────────────────────────────────────── + +# ── Shared primitives ── + +node(1, "LoadImage", (COL_INPUT, ROW_TOP), ["reference_image.png", "image"], (300, 300)) +out(1, "IMAGE", "IMAGE") +out(1, "MASK", "MASK") + +node(2, "LTXVPreprocess", (COL_INPUT, ROW_TOP + 340), [18]) +inp(2, "image", "IMAGE") +out(2, "output_image", "IMAGE") +link(1, 0, 2, 0, "IMAGE") # LoadImage → Preprocess + +node(3, "PrimitiveInt", (COL_INPUT, ROW_BOT), [241, "fixed"], (200, 100), + title="Frame Count") +out(3, "INT", "INT") + +node(4, "PrimitiveFloat", (COL_INPUT, ROW_BOT + 130), [24], (200, 100), + title="Frame Rate") +out(4, "FLOAT", "FLOAT") + +node(5, "PrimitiveBoolean", (COL_INPUT, ROW_BOT + 260), [True], (200, 80), + title="I2V Enable") +out(5, "BOOLEAN", "BOOLEAN") + +# Guiding image indices — comma-separated pixel 
frame positions. +# Default "0" = reference at first frame only. +# For 3 tiles (241 frames, tile_size=128, overlap=24): +# "0, 104, 208" places the guiding image at each tile boundary. +# The number of indices must match the number of guiding images. +# With a single image and "0", only frame 0 gets soft conditioning. +# Use optional_negative_index_latents for global subject anchoring. +node(6, "Note", (COL_INPUT, ROW_DEEP), [ + "## Guiding Image Indices\n\n" + "Set `optional_cond_image_indices` on the Stage 1 Looping Sampler.\n" + "Default: \"0\" (reference image at first frame only).\n\n" + "For multi-tile conditioning, set indices at tile boundaries.\n" + "With tile_size=128, overlap=24, new content starts every 104 frames:\n" + " \"0, 104, 208\" for a 241-frame (3-tile) clip.\n\n" + "The number of images in the guiding batch must match the indices.\n" + "Use LatentBatch or ImageBatch to provide multiple images.\n" + "By default, a single reference image at index 0 is used." +], (400, 280)) + +# ── Model loading ── + +node(10, "CheckpointLoaderSimple", (COL_MODELS, ROW_TOP), + ["ltx-2.3-22b-dev.safetensors"], (350, 150)) +out(10, "MODEL", "MODEL") +out(10, "CLIP", "CLIP") +out(10, "VAE", "VAE") + +node(11, "LTXAVTextEncoderLoader", (COL_MODELS, ROW_TOP + 180), + ["comfy_gemma_3_12B_it.safetensors", "ltx-2.3-22b-dev.safetensors", "default"], + (380, 130)) +out(11, "CLIP", "CLIP") + +node(12, "LTXVAudioVAELoader", (COL_MODELS, ROW_MID), + ["ltx-2.3-22b-dev.safetensors"], (350, 100)) +out(12, "Audio VAE", "VAE") + +node(13, "LoraLoaderModelOnly", (COL_MODELS, ROW_MID + 130), + ["ltx-2.3-22b-distilled-lora-384.safetensors", 0.5], (380, 100), + title="Distilled LoRA (both stages)") +inp(13, "model", "MODEL") +out(13, "MODEL", "MODEL") +link(10, 0, 13, 0, "MODEL") # Checkpoint → LoRA + +node(14, "LatentUpscaleModelLoader", (COL_MODELS, ROW_MID + 260), + ["ltx-2.3-spatial-upscaler-x2-1.1.safetensors"], (380, 100)) +out(14, "LATENT_UPSCALE_MODEL", 
"LATENT_UPSCALE_MODEL") + +# ── Text encoding ── + +node(20, "CLIPTextEncode", (COL_TEXT, ROW_TOP), + ["A woman walks through a sunlit meadow. Warm breeze rustles the tall grass. " + "Birds sing in the distance. She pauses to admire wildflowers."], + (400, 180), title="Positive Prompt") +inp(20, "clip", "CLIP") +out(20, "CONDITIONING", "CONDITIONING") +link(11, 0, 20, 0, "CLIP") + +node(21, "CLIPTextEncode", (COL_TEXT, ROW_TOP + 220), + ["pc game, console game, video game, cartoon, childish, ugly, blurry"], + (400, 120), title="Negative Prompt") +inp(21, "clip", "CLIP") +out(21, "CONDITIONING", "CONDITIONING") +link(11, 0, 21, 0, "CLIP") + +node(22, "LTXVConditioning", (COL_TEXT, ROW_MID), [24], (300, 120)) +inp(22, "positive", "CONDITIONING") +inp(22, "negative", "CONDITIONING") +inp(22, "frame_rate", "FLOAT") +out(22, "positive", "CONDITIONING") +out(22, "negative", "CONDITIONING") +link(20, 0, 22, 0, "CONDITIONING") +link(21, 0, 22, 1, "CONDITIONING") +link(4, 0, 22, 2, "FLOAT") + +# ── Resize reference image (for both stages) ── + +node(23, "ResizeImageMaskNode", (COL_INPUT + 340, ROW_TOP + 340), + ["scale longer dimension", 1536, "lanczos"], (300, 120), + title="Resize Reference") +inp(23, "input", "IMAGE,MASK") +out(23, "resized", "IMAGE") +link(1, 0, 23, 0, "IMAGE") # Original image → resize + +# ── Stage 1 prep ── + +node(30, "EmptyLTXVLatentVideo", (COL_S1_PREP, ROW_TOP), [960, 544, 241, 1], + (250, 150), title="Stage 1 Empty Latent") +inp(30, "length", "INT") +out(30, "LATENT", "LATENT") +link(3, 0, 30, 0, "INT") # Frame count + +node(31, "LTXVEmptyLatentAudio", (COL_S1_PREP, ROW_TOP + 180), [97, 25, 1], + (250, 130)) +inp(31, "audio_vae", "VAE") +inp(31, "frames_number", "INT") +inp(31, "frame_rate", "INT") +out(31, "Latent", "LATENT") +link(12, 0, 31, 0, "VAE") # Audio VAE +link(3, 0, 31, 1, "INT") # Frame count + +node(34, "CM_FloatToInt", (COL_S1_PREP - 100, ROW_MID + 60), [0], (150, 80), + title="FPS→Int") +inp(34, "a", "FLOAT") +out(34, "INT", 
"INT") +link(4, 0, 34, 0, "FLOAT") +link(34, 0, 31, 2, "INT") # Frame rate int → audio + +node(32, "LTXVImgToVideoConditionOnly", (COL_S1_PREP, ROW_MID), + [0.7, False], (300, 130), title="Stage 1 I2V Cond", + color=S1_COLOR, bgcolor=S1_BG) +inp(32, "vae", "VAE") +inp(32, "image", "IMAGE") +inp(32, "latent", "LATENT") +inp(32, "bypass", "BOOLEAN") +out(32, "latent", "LATENT") +link(10, 2, 32, 0, "VAE") # Checkpoint VAE +link(2, 0, 32, 1, "IMAGE") # Preprocessed reference +link(30, 0, 32, 2, "LATENT") # Empty latent +link(5, 0, 32, 3, "BOOLEAN") # I2V enable + +node(33, "LTXVConcatAVLatent", (COL_S1_PREP, ROW_BOT), [], + (250, 100), title="Stage 1 AV Concat", + color=S1_COLOR, bgcolor=S1_BG) +inp(33, "video_latent", "LATENT") +inp(33, "audio_latent", "LATENT") +out(33, "latent", "LATENT") +link(32, 0, 33, 0, "LATENT") # Conditioned video latent +link(31, 0, 33, 1, "LATENT") # Empty audio latent + +# VAE-encode reference for negative_index_latents (global subject anchor) +node(35, "VAEEncode", (COL_S1_PREP, ROW_DEEP), [], (250, 100), + title="Encode Reference Latent") +inp(35, "pixels", "IMAGE") +inp(35, "vae", "VAE") +out(35, "LATENT", "LATENT") +link(2, 0, 35, 0, "IMAGE") # Preprocessed reference +link(10, 2, 35, 1, "VAE") # Checkpoint VAE + +# ── Stage 1 sampling ── + +node(40, "RandomNoise", (COL_S1_SAMPLE, ROW_TOP - 80), [42, "fixed"], + (200, 100), title="Stage 1 Noise") +out(40, "NOISE", "NOISE") + +node(41, "KSamplerSelect", (COL_S1_SAMPLE, ROW_TOP + 40), + ["euler_ancestral_cfg_pp"], (250, 80), title="Stage 1 Sampler") +out(41, "SAMPLER", "SAMPLER") + +node(42, "ManualSigmas", (COL_S1_SAMPLE, ROW_TOP + 140), + ["1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0"], + (350, 80), title="Stage 1 Sigmas") +out(42, "SIGMAS", "SIGMAS") + +node(43, "CFGGuider", (COL_S1_SAMPLE, ROW_TOP + 240), [1], (250, 130), + title="Stage 1 Guider", color=S1_COLOR, bgcolor=S1_BG) +inp(43, "model", "MODEL") +inp(43, "positive", "CONDITIONING") +inp(43, 
"negative", "CONDITIONING") +out(43, "GUIDER", "GUIDER") +link(13, 0, 43, 0, "MODEL") # Model with distilled LoRA +link(22, 0, 43, 1, "CONDITIONING") # Positive +link(22, 1, 43, 2, "CONDITIONING") # Negative + +# LTXVLoopingSampler — Stage 1 +# Widgets: temporal_tile_size, temporal_overlap, guiding_strength, +# temporal_overlap_cond_strength, cond_image_strength, +# horizontal_tiles, vertical_tiles, spatial_overlap, +# adain_factor, guiding_start_step, guiding_end_step, +# optional_cond_image_indices +node(44, "LTXVLoopingSampler", (COL_S1_SAMPLE, ROW_MID), + [128, 24, 1.0, 0.5, 1.0, 1, 1, 1, 0.15, 0, 1000, "0"], + (400, 580), title="Stage 1 — Generate", + color=S1_COLOR, bgcolor=S1_BG) +# Required inputs (slots 0-6) +inp(44, "model", "MODEL") +inp(44, "vae", "VAE") +inp(44, "noise", "NOISE") +inp(44, "sampler", "SAMPLER") +inp(44, "sigmas", "SIGMAS") +inp(44, "guider", "GUIDER") +inp(44, "latents", "LATENT") +# Optional inputs (slots 7-11) +inp(44, "optional_cond_images", "IMAGE") +inp(44, "optional_guiding_latents", "LATENT") +inp(44, "optional_positive_conditionings", "CONDITIONING") +inp(44, "optional_negative_index_latents", "LATENT") +inp(44, "optional_normalizing_latents", "LATENT") +out(44, "denoised_output", "LATENT") + +link(13, 0, 44, 0, "MODEL") # Model with distilled LoRA +link(10, 2, 44, 1, "VAE") # Checkpoint VAE +link(40, 0, 44, 2, "NOISE") # Noise +link(41, 0, 44, 3, "SAMPLER") # Sampler +link(42, 0, 44, 4, "SIGMAS") # Sigmas +link(43, 0, 44, 5, "GUIDER") # Guider +link(33, 0, 44, 6, "LATENT") # AV latent (video + audio) +link(2, 0, 44, 7, "IMAGE") # Guiding images (preprocessed reference) +# slot 8: optional_guiding_latents — not connected (no IC-LoRA guide) +# slot 9: optional_positive_conditionings — not connected (single prompt) +link(35, 0, 44, 10, "LATENT") # Negative index latents (global subject anchor) +# slot 11: optional_normalizing_latents — not connected + +# ── Between stages ── +# Single split of stage 1 AV output: +# video → 
upscaler → stage 2 looping sampler +# audio → stage 2 AV concat (node 53), refined jointly with the upscaled video + +node(50, "LTXVSeparateAVLatent", (COL_MID, ROW_MID), [], (250, 100), + title="Split Stage 1 AV") +inp(50, "av_latent", "LATENT") +out(50, "video_latent", "LATENT") +out(50, "audio_latent", "LATENT") +link(44, 0, 50, 0, "LATENT") # Stage 1 output → split + +node(51, "LTXVLatentUpsampler", (COL_MID, ROW_MID + 130), [], (300, 100), + title="Spatial Upscale 2x") +inp(51, "samples", "LATENT") +inp(51, "upscale_model", "LATENT_UPSCALE_MODEL") +inp(51, "vae", "VAE") +out(51, "LATENT", "LATENT") +link(50, 0, 51, 0, "LATENT") # Video latent only +link(14, 0, 51, 1, "LATENT_UPSCALE_MODEL") # Upscale model +link(10, 2, 51, 2, "VAE") # VAE + +node(52, "LTXVImgToVideoConditionOnly", (COL_MID, ROW_MID + 260), + [1.0, False], (300, 130), title="Stage 2 I2V Cond", + color=S2_COLOR, bgcolor=S2_BG) +inp(52, "vae", "VAE") +inp(52, "image", "IMAGE") +inp(52, "latent", "LATENT") +inp(52, "bypass", "BOOLEAN") +out(52, "latent", "LATENT") +link(10, 2, 52, 0, "VAE") # VAE +link(23, 0, 52, 1, "IMAGE") # Resized reference (full res for stage 2) +link(51, 0, 52, 2, "LATENT") # Upscaled video latent +link(5, 0, 52, 3, "BOOLEAN") # I2V enable + +# Stage 2 receives AV latent (upscaled video + stage 1 audio). +# The looping sampler preserves input audio data for refinement: +# base tile uses the corresponding input audio slice, extend tiles +# pass source audio for new-frame initialization via _audio_new_init. 
+node(53, "LTXVConcatAVLatent", (COL_MID, ROW_BOT + 200), [], (250, 100), + title="Stage 2 AV Concat", color=S2_COLOR, bgcolor=S2_BG) +inp(53, "video_latent", "LATENT") +inp(53, "audio_latent", "LATENT") +out(53, "latent", "LATENT") +link(52, 0, 53, 0, "LATENT") # Conditioned upscaled video +link(50, 1, 53, 1, "LATENT") # Audio from stage 1 + +# ── Stage 2 sampling ── + +node(60, "RandomNoise", (COL_S2_SAMPLE, ROW_TOP - 80), [43, "fixed"], + (200, 100), title="Stage 2 Noise") +out(60, "NOISE", "NOISE") + +node(61, "KSamplerSelect", (COL_S2_SAMPLE, ROW_TOP + 40), + ["euler_cfg_pp"], (250, 80), title="Stage 2 Sampler") +out(61, "SAMPLER", "SAMPLER") + +node(62, "ManualSigmas", (COL_S2_SAMPLE, ROW_TOP + 140), + ["0.85, 0.7250, 0.4219, 0.0"], (300, 80), title="Stage 2 Sigmas") +out(62, "SIGMAS", "SIGMAS") + +node(63, "CFGGuider", (COL_S2_SAMPLE, ROW_TOP + 240), [1], (250, 130), + title="Stage 2 Guider", color=S2_COLOR, bgcolor=S2_BG) +inp(63, "model", "MODEL") +inp(63, "positive", "CONDITIONING") +inp(63, "negative", "CONDITIONING") +out(63, "GUIDER", "GUIDER") +link(13, 0, 63, 0, "MODEL") # Same model with distilled LoRA +link(22, 0, 63, 1, "CONDITIONING") # Same positive +link(22, 1, 63, 2, "CONDITIONING") # Same negative + +# LTXVLoopingSampler — Stage 2 +# spatial tiling 2x1 for upscaled resolution +node(64, "LTXVLoopingSampler", (COL_S2_SAMPLE, ROW_MID), + [128, 24, 1.0, 0.5, 1.0, 2, 1, 1, 0.0, 0, 1000, "0"], + (400, 580), title="Stage 2 — Refine", + color=S2_COLOR, bgcolor=S2_BG) +inp(64, "model", "MODEL") +inp(64, "vae", "VAE") +inp(64, "noise", "NOISE") +inp(64, "sampler", "SAMPLER") +inp(64, "sigmas", "SIGMAS") +inp(64, "guider", "GUIDER") +inp(64, "latents", "LATENT") +inp(64, "optional_cond_images", "IMAGE") +inp(64, "optional_guiding_latents", "LATENT") +inp(64, "optional_positive_conditionings", "CONDITIONING") +inp(64, "optional_negative_index_latents", "LATENT") +inp(64, "optional_normalizing_latents", "LATENT") +out(64, "denoised_output", "LATENT") + 
+link(13, 0, 64, 0, "MODEL") # Model with distilled LoRA +link(10, 2, 64, 1, "VAE") # VAE +link(60, 0, 64, 2, "NOISE") # Noise +link(61, 0, 64, 3, "SAMPLER") # Sampler +link(62, 0, 64, 4, "SIGMAS") # Sigmas +link(63, 0, 64, 5, "GUIDER") # Guider +link(53, 0, 64, 6, "LATENT") # Stage 2 AV latent (upscaled video + stage 1 audio) +link(23, 0, 64, 7, "IMAGE") # Guiding images (resized reference) +# slot 8-11: not connected for stage 2 + +# ── Output ── +# Both video and audio from stage 2 (refined jointly). + +node(70, "LTXVSeparateAVLatent", (COL_OUTPUT, ROW_MID), [], (250, 100), + title="Split Final AV") +inp(70, "av_latent", "LATENT") +out(70, "video_latent", "LATENT") +out(70, "audio_latent", "LATENT") +link(64, 0, 70, 0, "LATENT") # Stage 2 AV output + +node(71, "LTXVSpatioTemporalTiledVAEDecode", (COL_OUTPUT, ROW_MID + 130), + [6, 4, 16, 4, False, "auto", "auto"], (350, 200), + title="Decode Video (Tiled)") +inp(71, "samples", "LATENT") +inp(71, "vae", "VAE") +out(71, "IMAGE", "IMAGE") +link(70, 0, 71, 0, "LATENT") # Refined video +link(10, 2, 71, 1, "VAE") # VAE + +node(72, "LTXVAudioVAEDecode", (COL_OUTPUT, ROW_MID + 360), [], (250, 100)) +inp(72, "samples", "LATENT") +inp(72, "audio_vae", "VAE") +out(72, "Audio", "AUDIO") +link(70, 1, 72, 0, "LATENT") # Refined audio +link(12, 0, 72, 1, "VAE") # Audio VAE + +node(73, "CreateVideo", (COL_OUTPUT, ROW_BOT + 200), [30], (250, 100)) +inp(73, "images", "IMAGE") +inp(73, "audio", "AUDIO") +inp(73, "fps", "FLOAT") +out(73, "VIDEO", "VIDEO") +link(71, 0, 73, 0, "IMAGE") +link(72, 0, 73, 1, "AUDIO") +link(4, 0, 73, 2, "FLOAT") # Frame rate + +node(74, "SaveVideo", (COL_OUTPUT, ROW_DEEP), ["LTX-2.3/Looping", "auto", "auto"], + (250, 100)) +inp(74, "video", "VIDEO") +link(73, 0, 74, 0, "VIDEO") + + +# ─── Generate ──────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os + + wf = build() + out_path = os.path.join(os.path.dirname(__file__), "LTX-2.3_Two_Pass_I2V_Looping.json") + 
with open(out_path, "w") as f: + json.dump(wf, f, indent=2) + print(f"Wrote {out_path}") + print(f" {len(_nodes)} nodes, {len(_links)} links") diff --git a/looping_sampler.py b/looping_sampler.py index 874a06a..20c7892 100644 --- a/looping_sampler.py +++ b/looping_sampler.py @@ -1,10 +1,12 @@ import copy from dataclasses import dataclass +from typing import Optional import comfy import torch +from comfy.nested_tensor import NestedTensor -from .easy_samplers import LTXVBaseSampler, LTXVExtendSampler, LTXVInContextSampler +from .easy_samplers import LTXVBaseSampler, LTXVExtendSampler, LTXVInContextSampler, _get_raw_conds_from_guider from .latents import LTXVDilateLatent, LTXVSelectLatents from .nodes_registry import comfy_node @@ -229,6 +231,16 @@ def INPUT_TYPES(s): "tooltip": "The latents to use for normalizing the output latents, they will be used to normalize the output latents to the same statistics as the input latents." }, ), + "optional_negative_index_strength": ( + "FLOAT", + { + "default": 1.0, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "tooltip": "The strength of the negative-index latent conditioning. Lower values reduce the influence of the reference image(s) provided via optional_negative_index_latents.", + }, + ), }, } @@ -244,9 +256,14 @@ def _extract_latent_spatial_tile(self, latent_dict, v_start, v_end, h_start, h_e return None tile_samples = latent_dict["samples"][:, :, :, v_start:v_end, h_start:h_end] if "noise_mask" in latent_dict and latent_dict["noise_mask"] is not None: - tile_masks = latent_dict["noise_mask"][ - :, :, :, v_start:v_end, h_start:h_end - ] + noise_mask = latent_dict["noise_mask"] + # If the noise mask has broadcast spatial dims (1x1), keep them + # as-is rather than slicing (which would produce zero-size dims + # for tiles starting past index 0). 
+ if noise_mask.ndim == 5 and noise_mask.shape[3] <= 1 and noise_mask.shape[4] <= 1: + tile_masks = noise_mask + else: + tile_masks = noise_mask[:, :, :, v_start:v_end, h_start:h_end] return {"samples": tile_samples, "noise_mask": tile_masks} else: return {"samples": tile_samples} @@ -312,11 +329,13 @@ def _process_temporal_chunks( tile_config: TileConfig, sampling_config: SamplingConfig, model_config: ModelConfig, + audio_info: Optional[dict] = None, ): """Process all temporal chunks for a single spatial tile.""" chunk_index = 0 tile_out_latents = None first_tile_out_latents = None + accumulated_audio = None for i_temporal_tile, (start_index, end_index) in enumerate( zip( @@ -431,6 +450,55 @@ def _process_temporal_chunks( [str(i) for i in this_chunk_keyframe_indices] ) if start_index == 0: + # Create audio tile for the base tile. + # If input audio data is available (stage-2 refinement), + # use the corresponding slice; otherwise create zeros + # (stage-1 generation from scratch). + audio_tile = None + if audio_info is not None: + video_tile_frames = min( + sampling_config.temporal_tile_size, + tile_config.tile_latents["samples"].shape[2], + ) + audio_tile_frames = max( + 1, + round( + video_tile_frames + * audio_info["total_audio_frames"] + / max(audio_info["total_video_frames"], 1) + ), + ) + src_audio = audio_info.get("tensor") + if src_audio is not None: + # Refinement: use input audio slice + available = min(audio_tile_frames, src_audio.shape[2]) + audio_tile = src_audio[:, :, :available].clone() + if available < audio_tile_frames: + pad = torch.zeros( + 1, audio_info["channels"], + audio_tile_frames - available, + audio_info["freq_bins"], + device=audio_info["device"], + dtype=audio_info["dtype"], + ) + audio_tile = torch.cat([audio_tile, pad], dim=2) + print( + f"[LoopingSampler] Base tile audio (from input): {audio_tile.shape}" + ) + else: + # Generation: start from zeros + audio_tile = torch.zeros( + 1, + audio_info["channels"], + audio_tile_frames, + 
audio_info["freq_bins"], + device=audio_info["device"], + dtype=audio_info["dtype"], + ) + print( + f"[LoopingSampler] Base tile audio (zeros): {audio_tile.shape}" + ) + if tile_config.tile_guiding_latents is not None: tile_out_latents = LTXVInContextSampler().sample( vae=model_config.vae, @@ -450,6 +518,7 @@ def _process_temporal_chunks( guiding_strength=sampling_config.guiding_strength, guiding_start_step=sampling_config.guiding_start_step, guiding_end_step=sampling_config.guiding_end_step, + _audio_tile=audio_tile, )[0] else: tile_out_latents = LTXVBaseSampler().sample( @@ -483,9 +552,43 @@ def _process_temporal_chunks( optional_initialization_latents=latent_chunk, guiding_start_step=sampling_config.guiding_start_step, guiding_end_step=sampling_config.guiding_end_step, + _audio_tile=audio_tile, )[0] + + # Extract denoised audio from base tile + accumulated_audio = tile_out_latents.pop("_audio", None) first_tile_out_latents = copy.deepcopy(tile_out_latents) else: + # Compute audio init data for the "new frames" portion of + # this extend tile (for stage-2 refinement). + _audio_new_init = None + src_audio = audio_info.get("tensor") if audio_info else None + if src_audio is not None and accumulated_audio is not None: + # The extend tile adds new video frames after the overlap. + # Map the video new-frame region to audio frames. 
+ acc_audio_T = accumulated_audio.shape[2] + audio_ratio = ( + audio_info["total_audio_frames"] + / max(audio_info["total_video_frames"], 1) + ) + video_new_latent = ( + latent_chunk["samples"].shape[2] + - sampling_config.temporal_overlap + ) + audio_new_frames = max( + 1, round(video_new_latent * audio_ratio) + ) + # The new audio starts where accumulated audio ends + audio_start = acc_audio_T + audio_end = min( + audio_start + audio_new_frames, + src_audio.shape[2], + ) + if audio_start < src_audio.shape[2]: + _audio_new_init = src_audio[ + :, :, audio_start:audio_end + ] + tile_out_latents = LTXVExtendSampler().sample( model=model_config.model, vae=model_config.vae, @@ -516,10 +619,19 @@ def _process_temporal_chunks( optional_initialization_latents=latent_chunk, guiding_start_step=sampling_config.guiding_start_step, guiding_end_step=sampling_config.guiding_end_step, + _audio_tile=accumulated_audio, + _audio_new_init=_audio_new_init, )[0] + # Update accumulated audio from extend tile + accumulated_audio = tile_out_latents.pop("_audio", accumulated_audio) + chunk_index += 1 + # Store accumulated audio in the output for the caller + if accumulated_audio is not None: + tile_out_latents["_audio"] = accumulated_audio + return tile_out_latents def _create_spatial_weights( @@ -600,17 +712,24 @@ def _prepare_guider_for_chunk( """Prepare the guider for a specific chunk, handling optional positive conditionings.""" if optional_positive_conditionings is not None: new_guider = copy.copy(guider) - positive, negative = guider.raw_conds + positive, negative = _get_raw_conds_from_guider(guider) # Use the conditioning at chunk_index, or the last one if we've run out conditioning_index = min( chunk_index, len(optional_positive_conditionings) - 1 ) + new_cond = optional_positive_conditionings[conditioning_index] + print( + f"[LoopingSampler] Chunk {chunk_index}: using prompt {conditioning_index} " + f"(of {len(optional_positive_conditionings)}), " + f"cond 
shape={new_cond[0][0].shape if new_cond and len(new_cond[0]) > 0 else 'N/A'}, " + f"has frame_rate={'frame_rate' in new_cond[0][1] if new_cond and len(new_cond[0]) > 1 else 'N/A'}" + ) new_guider.set_conds( - optional_positive_conditionings[conditioning_index], + new_cond, negative, ) new_guider.raw_conds = ( - optional_positive_conditionings[conditioning_index], + new_cond, negative, ) return new_guider @@ -710,7 +829,7 @@ def sample( cond_image_strength=1.0, optional_guiding_latents=None, optional_negative_index_latents=None, - optional_negative_index_strength=1.0, # hidden interface + optional_negative_index_strength=1.0, optional_positive_conditionings=None, guiding_start_step=0, guiding_end_step=1000, @@ -720,13 +839,36 @@ def sample( ): # Get dimensions and prepare for spatial tiling samples = latents["samples"] + + # Handle AV latents: separate video and audio, process video through + # the tile loop, then reassemble AV output at the end. + audio_info = None if ( - isinstance(samples, comfy.nested_tensor.NestedTensor) + isinstance(samples, NestedTensor) and len(samples.tensors) == 2 ): - raise ValueError( - "LoopingSampler currently does not support Audio Visual latents. please only use video latents." 
+ video_tensor = samples.tensors[0] + audio_tensor = samples.tensors[1] + audio_info = { + "channels": audio_tensor.shape[1], + "freq_bins": audio_tensor.shape[3], + "total_video_frames": video_tensor.shape[2], + "total_audio_frames": audio_tensor.shape[2], + "device": audio_tensor.device, + "dtype": audio_tensor.dtype, + "tensor": audio_tensor, # preserve for stage-2 refinement + } + # Switch to video-only for existing tiling logic + latents = latents.copy() + latents["samples"] = video_tensor + if "noise_mask" in latents and isinstance(latents["noise_mask"], NestedTensor): + latents["noise_mask"] = latents["noise_mask"].tensors[0] + samples = video_tensor + print( + f"[LoopingSampler] AV latent detected: video={video_tensor.shape}, " + f"audio={audio_tensor.shape}. Audio will be generated jointly." ) + batch, channels, frames, height, width = samples.shape time_scale_factor, width_scale_factor, height_scale_factor = ( vae.downscale_index_formula @@ -890,12 +1032,19 @@ def sample( guider=guider, ) + # Only process audio for the first spatial tile (audio has no spatial dim) + tile_audio_info = audio_info if (v == 0 and h == 0) else None tile_out_latents = self._process_temporal_chunks( tile_config, sampling_config, model_config, + audio_info=tile_audio_info, ) + # Extract accumulated audio from first spatial tile + if v == 0 and h == 0 and audio_info is not None: + accumulated_audio = tile_out_latents.pop("_audio", None) + # Initialize output tensors on first tile (to get correct temporal dimension) if final_output is None: out_temporal = tile_out_latents["samples"].shape[2] @@ -931,7 +1080,16 @@ def sample( # Normalize by weights final_output = final_output / (weights + 1e-8) - out_latents = {"samples": final_output} + + # Reassemble AV output if audio was processed + if audio_info is not None and accumulated_audio is not None: + out_latents = {"samples": NestedTensor([final_output, accumulated_audio])} + print( + f"[LoopingSampler] AV output: 
video={final_output.shape}, " + f"audio={accumulated_audio.shape}" + ) + else: + out_latents = {"samples": final_output} noise.seed = first_seed return (out_latents,) @@ -951,11 +1109,23 @@ def INPUT_TYPES(s): { "multiline": True, "dynamicPrompts": True, - "tooltip": "Prompts to encode, one per line. Each prompt will be encoded separately. Each prompt will be used in one temporal_tile in LTXVLoopingSampler.", + "tooltip": "Prompts to encode, separated by |. Each prompt will be encoded separately. Each prompt will be used in one temporal_tile in LTXVLoopingSampler.", }, ), "clip": ("CLIP", {"tooltip": "CLIP model to encode the prompts."}), }, + "optional": { + "frame_rate": ( + "FLOAT", + { + "default": 24.0, + "min": 0.0, + "max": 1000.0, + "step": 0.01, + "tooltip": "Frame rate to embed in the conditioning (same as LTXVConditioning). Required for proper temporal and audio generation.", + }, + ), + }, } RETURN_TYPES = ("CONDITIONING",) @@ -964,11 +1134,16 @@ def INPUT_TYPES(s): FUNCTION = "get_prompt_list" CATEGORY = "prompt" - def get_prompt_list(self, prompts, clip): + def get_prompt_list(self, prompts, clip, frame_rate=24.0): + import node_helpers + prompt_list = prompts.split("|") prompt_list = [prompt.strip() for prompt in prompt_list] encoded_prompt_list = [ - clip.encode_from_tokens_scheduled(clip.tokenize(prompt)) + node_helpers.conditioning_set_values( + clip.encode_from_tokens_scheduled(clip.tokenize(prompt)), + {"frame_rate": frame_rate}, + ) for prompt in prompt_list ] return (encoded_prompt_list,)