huggingface · yiyixuxu · May 28, 2026 · May 14, 2026 · May 15, 2026 · May 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -170,6 +170,9 @@ tags
 
 # RL pipelines may produce mp4 outputs
 *.mp4
+*.jpg
+*.jpeg
+*.wav
 
 # dependencies
 /transformers

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -321,6 +321,8 @@
         title: CogView4Transformer2DModel
       - local: api/models/consisid_transformer3d
         title: ConsisIDTransformer3DModel
+      - local: api/models/cosmos3_omni_transformer
+        title: Cosmos3OmniTransformer
       - local: api/models/cosmos_transformer3d
         title: CosmosTransformer3DModel
       - local: api/models/dit_transformer2d
@@ -645,6 +647,8 @@
         title: ConsisID
       - local: api/pipelines/cosmos
         title: Cosmos
+      - local: api/pipelines/cosmos3
+        title: Cosmos3
       - local: api/pipelines/framepack
         title: Framepack
       - local: api/pipelines/helios

diff --git a/docs/source/en/api/models/cosmos3_omni_transformer.md b/docs/source/en/api/models/cosmos3_omni_transformer.md
@@ -0,0 +1,34 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# Cosmos3OmniTransformer
+
+A Mixture-of-Transformer (MoT) joint vision-language transformer introduced as part of NVIDIA's Cosmos3 world foundation model family. The model runs two parallel computation pathways over a packed joint sequence:
+
+- a **causal "understanding" pathway** that self-attends over text tokens with causal masking, and
+- a **bi-directional "generation" pathway** that cross-attends from generation tokens (vision + optional sound latents) over the full understanding-plus-generation key/value set.
+
+The two pathways share the same hidden size and number of layers but maintain **separate Q/K/V/O projections, MLPs, and RMSNorm parameters**, which is what makes the architecture a Mixture-of-Transformer rather than a standard Mixture-of-Experts. Position information is supplied through a 3D multimodal RoPE (mRoPE) that interleaves temporal / height / width frequencies for video latents and reuses the temporal axis for text and audio.
+
+The model can be loaded as follows.
+
+```python
+import torch
+from diffusers import Cosmos3OmniTransformer
+
+transformer = Cosmos3OmniTransformer.from_pretrained(
+    "nvidia/Cosmos3-Nano", subfolder="transformer", torch_dtype=torch.bfloat16
+)
+```
+
+## Cosmos3OmniTransformer
+
+[[autodoc]] Cosmos3OmniTransformer
diff --git a/docs/source/en/api/pipelines/cosmos3.md b/docs/source/en/api/pipelines/cosmos3.md
diff --git a/docs/source/en/api/pipelines/ltx2.md b/docs/source/en/api/pipelines/ltx2.md
@@ -38,7 +38,7 @@ from diffusers import FlowMatchEulerDiscreteScheduler
 from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
 from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
 from diffusers.pipelines.ltx2.utils import STAGE_2_DISTILLED_SIGMA_VALUES
-from diffusers.pipelines.ltx2.export_utils import encode_video
+from diffusers.utils import encode_video
 
 device = "cuda:0"
 width = 768
@@ -124,7 +124,7 @@ import torch
 from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
 from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
 from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
-from diffusers.pipelines.ltx2.export_utils import encode_video
+from diffusers.utils import encode_video
 
 device = "cuda"
 width = 768
@@ -203,7 +203,7 @@ from diffusers import LTX2ConditionPipeline, LTX2LatentUpsamplePipeline
 from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
 from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
 from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
-from diffusers.pipelines.ltx2.export_utils import encode_video
+from diffusers.utils import encode_video
 from diffusers.utils import load_image
 
 device = "cuda"
@@ -292,7 +292,7 @@ You can use both image and video conditions:
 import torch
 from diffusers import LTX2ConditionPipeline
 from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
-from diffusers.pipelines.ltx2.export_utils import encode_video
+from diffusers.utils import encode_video
 from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
 from diffusers.utils import load_image, load_video
 
@@ -367,7 +367,7 @@ These are controlled by the `guidance_scale`, `stg_scale`, and `modality_scale`
 ```py
 import torch
 from diffusers import LTX2ImageToVideoPipeline
-from diffusers.pipelines.ltx2.export_utils import encode_video
+from diffusers.utils import encode_video
 from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
 from diffusers.utils import load_image
 
@@ -440,7 +440,7 @@ The LTX-2.X models are sensitive to prompting style. Refer to the [official prom
 import torch
 from transformers import Gemma3Processor
 from diffusers import LTX2Pipeline
-from diffusers.pipelines.ltx2.export_utils import encode_video
+from diffusers.utils import encode_video
 from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT, T2V_DEFAULT_SYSTEM_PROMPT
 
 device = "cuda"

diff --git a/docs/source/en/api/utilities.md b/docs/source/en/api/utilities.md
@@ -38,6 +38,10 @@ Utility and helper functions for working with 🤗 Diffusers.
 
 [[autodoc]] utils.export_to_video
 
+## encode_video
+
+[[autodoc]] utils.encode_video
+
 ## make_image_grid
 
 [[autodoc]] utils.make_image_grid

diff --git a/examples/cosmos3/README.md b/examples/cosmos3/README.md
@@ -0,0 +1,63 @@
+# Cosmos3 — smoke-test runner
+
+The canonical reference for `Cosmos3OmniPipeline` lives in the diffusers docs:
+[`docs/source/en/api/pipelines/cosmos3.md`](../../docs/source/en/api/pipelines/cosmos3.md). Use the
+examples there as the source of truth for application code — they cover text-to-image,
+text-to-video, image-to-video, and text+sound modes.
+
+This directory provides a small CLI wrapper (`inference_cosmos3.py`) that exercises the full
+load → encode → denoise → decode path against a Hub release (selected with `--model nano` or
+`--model super`) during development.
+
+## Setup
+
+```bash
+pip install -r examples/cosmos3/requirements.txt
+```
+
+## Usage
+
+Text-to-image:
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --prompt "A medium shot of a modern robotics research laboratory…" \
+    --num-frames 1
+```
+
+Text-to-video:
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --prompt "A waterfall cascading down a rocky cliff in a lush forest."
+```
+
+Image-to-video:
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --prompt "The right robotic hand picks up the red sphere…" \
+    --vision-path https://github.com/nvidia-cosmos/cosmos-dependencies/releases/download/assets/robot_153.jpg
+```
+
+Text-to-video-with-sound (sound-capable checkpoint only):
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --prompt "A waterfall in a lush forest." \
+    --enable-sound
+```
+
+### Useful flags
+
+| Flag | Default | Description |
+|---|---|---|
+| `--prompt` | (required) | Text prompt. |
+| `--vision-path` | `None` | URL or local path for an image-conditioning frame (image-to-video). |
+| `--num-frames` | `189` | `1` = image, otherwise number of video frames (`189` ≈ 7.9 s @ 24 FPS). |
+| `--height` / `--width` | `720` / `1280` | Output resolution (must be a multiple of the VAE spatial scale factor). |
+| `--fps` | `24.0` | Frame rate of the generated video. |
+| `--enable-sound` | off | Generate a synchronized audio track. |
+| `--no-duration-template` | off | Skip the duration metadata sentence appended to the prompt and negative prompt. Ignored for `--num-frames 1`. |
+| `--no-resolution-template` | off | Skip the resolution metadata sentence appended to the prompt and negative prompt. |
+| `--output` | `.` | Directory to write `sample.jpg` or `sample.mp4`. |
diff --git a/examples/cosmos3/inference_cosmos3.py b/examples/cosmos3/inference_cosmos3.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Minimal smoke-test runner for the Cosmos3 diffusers pipeline.
+
+Canonical examples live in the docs page at
+``docs/source/en/api/pipelines/cosmos3.md`` — copy from there for production use.
+This script exists to exercise the full load → encode → denoise → decode path
+during development.
+
+Text-to-image:
+    python inference_cosmos3.py --prompt "A robot in a lab." --num-frames 1
+
+Text-to-video:
+    python inference_cosmos3.py --prompt "A waterfall in a forest."
+
+Image-to-video:
+    python inference_cosmos3.py --prompt "..." --vision-path /path/to/image.jpg
+
+Text-to-video-with-sound (requires a sound-capable checkpoint):
+    python inference_cosmos3.py --prompt "..." --enable-sound
+"""
+
+import argparse
+import pathlib
+
+import torch
+from huggingface_hub import snapshot_download
+
+from diffusers import Cosmos3OmniPipeline
+from diffusers.utils import encode_video, export_to_video, load_image
+
+
+HF_REPOS = {  # --model choice -> Hub repo id handed to snapshot_download in main()
+    "nano": "nvidia/Cosmos3-Nano",
+    "super": "nvidia/Cosmos3-Super",
+}
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--prompt", required=True, help="Text prompt.")
+    parser.add_argument(
+        "--model",
+        choices=sorted(HF_REPOS),
+        default="nano",
+        help="Which Cosmos3 checkpoint to load (maps to the corresponding nvidia/Cosmos3-* repo).",
+    )
+    parser.add_argument(
+        "--vision-path",
+        default=None,
+        help="Optional URL or local path for an image-conditioning frame (enables image-to-video).",
+    )
+    parser.add_argument("--output", default=".", help="Directory to save generated video/image/audio files.")
+    parser.add_argument("--height", type=int, default=720)
+    parser.add_argument("--width", type=int, default=1280)
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=189,
+        help="Number of frames to generate. Use 1 for text-to-image; defaults to 189 for video (≈ 7.9s @ 24 FPS).",
+    )
+    parser.add_argument("--fps", type=float, default=24.0)
+    parser.add_argument(
+        "--enable-sound",
+        action="store_true",
+        default=False,
+        help="Generate sound alongside video (requires a sound-capable checkpoint).",
+    )
+    parser.add_argument(
+        "--no-duration-template",
+        dest="add_duration_template",
+        action="store_false",
+        default=True,
+        help="Skip the duration metadata sentence appended to the prompt and negative prompt (video only).",
+    )
+    parser.add_argument(
+        "--no-resolution-template",
+        dest="add_resolution_template",
+        action="store_false",
+        default=True,
+        help="Skip the resolution metadata sentence appended to the prompt and negative prompt.",
+    )
+    parser.add_argument(
+        "--disable-safety-checker",
+        action="store_true",
+        default=False,
+        help="Disable the Cosmos Guardrail safety checker at pipeline construction (no checker instantiated).",
+    )
+    parser.add_argument(
+        "--no-safety-check",
+        action="store_true",
+        default=False,
+        help="Skip the Cosmos Guardrail text/video safety checks for this call (checker still constructed).",
+    )
+    args = parser.parse_args()
+
+    hf_repo = HF_REPOS[args.model]
+    print(f"Downloading pipeline from {hf_repo}")
+    pipeline_path = pathlib.Path(snapshot_download(repo_id=hf_repo))
+    print(f"Loading pipeline from {pipeline_path} …")
+    pipeline = Cosmos3OmniPipeline.from_pretrained(
+        str(pipeline_path),
+        torch_dtype=torch.bfloat16,
+        device_map="cuda",
+        enable_safety_checker=not args.disable_safety_checker,
+    )
+    print("Pipeline loaded successfully.")
+
+    output_dir = pathlib.Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    image = load_image(args.vision_path) if args.vision_path is not None else None
+
+    result = pipeline(
+        prompt=args.prompt,
+        image=image,
+        num_frames=args.num_frames,
+        height=args.height,
+        width=args.width,
+        fps=args.fps,
+        enable_sound=args.enable_sound,
+        add_resolution_template=args.add_resolution_template,
+        add_duration_template=args.add_duration_template,
+        enable_safety_check=not args.no_safety_check,
+    )
+
+    if args.num_frames == 1:
+        save_path = output_dir / "sample.jpg"
+        result.video[0].save(save_path, format="JPEG", quality=85)
+    else:
+        save_path = output_dir / "sample.mp4"
+        if result.sound is not None:
+            assert pipeline.sound_tokenizer is not None
+            encode_video(
+                result.video,
+                fps=int(args.fps),
+                audio=result.sound,
+                audio_sample_rate=pipeline.sound_tokenizer.config.sampling_rate,
+                output_path=str(save_path),
+            )
+        else:
+            # macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
+            export_to_video(result.video, str(save_path), fps=int(args.fps), quality=10, macro_block_size=1)
+    print(f"Saved: {save_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/cosmos3/requirements.txt b/examples/cosmos3/requirements.txt
@@ -0,0 +1,17 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+torch
+torchvision
+accelerate>=0.31.0
+av
+huggingface_hub
+imageio
+imageio-ffmpeg
+transformers>=4.41.2,<5
+einops
+peft>=0.11.1
+datasets
+numpy
+tqdm
+sentencepiece
+tensorboard
+wandb
-Original file line number
+Diff line change
@@ Expand Up / @@ -170,6 +170,9 @@ tags @@
     # RL pipelines may produce mp4 outputs
     *.mp4
+    *.jpg
+    *.jpeg
+    *.wav
     # dependencies
     /transformers
@@ Expand Down @@