huggingface · yzhautouskay · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/docs/source/en/api/pipelines/cosmos3.md b/docs/source/en/api/pipelines/cosmos3.md
@@ -459,6 +459,105 @@ encode_video(
 </hfoption>
 </hfoptions>
 
+## Action policy
+
+Action policy generation predicts future video and action tokens from the first observation frame, text prompt, and action domain metadata. The example below uses the Bridge robot domain and writes the predicted action chunk to JSON in model-normalized action space.
+
+<hfoptions id="model">
+<hfoption id="Nano">
+
+```python
+import json
+
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.utils import export_to_video, load_video
+
+pipe = Cosmos3OmniPipeline.from_pretrained(
+    "nvidia/Cosmos3-Nano", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+
+prompt = (
+    "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking "
+    "at the scene."
+)
+video = load_video(
+    "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4"
+)
+
+result = pipe(
+    prompt=prompt,
+    video=video,
+    num_frames=17,  # action_chunk_size + 1
+    height=480,
+    width=832,
+    fps=5,
+    num_inference_steps=30,
+    guidance_scale=1.0,
+    action_mode="policy",
+    action_chunk_size=16,  # number of action tokens to predict
+    raw_action_dim=10,  # slice the predicted action to the unpadded action dimension
+    domain_name="bridge_orig_lerobot",  # action embodiment domain (Bridge robot)
+    use_system_prompt=False,
+)
+
+# macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
+export_to_video(result.video, "sample.mp4", fps=5, macro_block_size=1)
+
+if result.action is not None:
+    with open("sample_action.json", "w") as f:
+        json.dump(result.action[0].tolist(), f)  # first entry of result.action only, model-normalized space
+```
+
+</hfoption>
+<hfoption id="Super">
+
+```python
+import json
+
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.utils import export_to_video, load_video
+
+pipe = Cosmos3OmniPipeline.from_pretrained(
+    "nvidia/Cosmos3-Super", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+
+prompt = (
+    "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking "
+    "at the scene."
+)
+video = load_video(
+    "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4"
+)
+
+result = pipe(
+    prompt=prompt,
+    video=video,
+    num_frames=17,  # action_chunk_size + 1
+    height=480,
+    width=832,
+    fps=5,
+    num_inference_steps=30,
+    guidance_scale=1.0,
+    action_mode="policy",
+    action_chunk_size=16,  # number of action tokens to predict
+    raw_action_dim=10,  # slice the predicted action to the unpadded action dimension
+    domain_name="bridge_orig_lerobot",  # action embodiment domain (Bridge robot)
+    use_system_prompt=False,
+)
+
+# macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
+export_to_video(result.video, "sample.mp4", fps=5, macro_block_size=1)
+
+if result.action is not None:
+    with open("sample_action.json", "w") as f:
+        json.dump(result.action[0].tolist(), f)  # first entry of result.action only, model-normalized space
+```
+
+</hfoption>
+</hfoptions>
+
 ## Metadata templates
 
 `tokenize_prompt` appends short metadata sentences inside the user message so the LLM sees the conditioning the model was trained with. The positive prompt gets sentences like *"The video is 7.9 seconds long and is of 24 FPS."* and *"This video is of 720x1280 resolution."*; the negative prompt gets the inverse (*"… is not …"*).

diff --git a/examples/cosmos3/README.md b/examples/cosmos3/README.md
@@ -48,6 +48,104 @@ python examples/cosmos3/inference_cosmos3.py \
     --enable-sound
 ```
 
+Action forward dynamics, robot domain (predict video from an observation video and a provided action chunk):
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --model nano \
+    --prompt "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene." \
+    --vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4" \
+    --action-mode forward_dynamics \
+    --action-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json" \
+    --action-chunk-size 16 \
+    --domain-name bridge_orig_lerobot \
+    --height 480 --width 832 --fps 5 \
+    --num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
+    --output results/cosmos3_forward_dynamics_robot
+```
+
+Action forward dynamics, autonomous-vehicle domain:
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --model nano \
+    --prompt "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene." \
+    --vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4" \
+    --action-mode forward_dynamics \
+    --action-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_action_25.json" \
+    --action-chunk-size 60 \
+    --domain-name av \
+    --height 480 --width 832 --fps 10 \
+    --num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
+    --output results/cosmos3_forward_dynamics_av
+```
+
+Action inverse dynamics, robot domain (predict actions from an observed video):
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --model nano \
+    --prompt "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene." \
+    --vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4" \
+    --action-mode inverse_dynamics \
+    --action-chunk-size 16 \
+    --raw-action-dim 10 \
+    --domain-name bridge_orig_lerobot \
+    --height 480 --width 832 --fps 5 \
+    --num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
+    --output results/cosmos3_inverse_dynamics_robot
+```
+
+Action inverse dynamics, autonomous-vehicle domain:
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --model nano \
+    --prompt "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene." \
+    --vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4" \
+    --action-mode inverse_dynamics \
+    --action-chunk-size 60 \
+    --raw-action-dim 9 \
+    --domain-name av \
+    --height 480 --width 832 --fps 10 \
+    --num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
+    --output results/cosmos3_inverse_dynamics_av
+```
+
+Action policy, robot domain (predict both future video and actions from the first observation frame):
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --model nano \
+    --prompt "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene." \
+    --vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4" \
+    --action-mode policy \
+    --action-chunk-size 16 \
+    --raw-action-dim 10 \
+    --domain-name bridge_orig_lerobot \
+    --height 480 --width 832 --fps 5 \
+    --num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
+    --output results/cosmos3_policy_robot
+```
+
+Action policy, autonomous-vehicle domain:
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+    --model nano \
+    --prompt "You are an autonomous vehicle planning system. Please go backward. This video is captured from a first-person perspective looking at the scene." \
+    --vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4" \
+    --action-mode policy \
+    --action-chunk-size 60 \
+    --raw-action-dim 9 \
+    --domain-name av \
+    --height 480 --width 832 --fps 10 \
+    --num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
+    --output results/cosmos3_policy_av
+```
+
+Action modes use `action_chunk_size + 1` video frames. `forward_dynamics` consumes `--action-path`; `inverse_dynamics` and `policy` write predicted actions to `sample_action.json` in the output directory, in model-normalized action space. The upstream camera-pose forward-dynamics sample uses a still image (`mountain_720.png`), while this wrapper currently expects `--vision-path` to load as video for action modes.
+
 ### Useful flags
 
 | Flag | Default | Description |
@@ -58,6 +156,11 @@ python examples/cosmos3/inference_cosmos3.py \
 | `--height` / `--width` | `720` / `1280` | Output resolution (must be a multiple of the VAE spatial scale factor). |
 | `--fps` | `24.0` | Frame rate of the generated video. |
 | `--enable-sound` | off | Generate a synchronized audio track. |
+| `--action-mode` | `None` | Enable action conditioning/generation. One of `forward_dynamics`, `inverse_dynamics`, or `policy`. |
+| `--action-path` | `None` | URL or local JSON action path for `forward_dynamics`. |
+| `--action-chunk-size` | `None` | Number of action tokens. Action runs generate/use `action_chunk_size + 1` video frames. |
+| `--domain-name` | `None` | Action embodiment domain, for example `bridge_orig_lerobot` or `av`. |
+| `--raw-action-dim` | `None` | Slice predicted action output to the unpadded action dimension. Required for `inverse_dynamics` and `policy`. |
 | `--no-duration-template` | off | Skip the duration metadata sentence appended to the prompt and negative prompt. Ignored for `--num-frames 1`. |
 | `--no-resolution-template` | off | Skip the resolution metadata sentence appended to the prompt and negative prompt. |
 | `--output` | `.` | Directory to write `sample.jpg` or `sample.mp4`. |
diff --git a/examples/cosmos3/inference_cosmos3.py b/examples/cosmos3/inference_cosmos3.py
@@ -23,13 +23,15 @@
 """
 
 import argparse
+import json
 import pathlib
+import urllib.request
 
 import torch
 from huggingface_hub import snapshot_download
 
 from diffusers import Cosmos3OmniPipeline
-from diffusers.utils import encode_video, export_to_video, load_image
+from diffusers.utils import encode_video, export_to_video, load_image, load_video
 
 
 HF_REPOS = {
@@ -38,6 +40,22 @@
 }
 
 
+def _load_action(path: str | None) -> torch.Tensor:  # load a [T, D] float32 action tensor from URL or local JSON
+    if path is None:
+        raise ValueError("--action-path is required for forward_dynamics mode.")
+    if path.startswith(("http://", "https://")):  # remote JSON: fetch the body and decode it as UTF-8
+        with urllib.request.urlopen(path) as response:
+            action = json.loads(response.read().decode("utf-8"))
+    else:  # anything that is not an http(s) URL is read as a local file path
+        action = json.loads(pathlib.Path(path).read_text())
+    tensor = torch.as_tensor(action, dtype=torch.float32)
+    if tensor.ndim == 3 and tensor.shape[0] == 1:  # [1, T, D] -> [T, D] (leading dim presumably batch)
+        tensor = tensor.squeeze(0)
+    if tensor.ndim != 2:  # any other rank is rejected rather than reshaped
+        raise ValueError(f"Cosmos3 action must have shape [T, D], got {tuple(tensor.shape)}.")
+    return tensor
+
+
 def main():
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument("--prompt", required=True, help="Text prompt.")
@@ -50,7 +68,7 @@ def main():
     parser.add_argument(
         "--vision-path",
         default=None,
-        help="Optional URL or local path for an image-conditioning frame (enables image-to-video).",
+        help="Optional URL or local path for an image-conditioning frame, or an action conditioning video.",
     )
     parser.add_argument("--output", default=".", help="Directory to save generated video/image/audio files.")
     parser.add_argument("--height", type=int, default=720)
@@ -62,12 +80,25 @@ def main():
         help="Number of frames to generate. Use 1 for text-to-image; defaults to 189 for video (≈ 7.9s @ 24 FPS).",
     )
     parser.add_argument("--fps", type=float, default=24.0)
+    parser.add_argument("--guidance-scale", type=float, default=6.0, help="Classifier-free guidance scale.")
+    parser.add_argument("--num-inference-steps", type=int, default=35, help="Number of denoising steps.")
+    parser.add_argument("--seed", type=int, default=None, help="Random seed for latent initialization.")
     parser.add_argument(
         "--enable-sound",
         action="store_true",
         default=False,
         help="Generate sound alongside video (requires a sound-capable checkpoint).",
     )
+    parser.add_argument(
+        "--action-mode",
+        choices=["forward_dynamics", "inverse_dynamics", "policy"],
+        default=None,
+        help="Enable Cosmos3 action generation with a loaded conditioning video.",
+    )
+    parser.add_argument("--action-path", default=None, help="JSON action path for forward_dynamics mode.")
+    parser.add_argument("--action-chunk-size", type=int, default=None, help="Number of action tokens to generate/use.")
+    parser.add_argument("--domain-name", default=None, help="Cosmos3 action embodiment domain name.")
+    parser.add_argument("--raw-action-dim", type=int, default=None, help="Slice predicted action output to this size.")
     parser.add_argument(
         "--no-duration-template",
         dest="add_duration_template",
@@ -110,21 +141,52 @@ def main():
 
     output_dir = pathlib.Path(args.output)
     output_dir.mkdir(parents=True, exist_ok=True)
-
-    image = load_image(args.vision_path) if args.vision_path is not None else None
-
-    result = pipeline(
-        prompt=args.prompt,
-        image=image,
-        num_frames=args.num_frames,
-        height=args.height,
-        width=args.width,
-        fps=args.fps,
-        enable_sound=args.enable_sound,
-        add_resolution_template=args.add_resolution_template,
-        add_duration_template=args.add_duration_template,
-        enable_safety_check=not args.no_safety_check,
-    )
+    generator = torch.Generator().manual_seed(args.seed) if args.seed is not None else None
+
+    if args.action_mode is not None:
+        if args.vision_path is None:
+            raise ValueError("--vision-path must point to a video for action modes.")
+        if args.action_chunk_size is None:
+            raise ValueError("--action-chunk-size is required for action modes.")
+        video = load_video(args.vision_path)
+        action = _load_action(args.action_path) if args.action_mode == "forward_dynamics" else None
+        result = pipeline(
+            prompt=args.prompt,
+            video=video,
+            num_frames=args.action_chunk_size + 1,
+            height=args.height,
+            width=args.width,
+            fps=args.fps,
+            num_inference_steps=args.num_inference_steps,
+            action_mode=args.action_mode,
+            raw_actions=action,
+            action_chunk_size=args.action_chunk_size,
+            domain_name=args.domain_name,
+            raw_action_dim=args.raw_action_dim,
+            guidance_scale=args.guidance_scale,
+            generator=generator,
+            use_system_prompt=False,
+            add_resolution_template=args.add_resolution_template,
+            add_duration_template=args.add_duration_template,
+            enable_safety_check=not args.no_safety_check,
+        )
+    else:
+        image = load_image(args.vision_path) if args.vision_path is not None else None
+        result = pipeline(
+            prompt=args.prompt,
+            image=image,
+            num_frames=args.num_frames,
+            height=args.height,
+            width=args.width,
+            fps=args.fps,
+            num_inference_steps=args.num_inference_steps,
+            enable_sound=args.enable_sound,
+            guidance_scale=args.guidance_scale,
+            generator=generator,
+            add_resolution_template=args.add_resolution_template,
+            add_duration_template=args.add_duration_template,
+            enable_safety_check=not args.no_safety_check,
+        )
 
     if args.num_frames == 1:
         save_path = output_dir / "sample.jpg"
@@ -145,6 +207,13 @@ def main():
             export_to_video(result.video, str(save_path), fps=int(args.fps), quality=10, macro_block_size=1)
     print(f"Saved: {save_path}")
 
+    if result.action is not None:
+        for action in result.action:
+            action_path = output_dir / "sample_action.json"
+            with open(action_path, "w") as f:
+                json.dump(action.tolist(), f)
+            print(f"Saved: {action_path}")
+
 
 if __name__ == "__main__":
     main()