Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions docs/source/en/api/pipelines/cosmos3.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,105 @@ encode_video(
</hfoption>
</hfoptions>

## Action policy

Action policy generation predicts future video and action tokens from the first observation frame, text prompt, and action domain metadata. The example below uses the Bridge robot domain and writes the predicted action chunk to JSON in model-normalized action space.

<hfoptions id="model">
<hfoption id="Nano">

```python
import json

import torch
from diffusers import Cosmos3OmniPipeline
from diffusers.utils import export_to_video, load_video

pipe = Cosmos3OmniPipeline.from_pretrained(
"nvidia/Cosmos3-Nano", torch_dtype=torch.bfloat16, device_map="cuda"
)

prompt = (
"Put the pot to the left of the purple item. This video is captured from a first-person perspective looking "
"at the scene."
)
video = load_video(
"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4"
)

result = pipe(
prompt=prompt,
video=video,
num_frames=17,
height=480,
width=832,
fps=5,
num_inference_steps=30,
guidance_scale=1.0,
action_mode="policy",
action_chunk_size=16,
raw_action_dim=10,
domain_name="bridge_orig_lerobot",
use_system_prompt=False,
)

# macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
export_to_video(result.video, "sample.mp4", fps=5, macro_block_size=1)

if result.action is not None:
with open("sample_action.json", "w") as f:
json.dump(result.action[0].tolist(), f)
```

</hfoption>
<hfoption id="Super">

```python
import json

import torch
from diffusers import Cosmos3OmniPipeline
from diffusers.utils import export_to_video, load_video

pipe = Cosmos3OmniPipeline.from_pretrained(
"nvidia/Cosmos3-Super", torch_dtype=torch.bfloat16, device_map="cuda"
)

prompt = (
"Put the pot to the left of the purple item. This video is captured from a first-person perspective looking "
"at the scene."
)
video = load_video(
"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4"
)

result = pipe(
prompt=prompt,
video=video,
num_frames=17,
height=480,
width=832,
fps=5,
num_inference_steps=30,
guidance_scale=1.0,
action_mode="policy",
action_chunk_size=16,
raw_action_dim=10,
domain_name="bridge_orig_lerobot",
use_system_prompt=False,
)

# macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
export_to_video(result.video, "sample.mp4", fps=5, macro_block_size=1)

if result.action is not None:
with open("sample_action.json", "w") as f:
json.dump(result.action[0].tolist(), f)
```

</hfoption>
</hfoptions>

## Metadata templates

`tokenize_prompt` appends short metadata sentences inside the user message so the LLM sees the conditioning the model was trained with. The positive prompt gets sentences like *"The video is 7.9 seconds long and is of 24 FPS."* and *"This video is of 720x1280 resolution."*; the negative prompt gets the inverse (*"… is not …"*).
Expand Down
103 changes: 103 additions & 0 deletions examples/cosmos3/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,104 @@ python examples/cosmos3/inference_cosmos3.py \
--enable-sound
```

Action forward dynamics, robot domain (predict video from an observation video and a provided action chunk):

```bash
python examples/cosmos3/inference_cosmos3.py \
--model nano \
--prompt "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene." \
--vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4" \
--action-mode forward_dynamics \
--action-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json" \
--action-chunk-size 16 \
--domain-name bridge_orig_lerobot \
--height 480 --width 832 --fps 5 \
--num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
--output results/cosmos3_forward_dynamics_robot
```

Action forward dynamics, autonomous-vehicle domain:

```bash
python examples/cosmos3/inference_cosmos3.py \
--model nano \
--prompt "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene." \
--vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4" \
--action-mode forward_dynamics \
--action-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_action_25.json" \
--action-chunk-size 60 \
--domain-name av \
--height 480 --width 832 --fps 10 \
--num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
--output results/cosmos3_forward_dynamics_av
```

Action inverse dynamics, robot domain (predict actions from an observed video):

```bash
python examples/cosmos3/inference_cosmos3.py \
--model nano \
--prompt "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene." \
--vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4" \
--action-mode inverse_dynamics \
--action-chunk-size 16 \
--raw-action-dim 10 \
--domain-name bridge_orig_lerobot \
--height 480 --width 832 --fps 5 \
--num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
--output results/cosmos3_inverse_dynamics_robot
```

Action inverse dynamics, autonomous-vehicle domain:

```bash
python examples/cosmos3/inference_cosmos3.py \
--model nano \
--prompt "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene." \
--vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4" \
--action-mode inverse_dynamics \
--action-chunk-size 60 \
--raw-action-dim 9 \
--domain-name av \
--height 480 --width 832 --fps 10 \
--num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
--output results/cosmos3_inverse_dynamics_av
```

Action policy, robot domain (predict both future video and actions from the first observation frame):

```bash
python examples/cosmos3/inference_cosmos3.py \
--model nano \
--prompt "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene." \
--vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4" \
--action-mode policy \
--action-chunk-size 16 \
--raw-action-dim 10 \
--domain-name bridge_orig_lerobot \
--height 480 --width 832 --fps 5 \
--num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
--output results/cosmos3_policy_robot
```

Action policy, autonomous-vehicle domain:

```bash
python examples/cosmos3/inference_cosmos3.py \
--model nano \
--prompt "You are an autonomous vehicle planning system. Please go backward. This video is captured from a first-person perspective looking at the scene." \
--vision-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4" \
--action-mode policy \
--action-chunk-size 60 \
--raw-action-dim 9 \
--domain-name av \
--height 480 --width 832 --fps 10 \
--num-inference-steps 30 --guidance-scale 1.0 --flow-shift 5.0 --seed 0 \
--output results/cosmos3_policy_av
```

Action modes use `action_chunk_size + 1` video frames. `forward_dynamics` consumes `--action-path`; `inverse_dynamics` and `policy` write predicted actions to `sample_action.json` in the `--output` directory, in model-normalized action space. The upstream camera-pose forward-dynamics sample uses a still image (`mountain_720.png`), while this wrapper currently expects `--vision-path` to load as video for action modes.

### Useful flags

| Flag | Default | Description |
Expand All @@ -58,6 +156,11 @@ python examples/cosmos3/inference_cosmos3.py \
| `--height` / `--width` | `720` / `1280` | Output resolution (must be a multiple of the VAE spatial scale factor). |
| `--fps` | `24.0` | Frame rate of the generated video. |
| `--enable-sound` | off | Generate a synchronized audio track. |
| `--action-mode` | `None` | Enable action conditioning/generation. One of `forward_dynamics`, `inverse_dynamics`, or `policy`. |
| `--action-path` | `None` | URL or local JSON action path for `forward_dynamics`. |
| `--action-chunk-size` | `None` | Number of action tokens. Action runs generate/use `action_chunk_size + 1` video frames. |
| `--domain-name` | `None` | Action embodiment domain, for example `bridge_orig_lerobot` or `av`. |
| `--raw-action-dim` | `None` | Slice predicted action output to the unpadded action dimension. Required for `inverse_dynamics` and `policy`. |
| `--no-duration-template` | off | Skip the duration metadata sentence appended to the prompt and negative prompt. Ignored for `--num-frames 1`. |
| `--no-resolution-template` | off | Skip the resolution metadata sentence appended to the prompt and negative prompt. |
| `--output` | `.` | Directory to write `sample.jpg` or `sample.mp4`, plus `sample_action.json` when an action mode returns predicted actions. |
103 changes: 86 additions & 17 deletions examples/cosmos3/inference_cosmos3.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@
"""

import argparse
import json
import pathlib
import urllib.request

import torch
from huggingface_hub import snapshot_download

from diffusers import Cosmos3OmniPipeline
from diffusers.utils import encode_video, export_to_video, load_image
from diffusers.utils import encode_video, export_to_video, load_image, load_video


HF_REPOS = {
Expand All @@ -38,6 +40,22 @@
}


def _load_action(path: str | None):
if path is None:
raise ValueError("--action-path is required for forward_dynamics mode.")
if path.startswith(("http://", "https://")):
with urllib.request.urlopen(path) as response:
action = json.loads(response.read().decode("utf-8"))
else:
action = json.loads(pathlib.Path(path).read_text())
tensor = torch.as_tensor(action, dtype=torch.float32)
if tensor.ndim == 3 and tensor.shape[0] == 1:
tensor = tensor.squeeze(0)
if tensor.ndim != 2:
raise ValueError(f"Cosmos3 action must have shape [T, D], got {tuple(tensor.shape)}.")
return tensor


def main():
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--prompt", required=True, help="Text prompt.")
Expand All @@ -50,7 +68,7 @@ def main():
parser.add_argument(
"--vision-path",
default=None,
help="Optional URL or local path for an image-conditioning frame (enables image-to-video).",
help="Optional URL or local path for an image-conditioning frame, or an action conditioning video.",
)
parser.add_argument("--output", default=".", help="Directory to save generated video/image/audio files.")
parser.add_argument("--height", type=int, default=720)
Expand All @@ -62,12 +80,25 @@ def main():
help="Number of frames to generate. Use 1 for text-to-image; defaults to 189 for video (≈ 7.9s @ 24 FPS).",
)
parser.add_argument("--fps", type=float, default=24.0)
parser.add_argument("--guidance-scale", type=float, default=6.0, help="Classifier-free guidance scale.")
parser.add_argument("--num-inference-steps", type=int, default=35, help="Number of denoising steps.")
parser.add_argument("--seed", type=int, default=None, help="Random seed for latent initialization.")
parser.add_argument(
"--enable-sound",
action="store_true",
default=False,
help="Generate sound alongside video (requires a sound-capable checkpoint).",
)
parser.add_argument(
"--action-mode",
choices=["forward_dynamics", "inverse_dynamics", "policy"],
default=None,
help="Enable Cosmos3 action generation with a loaded conditioning video.",
)
parser.add_argument("--action-path", default=None, help="JSON action path for forward_dynamics mode.")
parser.add_argument("--action-chunk-size", type=int, default=None, help="Number of action tokens to generate/use.")
parser.add_argument("--domain-name", default=None, help="Cosmos3 action embodiment domain name.")
parser.add_argument("--raw-action-dim", type=int, default=None, help="Slice predicted action output to this size.")
parser.add_argument(
"--no-duration-template",
dest="add_duration_template",
Expand Down Expand Up @@ -110,21 +141,52 @@ def main():

output_dir = pathlib.Path(args.output)
output_dir.mkdir(parents=True, exist_ok=True)

image = load_image(args.vision_path) if args.vision_path is not None else None

result = pipeline(
prompt=args.prompt,
image=image,
num_frames=args.num_frames,
height=args.height,
width=args.width,
fps=args.fps,
enable_sound=args.enable_sound,
add_resolution_template=args.add_resolution_template,
add_duration_template=args.add_duration_template,
enable_safety_check=not args.no_safety_check,
)
generator = torch.Generator().manual_seed(args.seed) if args.seed is not None else None

if args.action_mode is not None:
if args.vision_path is None:
raise ValueError("--vision-path must point to a video for action modes.")
if args.action_chunk_size is None:
raise ValueError("--action-chunk-size is required for action modes.")
video = load_video(args.vision_path)
action = _load_action(args.action_path) if args.action_mode == "forward_dynamics" else None
result = pipeline(
prompt=args.prompt,
video=video,
num_frames=args.action_chunk_size + 1,
height=args.height,
width=args.width,
fps=args.fps,
num_inference_steps=args.num_inference_steps,
action_mode=args.action_mode,
raw_actions=action,
action_chunk_size=args.action_chunk_size,
domain_name=args.domain_name,
raw_action_dim=args.raw_action_dim,
guidance_scale=args.guidance_scale,
generator=generator,
use_system_prompt=False,
add_resolution_template=args.add_resolution_template,
add_duration_template=args.add_duration_template,
enable_safety_check=not args.no_safety_check,
)
else:
image = load_image(args.vision_path) if args.vision_path is not None else None
result = pipeline(
prompt=args.prompt,
image=image,
num_frames=args.num_frames,
height=args.height,
width=args.width,
fps=args.fps,
num_inference_steps=args.num_inference_steps,
enable_sound=args.enable_sound,
guidance_scale=args.guidance_scale,
generator=generator,
add_resolution_template=args.add_resolution_template,
add_duration_template=args.add_duration_template,
enable_safety_check=not args.no_safety_check,
)

if args.num_frames == 1:
save_path = output_dir / "sample.jpg"
Expand All @@ -145,6 +207,13 @@ def main():
export_to_video(result.video, str(save_path), fps=int(args.fps), quality=10, macro_block_size=1)
print(f"Saved: {save_path}")

if result.action is not None:
for action in result.action:
action_path = output_dir / "sample_action.json"
with open(action_path, "w") as f:
json.dump(action.tolist(), f)
print(f"Saved: {action_path}")


if __name__ == "__main__":
main()
Loading
Loading