@@ -17,6 +17,7 @@
 
 import datasets
 import datasets.distributed
+import torch
 from torch.utils.data import DataLoader, DistributedSampler
 from torchdata.stateful_dataloader import StatefulDataLoader
 from transformers import AutoTokenizer
@@ -306,3 +307,75 @@ def create_thd_dataloader( |
     )
 
     return train_dataloader, tokenized_dataset
+
+
+class MockTokenDataset(torch.utils.data.Dataset):
+    """Dataset that generates random token sequences for benchmarking.
+
+    All sequences have the same fixed length, so no padding is needed.
+
+    Args:
+        vocab_size: Vocabulary size for random token generation.
+        seq_length: Length of each generated sequence.
+        num_samples: Total number of samples in the dataset.
+    """
+
+    def __init__(self, vocab_size: int, seq_length: int, num_samples: int):
+        """Initialize the mock dataset."""
+        self.vocab_size = vocab_size
+        self.seq_length = seq_length
+        self.num_samples = num_samples
+
+    def __len__(self):
+        """Return the number of samples."""
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        """Return a random token sequence."""
+        input_ids = torch.randint(0, self.vocab_size, (self.seq_length,))
+        return {"input_ids": input_ids}
+
+
+def _mock_collator(features: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
+    """Collator for MockTokenDataset that stacks fixed-length sequences into a batch."""
+    input_ids = torch.stack([f["input_ids"] for f in features])
+    return {"input_ids": input_ids, "labels": input_ids.clone(), "attention_mask": torch.ones_like(input_ids)}
+
+
+def create_mock_dataloader(
+    distributed_config: DistributedConfig,
+    micro_batch_size: int,
+    max_seq_length: int,
+    vocab_size: int = 128256,
+    num_samples: int = 100_000,
+    **kwargs,
+):
+    """Create a mock dataloader with random tokens for benchmarking.
+
+    Args:
+        distributed_config: The distributed configuration.
+        micro_batch_size: The batch size per device.
+        max_seq_length: The sequence length of each generated sample.
+        vocab_size: Vocabulary size for random token generation. Defaults to Llama 3 vocab size.
+        num_samples: Total number of samples in the dataset.
+        **kwargs: Ignored extra arguments for compatibility with other dataloader configs.
+
+    Returns:
+        A tuple of (dataloader, sampler).
+    """
+    dataset = MockTokenDataset(vocab_size, max_seq_length, num_samples)
+    sampler = DistributedSampler(
+        dataset,
+        rank=distributed_config.rank,
+        num_replicas=distributed_config.world_size,
+        seed=42,
+    )
+    train_dataloader = DataLoader(
+        dataset,
+        batch_size=micro_batch_size,
+        sampler=sampler,
+        collate_fn=_mock_collator,
+        num_workers=0,
+        pin_memory=True,
+    )
+    return train_dataloader, sampler
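
Since the new helper only reads `rank` and `world_size` off the distributed config, it is easy to smoke-test in a single process. Below is a minimal sketch; the `DistributedConfig` dataclass in it is a hypothetical stand-in for the project's real config object, and the expected shapes follow from the collator added in this diff.

```python
import torch
from dataclasses import dataclass


# NOTE: hypothetical stand-in for the repo's real DistributedConfig;
# this code path only needs it to expose `rank` and `world_size`.
@dataclass
class DistributedConfig:
    rank: int = 0
    world_size: int = 1


dataloader, sampler = create_mock_dataloader(
    distributed_config=DistributedConfig(),
    micro_batch_size=4,
    max_seq_length=2048,
)

for epoch in range(2):
    # DistributedSampler only reshuffles across epochs if set_epoch is called.
    sampler.set_epoch(epoch)
    batch = next(iter(dataloader))
    assert batch["input_ids"].shape == (4, 2048)
    assert torch.equal(batch["labels"], batch["input_ids"])
    assert batch["attention_mask"].all()
```

One consequence of this design: because `__getitem__` draws fresh random tokens on every access, the sampler's fixed seed makes the index order reproducible but not the token values. For throughput benchmarking that trade-off is usually acceptable, since no dataloader state needs to be checkpointed or replayed exactly.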