
Commit 9c7d191

refactoring generation tests
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 7c33578 commit 9c7d191

9 files changed

Lines changed: 529 additions & 176 deletions


bionemo-recipes/models/esm2/tests/common/test_modeling_common.py

Lines changed: 125 additions & 0 deletions
@@ -82,6 +82,10 @@ class BaseModelTest(ABC):
     Subclasses must implement all abstract methods to provide model-specific
     configuration, data preparation, and conversion functions.
 
+    Set ``is_autoregressive = True`` in subclasses for causal LM models to
+    enable generation / KV-cache smoke tests. Non-autoregressive models
+    (e.g. ESM2) leave the default ``False`` and those tests are skipped.
+
     Example:
     ```python
     class ESM2ModelTester(BioNeMoModelTester):
@@ -98,6 +102,8 @@ def get_upstream_model_id(self):
     ```
     """
 
+    is_autoregressive: bool = False
+
     @abstractmethod
     def get_model_class(self) -> Type[PreTrainedModel]:
         """Return the TransformerEngine model class to test.
@@ -980,4 +986,123 @@ def test_meta_fp8_init(self, fp8_recipe):
         model.init_empty_weights()
         self.verify_model_parameters_initialized_correctly(model, should_be_fp8=True)
 
+    # ==================== Generation Tests (Autoregressive Models Only) ====================
+
+    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
+        """Create inference params for KV-cache generation tests.
+
+        Autoregressive model tests must override this method to provide
+        model-specific ``HFInferenceParams`` with allocated KV-cache memory.
+
+        Args:
+            config: Model configuration.
+            batch_size: Batch size.
+            max_seq_len: Maximum sequence length.
+            num_beams: Number of beams for beam search.
+
+        Returns:
+            HFInferenceParams instance with allocated memory.
+        """
+        raise NotImplementedError(
+            "Autoregressive models must override _create_inference_params to provide model-specific HFInferenceParams."
+        )
+
+    def test_generate_without_cache(self):
+        """Test basic generation without KV-cache (BSHD, use_cache=False)."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompt = "The quick brown fox jumps over"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=False)
+
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
+    def test_generate_with_cache(self):
+        """Test single-prompt generation with KV-cache (THD format)."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompt = "The quick brown fox jumps over"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        past_key_values = self._create_inference_params(config, batch_size=1)
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)
+
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
+    def test_generate_with_cache_batched(self):
+        """Test batched generation with KV-cache (left-padded BSHD converted to THD)."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompts = (
+            "The quick brown fox jumps over the lazy dog.",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        )
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        past_key_values = self._create_inference_params(config, batch_size=2)
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)
+
+        assert output_ids.shape[0] == 2
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
+    def test_generate_with_cache_beam_search(self):
+        """Test batched generation with KV-cache and beam search."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompts = (
+            "The quick brown fox jumps over the lazy dog.",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        )
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        num_beams = 2
+        past_key_values = self._create_inference_params(config, batch_size=2, num_beams=num_beams)
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=16,
+                use_cache=True,
+                past_key_values=past_key_values,
+                num_beams=num_beams,
+                do_sample=True,
+            )
+
+        assert output_ids.shape[0] == 2
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
     # TODO: add multi-GPU tests, e.g., meta-device init after fully_shard, cp tests, etc.
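For reference, the opt-in pattern described in the docstring above works roughly as sketched below. The class name and the import of `BaseModelTest` are illustrative (not part of this diff); the concrete LLaMA3 override appears later in this commit.

```python
# Hedged sketch: a causal-LM tester opts into the shared generation / KV-cache
# smoke tests, while encoder-style testers (e.g. ESM2) keep the default False
# and the tests skip themselves via pytest.skip. Import of BaseModelTest omitted.
class MyCausalLMTester(BaseModelTest):  # hypothetical subclass name
    is_autoregressive = True  # enables the four generation tests above

    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
        # Must return an HFInferenceParams with KV-cache memory allocated for
        # every layer; see the LLaMA3-specific override later in this commit.
        ...
```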

bionemo-recipes/models/llama3/tests/common/test_modeling_common.py

Lines changed: 125 additions & 0 deletions
@@ -82,6 +82,10 @@ class BaseModelTest(ABC):
     Subclasses must implement all abstract methods to provide model-specific
     configuration, data preparation, and conversion functions.
 
+    Set ``is_autoregressive = True`` in subclasses for causal LM models to
+    enable generation / KV-cache smoke tests. Non-autoregressive models
+    (e.g. ESM2) leave the default ``False`` and those tests are skipped.
+
     Example:
     ```python
     class ESM2ModelTester(BioNeMoModelTester):
@@ -98,6 +102,8 @@ def get_upstream_model_id(self):
     ```
     """
 
+    is_autoregressive: bool = False
+
     @abstractmethod
     def get_model_class(self) -> Type[PreTrainedModel]:
         """Return the TransformerEngine model class to test.
@@ -980,4 +986,123 @@ def test_meta_fp8_init(self, fp8_recipe):
         model.init_empty_weights()
         self.verify_model_parameters_initialized_correctly(model, should_be_fp8=True)
 
+    # ==================== Generation Tests (Autoregressive Models Only) ====================
+
+    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
+        """Create inference params for KV-cache generation tests.
+
+        Autoregressive model tests must override this method to provide
+        model-specific ``HFInferenceParams`` with allocated KV-cache memory.
+
+        Args:
+            config: Model configuration.
+            batch_size: Batch size.
+            max_seq_len: Maximum sequence length.
+            num_beams: Number of beams for beam search.
+
+        Returns:
+            HFInferenceParams instance with allocated memory.
+        """
+        raise NotImplementedError(
+            "Autoregressive models must override _create_inference_params to provide model-specific HFInferenceParams."
+        )
+
+    def test_generate_without_cache(self):
+        """Test basic generation without KV-cache (BSHD, use_cache=False)."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompt = "The quick brown fox jumps over"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=False)
+
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
+    def test_generate_with_cache(self):
+        """Test single-prompt generation with KV-cache (THD format)."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompt = "The quick brown fox jumps over"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        past_key_values = self._create_inference_params(config, batch_size=1)
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)
+
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
+    def test_generate_with_cache_batched(self):
+        """Test batched generation with KV-cache (left-padded BSHD converted to THD)."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompts = (
+            "The quick brown fox jumps over the lazy dog.",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        )
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        past_key_values = self._create_inference_params(config, batch_size=2)
+
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)
+
+        assert output_ids.shape[0] == 2
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
+    def test_generate_with_cache_beam_search(self):
+        """Test batched generation with KV-cache and beam search."""
+        if not self.is_autoregressive:
+            pytest.skip("Not an autoregressive model")
+
+        config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
+        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model.eval()
+
+        tokenizer = self.get_tokenizer()
+        prompts = (
+            "The quick brown fox jumps over the lazy dog.",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        )
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        num_beams = 2
+        past_key_values = self._create_inference_params(config, batch_size=2, num_beams=num_beams)
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=16,
+                use_cache=True,
+                past_key_values=past_key_values,
+                num_beams=num_beams,
+                do_sample=True,
+            )
+
+        assert output_ids.shape[0] == 2
+        assert output_ids.shape[1] > inputs["input_ids"].shape[1]
+
     # TODO: add multi-GPU tests, e.g., meta-device init after fully_shard, cp tests, etc.

bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py

Lines changed: 19 additions & 1 deletion
@@ -49,6 +49,8 @@ class TestLlama3Model(BaseModelTest):
     This class provides LLaMA3-specific configuration for the common test suite.
     """
 
+    is_autoregressive = True
+
     def get_model_class(self) -> Type[PreTrainedModel]:
         """Return the LLaMA3 TE model class."""
         return NVLlamaForCausalLM
@@ -138,7 +140,23 @@ def get_tolerances(self) -> TestTolerances:
             cp_loss_rtol=0.25,
         )
 
-    # ==================== LLaMA3-Specific Tests ====================
+    # ==================== LLaMA3-Specific Overrides ====================
+
+    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
+        """Create HFInferenceParams for the given config."""
+        past_key_values = HFInferenceParams(
+            max_batch_size=batch_size * num_beams,
+            max_sequence_length=max_seq_len,
+            num_heads_kv=config.num_key_value_heads,
+            head_dim_k=config.hidden_size // config.num_attention_heads,
+            dtype=torch.bfloat16,
+            qkv_format="thd",
+            max_ctx_len=max_seq_len,
+        )
+        for layer_number in range(1, config.num_hidden_layers + 1):
+            past_key_values.allocate_memory(layer_number)
+        return past_key_values
+
     def test_golden_values(self, input_format): # pyright: ignore[reportIncompatibleMethodOverride]
         """For llama3, we can test both the dynamic sequence packing and native bshd attention formats."""
         model_hf = self.get_reference_model(dtype=torch.bfloat16)
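Taken together, the override above and the common generation tests exercise roughly the flow sketched below. The `HFInferenceParams` import path is not shown in this diff and the model/tokenizer setup is illustrative only; the constructor arguments, `allocate_memory` loop, and `generate(..., past_key_values=...)` call mirror the code in this commit.

```python
# Hedged sketch of the KV-cache generation path the new tests exercise.
import torch


def build_inference_params(config, batch_size=1, max_seq_len=256, num_beams=1):
    # Mirrors TestLlama3Model._create_inference_params: pre-allocate a THD
    # KV-cache large enough for batch_size * num_beams sequences.
    params = HFInferenceParams(  # import path is recipe-specific (not in this diff)
        max_batch_size=batch_size * num_beams,
        max_sequence_length=max_seq_len,
        num_heads_kv=config.num_key_value_heads,
        head_dim_k=config.hidden_size // config.num_attention_heads,
        dtype=torch.bfloat16,
        qkv_format="thd",
        max_ctx_len=max_seq_len,
    )
    for layer_number in range(1, config.num_hidden_layers + 1):
        params.allocate_memory(layer_number)
    return params


# Usage (illustrative): single-prompt generation with the pre-allocated cache.
# inputs = {k: v.to("cuda") for k, v in tokenizer(prompt, return_tensors="pt").items()}
# output_ids = model.generate(
#     **inputs, max_new_tokens=16, use_cache=True,
#     past_key_values=build_inference_params(model.config),
# )
```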

bionemo-recipes/models/mixtral/modeling_mixtral_te.py

Lines changed: 3 additions & 3 deletions
@@ -166,7 +166,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
         # Permute tokens by expert using TE moe_permute
         permuted_hidden, row_id_map = transformer_engine.pytorch.moe_permute(
-            hidden_states, selected_experts, map_type="index"
+            hidden_states, selected_experts.to(torch.int32), map_type="index"
        )
 
         # Compute m_splits: number of tokens per expert
@@ -185,11 +185,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # Down projection
         expert_output = self.experts_down(intermediate, m_splits=m_splits)  # [total_tokens, H]
 
-        # Unpermute and combine with routing weights
+        # Unpermute and combine with routing weights (keep probs in float32 for numerical stability)
         output = transformer_engine.pytorch.moe_unpermute(
             expert_output,
             row_id_map,
-            merging_probs=routing_weights.to(expert_output.dtype),
+            merging_probs=routing_weights,
             map_type="index",
         )
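The two Mixtral fixes change the MoE token-routing path: `moe_permute` now receives explicit int32 expert indices, and the routing probabilities are passed to `moe_unpermute` without being downcast to the expert-output dtype. Below is a hedged sketch of that flow; `run_experts` is a stand-in for the module's grouped expert projections (e.g. `experts_down` with `m_splits`) and is not part of the TE API, while the `moe_permute` / `moe_unpermute` calls mirror the diff above.

```python
# Minimal sketch of the permute -> expert MLP -> unpermute flow after this fix.
import torch
import transformer_engine.pytorch as te_pt


def route_through_experts(hidden_states, selected_experts, routing_weights, run_experts):
    # Cast the routing indices to int32 before permuting, as in the fixed forward pass.
    permuted_hidden, row_id_map = te_pt.moe_permute(
        hidden_states, selected_experts.to(torch.int32), map_type="index"
    )

    # run_experts stands in for the grouped expert GEMMs driven by m_splits.
    expert_output = run_experts(permuted_hidden)

    # routing_weights stay in float32 for numerical stability when the expert
    # outputs are merged back into token order.
    return te_pt.moe_unpermute(
        expert_output, row_id_map, merging_probs=routing_weights, map_type="index"
    )
```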
