addressing coderabbit review

pstjohn · pstjohn · commit c91136b6871b · 2026-03-03T11:07:37.000-07:00
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/bionemo-recipes/models/qwen/convert_qwen2.py b/bionemo-recipes/models/qwen/convert_qwen2.py
@@ -74,16 +74,18 @@ def _split_qkv_bias(ctx: state.TransformCTX, qkv_bias: torch.Tensor):
     qkv_bias = qkv_bias.reshape(qkv_total_dim, head_size)
     q_slice = torch.cat(
         [
-            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+            torch.arange(
+                (heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group, device=qkv_bias.device
+            )
             for i in range(num_query_groups)
         ]
     )
-    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
-    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2), device=qkv_bias.device)
+    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2), device=qkv_bias.device)
 
-    q_bias = qkv_bias[q_slice].reshape(-1).cpu()
-    k_bias = qkv_bias[k_slice].reshape(-1).cpu()
-    v_bias = qkv_bias[v_slice].reshape(-1).cpu()
+    q_bias = qkv_bias[q_slice].reshape(-1)
+    k_bias = qkv_bias[k_slice].reshape(-1)
+    v_bias = qkv_bias[v_slice].reshape(-1)
 
     return q_bias, k_bias, v_bias
 
@@ -206,6 +208,11 @@ def convert_qwen2_te_to_hf(model_te: NVQwen2ForCausalLM, **config_kwargs) -> Qwe
     with torch.device("meta"):
         model_hf = Qwen2ForCausalLM(hf_config)
 
+    if model_hf.config.tie_word_embeddings:
+        state_dict_ignored_entries = model_hf._tied_weights_keys
+    else:
+        state_dict_ignored_entries = []
+
     output_model = state.apply_transforms(
         model_te,
         model_hf,
@@ -241,10 +248,12 @@ def convert_qwen2_te_to_hf(model_te: NVQwen2ForCausalLM, **config_kwargs) -> Qwe
                 fn=state.TransformFns.split_fc1,
             ),
         ],
-        state_dict_ignored_entries=model_hf._tied_weights_keys,
+        state_dict_ignored_entries=state_dict_ignored_entries,
     )
 
     output_model.model.rotary_emb.inv_freq = model_te.model.rotary_emb.inv_freq.clone()
-    output_model.tie_weights()
+
+    if model_hf.config.tie_word_embeddings:
+        output_model.tie_weights()
 
     return output_model
diff --git a/bionemo-recipes/models/qwen/export.py b/bionemo-recipes/models/qwen/export.py
@@ -53,7 +53,7 @@ def export_hf_checkpoint(tag: str, export_path: Path):
     with open(export_path / "config.json", "w") as f:
         json.dump(config, f, indent=2, sort_keys=True)
 
-    shutil.copy("modeling_qwen3_te.py", export_path / "modeling_qwen3_te.py")
+    shutil.copy(Path(__file__).parent / "modeling_qwen3_te.py", export_path / "modeling_qwen3_te.py")
 
 
 if __name__ == "__main__":
diff --git a/bionemo-recipes/models/qwen/modeling_qwen2_te.py b/bionemo-recipes/models/qwen/modeling_qwen2_te.py
@@ -212,7 +212,7 @@ def forward(
             # attention backend, but it should be faster for the flash attention backend.
             assert attention_mask is not None, "Attention mask is required when packing BSHD inputs."
             batch_size = hidden_states.size(0)
-            padded_seq_len = input_ids.size(1)
+            padded_seq_len = input_ids.size(1) if input_ids is not None else hidden_states.size(1)
             hidden_states, indices, cu_seqlens, max_seqlen, _ = _unpad_input(hidden_states, attention_mask)
             kwargs["cu_seq_lens_q"] = kwargs["cu_seq_lens_k"] = cu_seqlens
             kwargs["max_length_q"] = kwargs["max_length_k"] = max_seqlen
diff --git a/bionemo-recipes/models/qwen/modeling_qwen3_te.py b/bionemo-recipes/models/qwen/modeling_qwen3_te.py
@@ -139,7 +139,10 @@ def _init_method(x):
                     qk_norm_eps=config.rms_norm_eps,
                     qk_norm_before_rope=True,
                     window_size=(config.sliding_window, config.sliding_window)
-                    if config.layer_types[layer_idx] == "sliding_attention"
+                    if config.layer_types is not None
+                    and len(config.layer_types) > layer_idx
+                    and config.layer_types[layer_idx] == "sliding_attention"
+                    and config.sliding_window is not None
                     else None,
                     layer_number=layer_idx + 1,
                     params_dtype=config.dtype,
diff --git a/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py b/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py
@@ -20,7 +20,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Callable, Dict, List, Literal, Type
+from typing import Any, Callable, Dict, List, Literal, Type
 
 import pytest
 import torch
@@ -987,8 +987,8 @@ def test_meta_fp8_init(self, fp8_recipe):
         self.verify_model_parameters_initialized_correctly(model, should_be_fp8=True)
 
     # ==================== Generation Tests (Autoregressive Models Only) ====================
-
-    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
+    @abstractmethod
+    def create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1) -> Any:
         """Create inference params for KV-cache generation tests.
 
         Autoregressive model tests must override this method to provide
@@ -1003,9 +1003,7 @@ def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_be
         Returns:
             HFInferenceParams instance with allocated memory.
         """
-        raise NotImplementedError(
-            "Autoregressive models must override _create_inference_params to provide model-specific HFInferenceParams."
-        )
+        pass
 
     def test_generate_without_cache(self):
         """Test basic generation without KV-cache (BSHD, use_cache=False)."""
@@ -1040,7 +1038,7 @@ def test_generate_with_cache(self):
         inputs = tokenizer(prompt, return_tensors="pt")
         inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
-        past_key_values = self._create_inference_params(config, batch_size=1)
+        past_key_values = self.create_inference_params(config, batch_size=1)
 
         with torch.no_grad():
             output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)
@@ -1064,7 +1062,7 @@ def test_generate_with_cache_batched(self):
         inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
         inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
-        past_key_values = self._create_inference_params(config, batch_size=2)
+        past_key_values = self.create_inference_params(config, batch_size=2)
 
         with torch.no_grad():
             output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)
@@ -1090,7 +1088,7 @@ def test_generate_with_cache_beam_search(self):
         inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
         num_beams = 2
-        past_key_values = self._create_inference_params(config, batch_size=2, num_beams=num_beams)
+        past_key_values = self.create_inference_params(config, batch_size=2, num_beams=num_beams)
 
         with torch.no_grad():
             output_ids = model.generate(
diff --git a/bionemo-recipes/models/qwen/tests/test_modeling_qwen2_te.py b/bionemo-recipes/models/qwen/tests/test_modeling_qwen2_te.py
@@ -67,7 +67,9 @@ def get_upstream_model_revision(self) -> str:
 
     def get_tokenizer(self) -> PreTrainedTokenizer:
         """Return the Qwen2 tokenizer."""
-        tokenizer = AutoTokenizer.from_pretrained(self.get_upstream_model_id())
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.get_upstream_model_id(), revision=self.get_upstream_model_revision()
+        )
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         return tokenizer
@@ -152,7 +154,7 @@ def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_forma
 
     # ==================== Qwen2-Specific Overrides ====================
 
-    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
+    def create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
         """Create HFInferenceParams for the given config.
 
         Uses hidden_size // num_attention_heads for head_dim since Qwen2 does not
diff --git a/bionemo-recipes/models/qwen/tests/test_modeling_qwen3_te.py b/bionemo-recipes/models/qwen/tests/test_modeling_qwen3_te.py
@@ -67,7 +67,9 @@ def get_upstream_model_revision(self) -> str:
 
     def get_tokenizer(self) -> PreTrainedTokenizer:
         """Return the Qwen3 tokenizer."""
-        tokenizer = AutoTokenizer.from_pretrained(self.get_upstream_model_id())
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.get_upstream_model_id(), revision=self.get_upstream_model_revision()
+        )
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         return tokenizer
@@ -153,7 +155,7 @@ def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_forma
 
     # ==================== Qwen3-Specific Overrides ====================
 
-    def _create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
+    def create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1):
         """Create HFInferenceParams for the given config.
 
         Uses config.head_dim (not hidden_size // num_attention_heads) since Qwen3
diff --git a/ci/scripts/check_copied_files.py b/ci/scripts/check_copied_files.py
@@ -38,7 +38,7 @@
     "bionemo-recipes/models/esm2/collator.py": [
         "bionemo-recipes/models/llama3/collator.py",
         "bionemo-recipes/models/mixtral/collator.py",
-        "bionemo-recipes/models/qwen3/collator.py",
+        "bionemo-recipes/models/qwen/collator.py",
         "bionemo-recipes/recipes/esm2_native_te/collator.py",
         "bionemo-recipes/recipes/llama3_native_te/collator.py",
         "bionemo-recipes/recipes/esm2_peft_te/collator.py",
@@ -47,7 +47,7 @@
         "bionemo-recipes/models/amplify/src/amplify/state.py",
         "bionemo-recipes/models/llama3/state.py",
         "bionemo-recipes/models/mixtral/state.py",
-        "bionemo-recipes/models/qwen3/state.py",
+        "bionemo-recipes/models/qwen/state.py",
     ],
     "bionemo-recipes/models/llama3/modeling_llama_te.py": [
         "bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py",
@@ -62,7 +62,7 @@
     "bionemo-recipes/models/esm2/tests/common": [
         "bionemo-recipes/models/llama3/tests/common",
         "bionemo-recipes/models/mixtral/tests/common",
-        "bionemo-recipes/models/qwen3/tests/common",
+        "bionemo-recipes/models/qwen/tests/common",
     ],
 }