NVIDIA-NeMo · DongjiGao · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/nemo/collections/speechlm2/vllm/__init__.py b/nemo/collections/speechlm2/vllm/__init__.py
diff --git a/nemo/collections/speechlm2/vllm/nemotron_v3/__init__.py b/nemo/collections/speechlm2/vllm/nemotron_v3/__init__.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""vLLM plugin registration for NeMo Speech LM models.
+
+Registers NeMoSpeechLMConfig and NeMoSpeechLMForConditionalGeneration
+into vLLM's model and config registries via the ``vllm.general_plugins``
+entry point.
+"""
+
+_PKG = "nemo.collections.speechlm2.vllm.nemotron_v3"  # dotted package path; prefix for lazy "module:Class" model refs
+
+
+def register():
+    """Register the NeMo Speech LM model and config with vLLM.
+
+    Called automatically by vLLM when ``VLLM_PLUGINS=nemo_speechlm``
+    is set, via the ``vllm.general_plugins`` entry point in
+    ``pyproject.toml``.
+    """
+    from transformers import AutoConfig
+
+    from nemo.collections.speechlm2.vllm.nemotron_v3.config import NeMoSpeechLMConfig
+
+    AutoConfig.register("nemo_speechlm", NeMoSpeechLMConfig)
+
+    from vllm.transformers_utils.config import _CONFIG_REGISTRY
+
+    _CONFIG_REGISTRY["nemo_speechlm"] = NeMoSpeechLMConfig
+
+    from vllm.model_executor.models.registry import ModelRegistry
+
+    ModelRegistry.register_model(
+        "NeMoSpeechLMForConditionalGeneration",
+        f"{_PKG}.model:NeMoSpeechLMForConditionalGeneration",
+    )
+
+    _apply_backend_patches()
+
+
+def _apply_backend_patches():
+    """Apply patches for LLM backends that need them.
+
+    NemotronH's HF config uses ``layer_norm_epsilon`` but vLLM expects
+    ``rms_norm_eps``.  This patches the config class at runtime.
+    """
+    try:
+        from transformers import AutoConfig as _AC
+
+        _nhc = _AC.from_pretrained(
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+            trust_remote_code=True,
+        )
+        NHConfigCls = type(_nhc)
+        _orig_getattr = getattr(NHConfigCls, "__getattr__", None)
+
+        def _patched_getattr(self, name):
+            if name == "rms_norm_eps":
+                return getattr(self, "layer_norm_epsilon", 1e-5)
+            if _orig_getattr:
+                return _orig_getattr(self, name)
+            raise AttributeError(name)
+
+        NHConfigCls.__getattr__ = _patched_getattr
+    except Exception:
+        pass
diff --git a/nemo/collections/speechlm2/vllm/nemotron_v3/config.py b/nemo/collections/speechlm2/vllm/nemotron_v3/config.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration for NeMo Speech LM models in vLLM.
+
+Provides ``NeMoSpeechLMConfig``, a HuggingFace-compatible config class
+that wraps the LLM backbone's text config with NeMo-specific fields
+(perception, audio_locator_tag, etc.).  The checkpoint's ``config.json``
+determines which LLM backbone and encoder are used.
+"""
+
+from transformers import AutoConfig, PretrainedConfig
+
+
+class NeMoSpeechLMConfig(PretrainedConfig):
+    """HuggingFace config for NeMo Speech LM multimodal models.
+
+    Wraps a pretrained LLM config (e.g. NemotronH, Qwen3) with
+    additional fields for the speech perception module.  The LLM
+    backbone config is loaded from ``pretrained_llm`` at init time.
+    """
+
+    model_type = "nemo_speechlm"
+
+    def __init__(
+        self,
+        perception: dict | None = None,
+        pretrained_llm: str = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+        pretrained_asr: str = "nvidia/canary-1b-v2",
+        audio_locator_tag: str = "<|audio|>",
+        prompt_format: str = "nemotron-nano-v3",
+        pretrained_weights: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.perception = perception or {}
+        self.pretrained_llm = pretrained_llm
+        self.pretrained_asr = pretrained_asr
+        self.audio_locator_tag = audio_locator_tag
+        self.prompt_format = prompt_format
+        self.pretrained_weights = pretrained_weights
+
+        self.text_config = AutoConfig.from_pretrained(pretrained_llm, trust_remote_code=True)
+        self.text_config.architectures = ["NemotronHForCausalLM"]
+
+        if not hasattr(self.text_config, "total_num_kv_heads") or self.text_config.total_num_kv_heads is None:
+            self.text_config.total_num_kv_heads = getattr(self.text_config, "num_key_value_heads", 2)
+
+        if not hasattr(self.text_config, "rms_norm_eps"):
+            self.text_config.rms_norm_eps = getattr(self.text_config, "layer_norm_epsilon", 1e-5)
+
+        # Extend vocab to accommodate audio special tokens added at runtime.
+        # The embedding layer uses org_num_embeddings for weight loading
+        # so the checkpoint stays compatible.
+        self.text_config.vocab_size = self.text_config.vocab_size + 10
+
+    def get_text_config(self, decoder=False) -> PretrainedConfig:
+        """Return the LLM backbone's text config."""
+        return self.text_config
+
+    _ATTR_ALIASES = {
+        "rms_norm_eps": "layer_norm_epsilon",
+        "layer_norm_eps": "layer_norm_epsilon",
+    }
+
+    def __getattr__(self, name):
+        if name.startswith("_") or name in (
+            "perception",
+            "pretrained_llm",
+            "pretrained_asr",
+            "audio_locator_tag",
+            "prompt_format",
+            "pretrained_weights",
+            "text_config",
+            "_ATTR_ALIASES",
+        ):
+            raise AttributeError(name)
+        alias = self._ATTR_ALIASES.get(name, name)
+        try:
+            return getattr(self.text_config, alias)
+        except AttributeError:
+            if alias != name:
+                try:
+                    return getattr(self.text_config, name)
+                except AttributeError:
+                    pass
+            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")