diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 9d012155bb..1041b492b9 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -70,6 +70,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: self.model = model self.config = model.config self.hash_params = create_model_params(self, **kwargs) + self.hash_params["num_kv_heads_repeat"] = kwargs.get("num_kv_heads_repeat", 1) self.onnx_path: Optional[str] = None self.qpc_path: Optional[str] = None self.qpc_session: Optional[QAICInferenceSession] = None @@ -440,23 +441,43 @@ def transform( **compiler_options, ): # Apply the transformations that are dependent on compilation parameters + def _transform_tracking_root(module: torch.nn.Module) -> torch.nn.Module: + """ + Use the shared wrapped model as transform-tracking root when available. + This lets encoder/decoder wrappers coordinate one-time transforms. + """ + wrapped = getattr(module, "model", None) + return wrapped if isinstance(wrapped, torch.nn.Module) else module qaic_config = qaic_config if qaic_config else getattr(self.model, "qaic_config", None) - model_config = getattr(self.model, "config", None) or getattr(self.model.model, "config", None) + model_config = getattr(self.model, "config", None) or getattr( + getattr(self.model, "model", None), "config", None + ) if model_config: - if "DeepseekV3ForCausalLM" in (getattr(model_config, "architectures", None) or []): - if qaic_config: - if qaic_config.get("blocking_mode", None) == "h": - qaic_config["head_block_size"] = qaic_config.get("head_block_size", num_devices) - num_kv_heads_repeat = qaic_config.get("num_kv_heads_repeat", 1) + architectures = getattr(model_config, "architectures", None) or [] + is_deepseek_v3 = "DeepseekV3ForCausalLM" in architectures + if qaic_config: + if is_deepseek_v3 and (qaic_config.get("blocking_mode", None) == "h"): + qaic_config["head_block_size"] = qaic_config.get("head_block_size", num_devices) + num_kv_heads_repeat = qaic_config.get("num_kv_heads_repeat", 1) + transform_root = _transform_tracking_root(self.model) + applied_transforms = getattr(transform_root, "_qeff_runtime_transforms_applied", set()) + + if ReplicateKVHeadTransform.__name__ in applied_transforms: + replicate_kv_transformed = False + logger.warning("Skipping RepeatKVTransform: already applied on this model instance.") + else: self.model, replicate_kv_transformed = ReplicateKVHeadTransform.apply( - self.model, num_kv_heads_repeat + self.model, + num_kv_heads_repeat=num_kv_heads_repeat, ) if replicate_kv_transformed: - self.hash_params["config"] = self.model.config.to_diff_dict() - + applied_transforms.add(ReplicateKVHeadTransform.__name__) + setattr(transform_root, "_qeff_runtime_transforms_applied", applied_transforms) + if replicate_kv_transformed: + self.hash_params["config"] = self.model.config.to_diff_dict() blocking_config = build_transformer_blocking_config_for_transform( model_config, ctx_len=ctx_len, diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index c0b7053ab6..57135fac2e 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -598,6 +598,7 @@ class QEffGemma3EncoderWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model + self.config = self.model.config self.model.vision_model = self.model.vision_tower def get_submodules_for_export(self) -> Type[nn.Module]: diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 228b748a8b..18f83322a4 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -20,6 +20,7 @@ class QEffInternEncoderWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index cd98465b5f..5a4d2dd761 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -831,6 +831,7 @@ class QEffLlama4EncoderWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index dac3b19e61..7bc0f8ad33 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -29,6 +29,7 @@ def __init__(self, model): super().__init__() self.model = model self.model.vision_model = self.model.vision_tower + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 3822223ed2..171775979f 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -29,6 +29,7 @@ def __init__(self, model): super().__init__() self.model = model self.model.vision_model = self.model.vision_tower + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index eae4580c50..3af453ead5 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -149,6 +149,7 @@ class QEFFMistral3EncoderWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model + self.config = self.model.config self.model.vision_model = self.model.vision_tower def get_submodules_for_export(self) -> Type[nn.Module]: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 4ad56592fb..8c0539fa84 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1238,6 +1238,7 @@ def __init__( self.ccl_enabled = qaic_config.get("ccl_enabled", False) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None self.input_shapes, self.output_names = None, None + # self.model, replicate_kv_transformed = ReplicateKVHeadTransform.apply(self.model, **kwargs) # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms # are done. The role of the sampler is to just add nodes at the output of the @@ -1273,6 +1274,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + num_kv_heads_repeat = kwargs.pop("num_kv_heads_repeat", 1) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) kwargs.update({"enable_proxy": enable_proxy} if enable_proxy else {}) @@ -1281,6 +1283,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option model, pretrained_model_name_or_path=pretrained_model_name_or_path, qaic_config=qaic_config, + num_kv_heads_repeat=num_kv_heads_repeat, **kwargs, ) @@ -1371,7 +1374,12 @@ def export( if prefill_only and prefill_seq_len > 1: offload_pt_weights = False # to keep weight for decode onnx else: - offload_pt_weights = kwargs.get("offload_pt_weights", True) + num_kv_heads_repeat = ( + (self.lang_model.model.qaic_config or {}).get("num_kv_heads_repeat", 1) + if hasattr(self.lang_model.model, "qaic_config") + else 1 + ) + offload_pt_weights = kwargs.get("offload_pt_weights", num_kv_heads_repeat <= 1) if not skip_lang: self.lang_model.export( @@ -2037,6 +2045,7 @@ def __init__( self.model.config.text_config.use_cache = True else: self.model.config.use_cache = True + # self.model, replicate_kv_transformed = ReplicateKVHeadTransform.apply(self.model, **kwargs) self.hash_params["qeff_auto_class"] = self.__class__.__name__ self.ccl_enabled = False if qaic_config: @@ -2086,6 +2095,7 @@ def from_pretrained( config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" + num_kv_heads_repeat = kwargs.pop("num_kv_heads_repeat", 1) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs) kwargs.update({"enable_proxy": enable_proxy} if enable_proxy else {}) @@ -2094,6 +2104,7 @@ def from_pretrained( model, pretrained_model_name_or_path=pretrained_model_name_or_path, qaic_config=qaic_config, + num_kv_heads_repeat=num_kv_heads_repeat, **kwargs, ) @@ -2698,6 +2709,7 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + num_kv_heads_repeat = kwargs.pop("num_kv_heads_repeat", 1) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) kwargs.update({"enable_proxy": enable_proxy} if enable_proxy else {}) @@ -2708,6 +2720,7 @@ def from_pretrained( continuous_batching=continuous_batching, pretrained_model_name_or_path=pretrained_model_name_or_path, qaic_config=qaic_config, + num_kv_heads_repeat=num_kv_heads_repeat, **kwargs, ) @@ -2867,6 +2880,7 @@ def __init__( setattr(self.model, "mla_absorption", mla_absorption) self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None self.hash_params["max_seq_len_cached"] = max_seq_len_cached + # self.model, replicate_kv_transformed = ReplicateKVHeadTransform.apply(self.model, **kwargs) # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms @@ -2950,6 +2964,7 @@ def from_pretrained( kv_offload = kwargs.pop("kv_offload", None) kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + num_kv_heads_repeat = kwargs.pop("num_kv_heads_repeat", 1) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) if qaic_config is not None: qaic_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path @@ -2963,6 +2978,7 @@ def from_pretrained( pretrained_model_name_or_path=pretrained_model_name_or_path, qaic_config=qaic_config, continuous_batching=continuous_batching, + num_kv_heads_repeat=num_kv_heads_repeat, **kwargs, ) return cls( @@ -2971,6 +2987,7 @@ def from_pretrained( qaic_config=qaic_config, pretrained_model_name_or_path=pretrained_model_name_or_path, max_seq_len_cached=max_seq_len_cached, + num_kv_heads_repeat=num_kv_heads_repeat, **kwargs, ) diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index cf1beddc87..5ea8ac3387 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -565,6 +565,7 @@ class QEffMolmoEncoderWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 705649929c..85cf8ea89d 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -237,8 +237,13 @@ WhisperPositionalEmbedding, ) -from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform +from QEfficient.base.pytorch_transforms import ( + ExternalModuleMapperTransform, + ModuleMappingTransform, + ModuleMutatorTransform, +) from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC +from QEfficient.customop.matmulnbits import QuantLinearORT from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function from QEfficient.transformers.models.codegen.modeling_codegen import ( QEffCodeGenAttention, @@ -507,8 +512,18 @@ QEffWhisperPositionalEmbedding, ) from QEfficient.transformers.post_processing import build_and_attach_mlp, model_type_registry +from QEfficient.transformers.quantizers.awq import WQLinear_GEMM +from QEfficient.transformers.quantizers.gptq import QuantLinearGPTQ +from QEfficient.transformers.quantizers.quantizer_compressed_tensors import FP8DeQuantLinear from QEfficient.transformers.sampler.sampler import sampler_forward from QEfficient.transformers.spd.spd_transform_forward import tlm_forward +from QEfficient.utils.config_utils import ( + resolve_attention_heads, + resolve_hidden_size, + resolve_kv_heads, + set_kv_head_aliases, +) +from QEfficient.utils.constants import ATTENTION_HEAD_CONFIG_KEYS, HIDDEN_SIZE_CONFIG_KEYS, KV_HEAD_CONFIG_KEYS from QEfficient.utils.logging_utils import logger SPD_TARGET = "target" @@ -777,17 +792,102 @@ class RevertPrefillOnlyTransform(ModuleMappingTransform): } -class ReplicateKVHeadTransform: +class ReplicateKVHeadTransform(ModuleMutatorTransform): """ Replicates KV heads in attention modules to match the number of KV heads in the target model. This transform is used when the source model has fewer KV heads than required in target model. """ - def _duplicate_weights_for_linear_layer( - layer: nn.Module, orig_kv_heads: int, repeat: int, dim: int, hidden_size: int - ): - new_kv_heads = repeat # for mla + _module_mapping = { + QEffCodeGenForCausalLM, + QEffFalconForCausalLM, + QEffGPT2LMHeadModel, + QEffGPTJForCausalLM, + QEffLlamaForCausalLM, + QEffLlama4ForConditionalGeneration, + QEffLlavaForConditionalGeneration, + QEffLlavaNextForConditionalGeneration, + QEffGemmaForCausalLM, + QEffGemma2ForCausalLM, + QEffGemma3ForConditionalGeneration, + QEffGraniteForCausalLM, + QEffGraniteMoeForCausalLM, + QEffMllamaForConditionalGeneration, + QEffMistralForCausalLM, + QEffMistral3ForConditionalGeneration, + QEffMptForCausalLM, + QEffPhiForCausalLM, + QEffPhi3ForCausalLM, + QEffQwen2ForCausalLM, + QEffQwen3ForCausalLM, + QEffQwen_2_5_vl_ForConditionalGeneration, + QEffQwen3MoeForCausalLM, + QEffQwen3VLForConditionalGeneration, + QEffQwen3VLMoeForConditionalGeneration, + QEffStarcoder2ForCausalLM, + QEffGPTBigCodeForCausalLM, + QEffOlmo2ForCausalLM, + } + _module_string_mapping = { + "DeepseekV3ForCausalLM", + "InternVLChatModel", + "MolmoForCausalLM,", + "QEffGemma3DecoderWrapper", + "QEffGemma3EncoderWrapper", + "QEffInternDecoderWrapper", + "QEffInternEncoderWrapper", + "QEffLlama4DecoderWrapper", + "QEffLlama4EncoderWrapper", + "QEFFLlavaDecoderWrapper", + "QEFFLlavaEncoderWrapper", + "QEffLlavaNextDecoderWrapper", + "QEffLlavaNextEncoderWrapper", + "QEFFMistral3DecoderWrapper", + "QEFFMistral3EncoderWrapper", + "QEffMolmoDecoderWrapper", + "QEffMolmoEncoderWrapper", + "QEffQwen_2_5_vl_DecoderWrapper", + "QEffQwen_2_5_vl_EncoderWrapper", + "QEffQwen3VLDecoderWrapper", + "QEffQwen3VLEncoderWrapper", + } + @classmethod + def _get_attention_module(cls, block: nn.Module) -> nn.Module: + for attr in ("cross_attn", "self_attn", "attention", "attn"): + attn = getattr(block, attr, None) + if attn is not None: + return attn + raise AttributeError(f"No attention module found in block type {block.__class__.__name__}") + + @staticmethod + def _get_projection_layer(attn: nn.Module, names: tuple) -> nn.Module: + for name in names: + layer = getattr(attn, name, None) + if layer is not None: + return layer + raise AttributeError(f"Missing projection layer in {attn.__class__.__name__}; expected one of {names}") + + @staticmethod + def _is_mla_attention(attn: nn.Module) -> bool: + return ( + hasattr(attn, "kv_a_proj_with_mqa") and hasattr(attn, "kv_lora_rank") and hasattr(attn, "qk_rope_head_dim") + ) + + @classmethod + def _is_mla_model(cls, text_model: nn.Module) -> bool: + for block in getattr(text_model, "layers", []): + try: + attn = cls._get_attention_module(block) + except AttributeError: + continue + if cls._is_mla_attention(attn): + return True + return False + + @staticmethod + def _duplicate_weights_for_mla_layer(layer: nn.Module, orig_kv_heads: int, repeat: int, dim: int, hidden_size: int): + new_kv_heads = repeat * orig_kv_heads layer.weight.data = torch.repeat_interleave( layer.weight.data.view(orig_kv_heads, dim, hidden_size), repeat, 0 ).view(new_kv_heads * dim, hidden_size) @@ -797,52 +897,237 @@ def _duplicate_weights_for_linear_layer( new_kv_heads * dim ) + @staticmethod + def _duplicate_weights_for_linear_layer( + layer: nn.Module, orig_kv_heads: int, repeat: int, head_dim: int, hidden_size: int + ): + new_kv_heads = repeat * orig_kv_heads + if isinstance(layer, (WQLinear_GEMM, QuantLinearGPTQ, QuantLinearORT)): + if head_dim % 8 != 0: + raise ValueError( + f"the value head_dim={head_dim} is not divisible by 8 which is according to the assumption that model is 4-bit quantized." + ) + if hidden_size % layer.group_size != 0: + raise ValueError( + f"The value of hidden_size={hidden_size} is not divisible by k_proj.group_size={layer.group_size}" + ) + + # Duplication of quantized weights + layer.qweight.data = torch.repeat_interleave( + layer.qweight.data.view(hidden_size, orig_kv_heads, head_dim // 8), repeat, 1 + ).view(hidden_size, (new_kv_heads * head_dim) // 8) + # Duplication of quantized zero points + layer.qzeros.data = torch.repeat_interleave( + layer.qzeros.data.view(hidden_size // layer.group_size, orig_kv_heads, head_dim // 8), + repeat, + 1, + ).view(hidden_size // layer.group_size, (new_kv_heads * head_dim) // 8) + # Duplication of quantization scales + layer.scales.data = torch.repeat_interleave( + layer.scales.data.view(hidden_size // layer.group_size, orig_kv_heads, head_dim), + repeat, + 1, + ).view(hidden_size // layer.group_size, new_kv_heads * head_dim) + layer.out_features = layer.out_features * repeat + + elif isinstance(layer, FP8DeQuantLinear): + layer.weight.data = torch.repeat_interleave( + layer.weight.data.view(orig_kv_heads, head_dim, hidden_size), repeat, 0 + ).view(new_kv_heads * head_dim, hidden_size) + layer.weight_scale.data = torch.repeat_interleave( + layer.weight_scale.data.view(orig_kv_heads, head_dim), repeat, 0 + ).view(new_kv_heads * head_dim, -1) + + else: + layer.weight.data = torch.repeat_interleave( + layer.weight.data.view(orig_kv_heads, head_dim, hidden_size), repeat, 0 + ).view(new_kv_heads * head_dim, hidden_size) + if layer.bias is not None: + layer.bias.data = torch.repeat_interleave(layer.bias.data.view(orig_kv_heads, head_dim), repeat, 0).view( + new_kv_heads * head_dim + ) + + def _is_valid_text_model(candidate: nn.Module) -> bool: + """ + Validate whether a candidate object looks like a text stack suitable for KV replication. + """ + if candidate is None: + return False + cfg = getattr(candidate, "config", None) + layers = getattr(candidate, "layers", None) + attn_heads = resolve_attention_heads(cfg) if cfg is not None else None + kv_heads = resolve_kv_heads(cfg) if cfg is not None else None + hidden_size = resolve_hidden_size(cfg) if cfg is not None else None + return ( + cfg is not None + and layers is not None + and attn_heads is not None + and kv_heads is not None + and hidden_size is not None + ) + def _get_text_model(model): """ Determine and return the appropriate text_model from a given model object. + + Some VLM wrappers expose multiple nested text attributes (e.g. `language_model`, + `language_model.model`, `model.language_model`). We pick the first valid module + that has both `config` and `layers` required for KV head replication. """ - # Check for VLMs - if hasattr(model, "language_model"): - if hasattr(model.language_model, "model"): - return model.language_model.model - else: - return model.language_model - # Check for CausalLMs - if hasattr(model, "model"): - return model.model + candidate_paths = ( + ("language_model",), + ("language_model", "model"), + ("model", "language_model"), + ("model", "language_model", "model"), + ("model",), + ("model", "model"), + ("transformer",), + ("transformer", "model"), + ("llm",), + ("llm", "model"), + ("backbone",), + ) - raise AttributeError("No suitable text model found in the provided model.") + for path in candidate_paths: + candidate = model + valid_path = True + for attr in path: + if not hasattr(candidate, attr): + valid_path = False + break + candidate = getattr(candidate, attr) + if valid_path and ReplicateKVHeadTransform._is_valid_text_model(candidate): + return candidate + + raise AttributeError( + f"No suitable text model found in the provided model ({model.__class__.__name__}). " + "Expected a module with `layers` and text `config` attributes." + ) + + def _get_replication_root(model: nn.Module) -> nn.Module: + """ + Return a shared root module for wrapper and non-wrapper models so KV replication + can be applied once across encoder/decoder components of the same model. + """ + candidate = getattr(model, "model", None) + return candidate if isinstance(candidate, nn.Module) else model @classmethod - def apply(cls, model: nn.Module, num_kv_heads_repeat: int = 1) -> nn.Module: + def mutate(cls, original_module: nn.Module, parent_module: nn.Module, n_repeat: int) -> nn.Module: """ - Replicates KV heads in attention modules based on provided multiplier. + Mutates the matched top-level model module in-place by replicating its KV heads. Args: - model: The model to apply the transform to. - num_kv_heads_repeat: The number of times to repeat the KV heads. + original_module: The matched top-level model module to mutate. + parent_module: The parent module (unused, present for interface compatibility). + n_repeat: The number of times to repeat the KV heads. + + Returns: + The mutated module (same object, modified in-place). """ - transformed = False - if num_kv_heads_repeat is not None and num_kv_heads_repeat > 1: - text_model = cls._get_text_model(model) + replication_root = cls._get_replication_root(original_module) + if getattr(replication_root, "_qeff_kv_replication_applied", False): + logger.warning("KV head replication already applied for this model instance; skipping.") + return original_module + + text_model = cls._get_text_model(original_module) + cfg = text_model.config + orig_kv_heads = resolve_kv_heads(cfg) + num_attention_heads = resolve_attention_heads(cfg) + hidden_size = resolve_hidden_size(cfg) + is_mla_model = cls._is_mla_model(text_model) + + if orig_kv_heads is None or num_attention_heads is None or hidden_size is None: + raise ValueError( + "Unable to resolve attention/KV heads or hidden size from config for RepeatKV transform. " + f"Supported attention keys={ATTENTION_HEAD_CONFIG_KEYS}, kv keys={KV_HEAD_CONFIG_KEYS}, " + f"hidden size keys={HIDDEN_SIZE_CONFIG_KEYS}." + ) + if orig_kv_heads < 1 or num_attention_heads < 1: + raise ValueError( + f"Invalid head values for RepeatKV transform: " + f"num_attention_heads={num_attention_heads}, num_key_value_heads={orig_kv_heads}" + ) + if is_mla_model: + # Legacy MLA path treats compressed-KV projection as single KV head. + orig_kv_heads = 1 - orig_kv_heads = 1 # for mla #text_model.config.num_key_value_heads - new_kv_heads = num_kv_heads_repeat * orig_kv_heads - text_model.config.orig_kv_heads = orig_kv_heads - text_model.config.num_key_value_heads = new_kv_heads + new_kv_heads = n_repeat * orig_kv_heads + if (not is_mla_model) and (new_kv_heads > num_attention_heads or (num_attention_heads % new_kv_heads) != 0): + raise ValueError( + f"Invalid RepeatKV configuration: num_attention_heads={num_attention_heads}, " + f"orig_kv_heads={orig_kv_heads}, num_kv_heads_repeat={n_repeat}, new_kv_heads={new_kv_heads}. " + "Expected new_kv_heads <= num_attention_heads and divisibility." + ) - hidden_size = text_model.config.hidden_size + cfg.orig_kv_heads = orig_kv_heads + set_kv_head_aliases(cfg, new_kv_heads) - logger.warning(f"Original KV heads: {orig_kv_heads}") - logger.warning(f"Modified KV heads: {new_kv_heads}") - transformed = True - for block in text_model.layers: - attn = getattr(block, "cross_attn", getattr(block, "self_attn", None)) + logger.warning(f"Original KV heads: {orig_kv_heads}") + logger.warning(f"Modified KV heads: {new_kv_heads}") + for block in text_model.layers: + attn = cls._get_attention_module(block) + if hasattr(attn, "num_key_value_heads"): attn.num_key_value_heads = new_kv_heads - head_dim = attn.kv_lora_rank + attn.qk_rope_head_dim + if hasattr(attn, "n_kv_heads"): + attn.n_kv_heads = new_kv_heads + + if cls._is_mla_attention(attn): + # Legacy MLA support: KV compression projection is organized as + # [kv_heads, kv_lora_rank + qk_rope_head_dim, hidden_size]. + mla_orig_kv_heads = 1 + mla_head_dim = int(attn.kv_lora_rank + attn.qk_rope_head_dim) + cls._duplicate_weights_for_mla_layer( + attn.kv_a_proj_with_mqa, + mla_orig_kv_heads, + n_repeat, + mla_head_dim, + hidden_size, + ) + else: + n_kv_groups = num_attention_heads // new_kv_heads + if hasattr(attn, "num_key_value_groups"): + attn.num_key_value_groups = n_kv_groups + if hasattr(attn, "n_kv_groups"): + attn.n_kv_groups = n_kv_groups + head_dim = getattr(attn, "head_dim", hidden_size // num_attention_heads) + k_proj = cls._get_projection_layer(attn, ("k_proj", "key_proj")) + v_proj = cls._get_projection_layer(attn, ("v_proj", "value_proj")) + cls._duplicate_weights_for_linear_layer(k_proj, orig_kv_heads, n_repeat, head_dim, hidden_size) + cls._duplicate_weights_for_linear_layer(v_proj, orig_kv_heads, n_repeat, head_dim, hidden_size) + + setattr(replication_root, "_qeff_kv_replication_applied", True) + return original_module + + @classmethod + def apply(cls, model: nn.Module, num_kv_heads_repeat: Optional[int] = None, **kwargs) -> Tuple[nn.Module, bool]: + """ + Replicates KV heads in attention modules based on provided multiplier. + + Args: + model: The model to apply the transform to. + kwargs: Additional arguments for the transformation. Includes: + - num_kv_heads_repeat: The number of times to repeat the KV heads. + """ + if num_kv_heads_repeat is None: + n_repeat = kwargs.pop("num_kv_heads_repeat", 1) + else: + kwargs.pop("num_kv_heads_repeat", None) + n_repeat = num_kv_heads_repeat + # Validate n_repeat is a positive integer + if not isinstance(n_repeat, int) or n_repeat < 1: + raise ValueError( + f"num_kv_heads_repeat must be a positive integer, got: {n_repeat} (type: {type(n_repeat).__name__})" + ) - cls._duplicate_weights_for_linear_layer( - attn.kv_a_proj_with_mqa, orig_kv_heads, num_kv_heads_repeat, head_dim, hidden_size + transformed = False + if n_repeat is not None and n_repeat > 1: + if (model.__class__ in cls._module_mapping) or (model.__class__.__name__ in cls._module_string_mapping): + cls.mutate(model, None, n_repeat) + transformed = True + else: + raise NotImplementedError( + f"Model class {model.__class__.__name__} is not supported for KV head replication." ) return model, transformed diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index da687e0ede..7fd9414dfa 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -695,6 +695,7 @@ def __init__(self, model): super().__init__() self.model = model self.model.vision_model = self.model.visual + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 21847f25de..d530054fb8 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -639,6 +639,7 @@ def __init__(self, model): super().__init__() self.model = model self.model.vision_model = self.model.visual + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 240d04d996..11d6cb9883 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -729,6 +729,7 @@ def __init__(self, model): super().__init__() self.model = model self.model.vision_model = self.model.visual + self.config = self.model.config def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/utils/config_utils.py b/QEfficient/utils/config_utils.py new file mode 100644 index 0000000000..1e6125d7ec --- /dev/null +++ b/QEfficient/utils/config_utils.py @@ -0,0 +1,40 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from typing import Iterable, Optional + +from QEfficient.utils.constants import ATTENTION_HEAD_CONFIG_KEYS, HIDDEN_SIZE_CONFIG_KEYS, KV_HEAD_CONFIG_KEYS + + +def get_first_config_value(config, names: Iterable[str], default=None, cast_int: bool = False): + for name in names: + value = getattr(config, name, None) + if value is not None: + return int(value) if cast_int else value + return default + + +def resolve_attention_heads(config) -> Optional[int]: + return get_first_config_value(config, ATTENTION_HEAD_CONFIG_KEYS, cast_int=True) + + +def resolve_kv_heads(config) -> Optional[int]: + value = get_first_config_value(config, KV_HEAD_CONFIG_KEYS, cast_int=True) + if value is None: + value = resolve_attention_heads(config) + return value + + +def resolve_hidden_size(config) -> Optional[int]: + return get_first_config_value(config, HIDDEN_SIZE_CONFIG_KEYS, cast_int=True) + + +def set_kv_head_aliases(config, value: int): + setattr(config, "num_key_value_heads", value) + for key in KV_HEAD_CONFIG_KEYS: + if hasattr(config, key): + setattr(config, key, value) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 339e4f4dac..20d2819ff7 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -104,6 +104,11 @@ def get_models_dir(): DEFAULT_AIC_HW_VERSION = "ai100" ONNX_TRANSFORM_MEMORY_CLEANUP_INTERVAL = 100 +# Generic config key aliases used across model families. +ATTENTION_HEAD_CONFIG_KEYS = ("num_attention_heads", "n_head", "n_heads", "num_heads") +KV_HEAD_CONFIG_KEYS = ("num_key_value_heads", "n_kv_heads", "num_kv_heads", "effective_n_kv_heads") +HIDDEN_SIZE_CONFIG_KEYS = ("hidden_size", "n_embd", "d_model") + # InternVL constants # Fixing the feature size with reference to OpenGVLab/InternVL2_5-1B, OpenGVLab/InternVL2_5-38B and OpenGVLab/InternVL2_5-78B INTERN_FEATURE_SIZE = 256 diff --git a/QEfficient/utils/test_utils.py b/QEfficient/utils/test_utils.py index f451d48933..93cda92f49 100644 --- a/QEfficient/utils/test_utils.py +++ b/QEfficient/utils/test_utils.py @@ -283,6 +283,14 @@ def load_qeff_model_with_sampler( return qeff_model +def get_text_config(config): + if hasattr(config, "text_config"): + return config.text_config + elif hasattr(config, "llm_config"): + return config.llm_config + return config + + # Processor class for InternVL models class InternProcessor: """ @@ -473,6 +481,31 @@ class ModelConfig: "Qwen/Qwen3-VL-2B-Instruct", } + REPEAT_KV_TEST_MODELS = { + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "ibm-granite/granite-3.1-1b-a400m-base", + "Qwen/Qwen2-0.5B", + "bigcode/starcoder2-3b", + # "mistralai/Mixtral-8x7B-Instruct-v0.1", + "meta-llama/Llama-3.2-1B", + # "unsloth/gemma-2b", + # "unsloth/gemma-2-2b", + # "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "TheBloke/Llama-2-7B-GPTQ", + "neuralmagic/Llama-3.2-3B-Instruct-FP8", + "ibm-granite/granite-3.1-2b-instruct", + "llava-hf/llava-1.5-7b-hf", + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + # "google/gemma-3-4b-it", + "allenai/Molmo-7B-D-0924", + "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "Qwen/Qwen2.5-VL-3B-Instruct", + "Qwen/Qwen3-VL-2B-Instruct", + "Qwen/Qwen3-VL-30B-A3B-Instruct", + "allenai/Molmo-7B-D-0924", + "OpenGVLab/InternVL2_5-1B", + } + EXTERNAL_MODELS = { "hpcai-tech/grok-1": { "pytorch_hf_tokens_custom_case": [ diff --git a/tests/configs/causal_model_configs.json b/tests/configs/causal_model_configs.json index 6d4f2c5b68..e18c2c50a1 100644 --- a/tests/configs/causal_model_configs.json +++ b/tests/configs/causal_model_configs.json @@ -324,6 +324,19 @@ "num_key_value_heads": 1 } }, + { + "model_name": "hpcai-tech/grok-1", + "model_type": null, + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 131072, + "num_key_value_heads": 1 + } + }, { "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", "model_type": null, @@ -715,6 +728,5 @@ "num_local_experts": 4 } } - ] -} +} \ No newline at end of file diff --git a/tests/transformers/models/causal_lm_models/check_causal_models.py b/tests/transformers/models/causal_lm_models/check_causal_models.py index f878acbe73..e604cb72f9 100644 --- a/tests/transformers/models/causal_lm_models/check_causal_models.py +++ b/tests/transformers/models/causal_lm_models/check_causal_models.py @@ -16,7 +16,8 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils._utils import load_hf_tokenizer -from QEfficient.utils.constants import Constants +from QEfficient.utils.config_utils import get_first_config_value +from QEfficient.utils.constants import ATTENTION_HEAD_CONFIG_KEYS, KV_HEAD_CONFIG_KEYS, Constants from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig, load_hf_causal_lm_model @@ -39,6 +40,52 @@ def get_custom_n_layers(model_name): return 1 +def check_kv_repeat_causal_lm_pytorch_vs_ai100( + model_name: str, + manual_cleanup: callable, + prompt_len: int = Constants.PROMPT_LEN, + ctx_len: int = Constants.CTX_LEN, + n_layer: int = -1, + config: Optional[AutoConfig] = None, +): + """ + Validate causal LM flow with repeated KV heads configuration. + """ + if config is None: + model_config = AutoConfig.from_pretrained( + model_name, + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + ) + else: + model_config = config + + num_attention_heads = get_first_config_value(model_config, ATTENTION_HEAD_CONFIG_KEYS, default=1, cast_int=True) + num_key_value_heads = get_first_config_value(model_config, KV_HEAD_CONFIG_KEYS, default=None, cast_int=True) + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + if num_attention_heads < 1 or num_key_value_heads < 1: + raise ValueError( + f"Invalid heads in config for RepeatKV: " + f"num_attention_heads={num_attention_heads}, num_key_value_heads={num_key_value_heads}" + ) + if num_attention_heads % num_key_value_heads != 0: + raise ValueError( + f"Invalid heads in config for RepeatKV: num_attention_heads ({num_attention_heads}) " + f"is not divisible by num_key_value_heads ({num_key_value_heads})." + ) + num_kv_heads_repeat = num_attention_heads // num_key_value_heads + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + manual_cleanup=manual_cleanup, + prompt_len=prompt_len, + ctx_len=ctx_len, + n_layer=n_layer, + config=config, + qaic_config={"num_kv_heads_repeat": num_kv_heads_repeat}, + ) + + def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, manual_cleanup: callable, @@ -71,15 +118,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( pytorch_kv_tokens = None ort_tokens = None - api_runner = ApiRunner( - batch_size, - tokenizer, - config, - prompts, - Constants.PROMPT_LEN, - Constants.CTX_LEN, - full_batch_size if continuous_batching else None, - ) qeff_model = QEFFAutoModelForCausalLM( copy.deepcopy(model_hf), is_tlm=is_tlm, @@ -94,6 +132,15 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( num_devices=num_devices, qaic_config=qaic_config, ) + api_runner = ApiRunner( + batch_size, + tokenizer, + qeff_model.config, + prompts, + Constants.PROMPT_LEN, + Constants.CTX_LEN, + full_batch_size if continuous_batching else None, + ) if continuous_batching is False: pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) diff --git a/tests/transformers/models/causal_lm_models/test_causal_lm_models.py b/tests/transformers/models/causal_lm_models/test_causal_lm_models.py index 8c61cdc98d..ac476cd461 100644 --- a/tests/transformers/models/causal_lm_models/test_causal_lm_models.py +++ b/tests/transformers/models/causal_lm_models/test_causal_lm_models.py @@ -17,6 +17,7 @@ from .check_causal_models import ( check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100, + check_kv_repeat_causal_lm_pytorch_vs_ai100, get_custom_n_layers, ) @@ -67,6 +68,32 @@ def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, manual_cleanu check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=hf_config, manual_cleanup=manual_cleanup) +@pytest.mark.dummy_layers +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models_causal) +def test_check_kv_repeat_custom_causal_lm_pytorch_vs_ai100(model_name, manual_cleanup): + """ + Test function to validate the PyTorch model and the Cloud AI 100 model with repeating original KV heads. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + custom_config = model_config_dict[model_name] + hf_config = AutoConfig.from_pretrained( + model_name, + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + **custom_config.get("additional_params", {}), + ) + if model_name in ModelConfig.REPEAT_KV_TEST_MODELS: + if model_name in ModelConfig.QUANTIZED_MODELS: + n_layer = get_custom_n_layers(model_name) + check_kv_repeat_causal_lm_pytorch_vs_ai100(model_name, manual_cleanup=manual_cleanup, n_layer=n_layer) + else: + check_kv_repeat_causal_lm_pytorch_vs_ai100(model_name, manual_cleanup=manual_cleanup, config=hf_config) + else: + pytest.skip(f"Skipping {model_name} as it is not in REPEAT_KV_TEST_MODELS") + + @pytest.mark.full_layers @pytest.mark.on_qaic @pytest.mark.llm_model diff --git a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py index f52c0ab5d0..e4958e0f1a 100644 --- a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py +++ b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py @@ -30,6 +30,7 @@ from QEfficient.utils.test_utils import ( InternProcessor, ModelConfig, + get_text_config, load_vlm_model, load_vlm_model_from_config, set_num_layers_vlm, @@ -56,6 +57,9 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, config: Optional[AutoConfig] = None, + qaic_config: Optional[dict] = None, + num_kv_heads_repeat: Optional[int] = 1, + test_kv_replicate: Optional[bool] = None, torch_dtype: Optional[torch.dtype] = torch.float32, compare_results: Optional[bool] = False, ): @@ -70,11 +74,17 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( pytorch_kv_tokens = None ort_tokens = None n_layer = num_hidden_layers + qaic_config = copy.deepcopy(qaic_config) if qaic_config is not None else None if config is None: config = AutoConfig.from_pretrained( model_name, trust_remote_code=True, padding=model_name not in ModelConfig.MOLMO_MODELS ) config = set_num_layers_vlm(config, n_layer=n_layer) + if test_kv_replicate: + text_config = get_text_config(config) + num_kv_heads_repeat = text_config.num_attention_heads // text_config.num_key_value_heads + qaic_config = qaic_config or {} + qaic_config["num_kv_heads_repeat"] = num_kv_heads_repeat if hasattr(config, "model_type") and config.model_type in ["gemma3"]: config.text_config._sliding_window_pattern = 2 config.text_config.layer_types = ["sliding_attention", "full_attention"] @@ -92,7 +102,9 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name, kv_offload=kv_offload, config=config, + qaic_config=qaic_config, torch_dtype=torch_dtype, + num_kv_heads_repeat=num_kv_heads_repeat, ) else: model_hf = load_vlm_model(config) @@ -100,15 +112,24 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name, kv_offload=kv_offload, config=config, + qaic_config=qaic_config, torch_dtype=torch_dtype, + num_kv_heads_repeat=num_kv_heads_repeat, ) else: + if test_kv_replicate: + text_config = get_text_config(config) + num_kv_heads_repeat = text_config.num_attention_heads // text_config.num_key_value_heads + qaic_config = qaic_config or {} + qaic_config["num_kv_heads_repeat"] = num_kv_heads_repeat model_hf = load_vlm_model_from_config(config) qeff_model = QEFFAutoModelForImageTextToText( copy.deepcopy(model_hf), kv_offload=kv_offload, config=model_hf.config, + qaic_config=qaic_config, torch_dtype=torch_dtype, + num_kv_heads_repeat=num_kv_heads_repeat, ) compile_kwargs = { "num_devices": num_devices, @@ -117,6 +138,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( "mxfp6": False, "enable_qnn": enable_qnn, "qnn_config": qnn_config, + "qaic_config": qaic_config, } if model_name in ModelConfig.INTERNVL_MODELS: @@ -237,7 +259,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # "Tokens don't match for pytorch HF output and pytorch KV output" # ) - _ = qeff_model.export() + # _ = qeff_model.export() # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" @@ -335,6 +357,57 @@ def test_dummy_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_o ) +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.dummy_layers +@pytest.mark.parametrize("model_name", test_mm_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_custom_replicate_kv_pytorch_vs_ai100( + model_name, + kv_offload, + manual_cleanup, +): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + torch.manual_seed(42) + if model_name in ModelConfig.SKIPPED_MODELS: + pytest.skip("Test skipped for this model due to some issues.") + if model_name in ModelConfig.DUAL_QPC_MODELS and not kv_offload: + pytest.skip("These models require kv_offload=True for testing.") + + if model_name in ModelConfig.REPEAT_KV_TEST_MODELS: + hf_config = None + if model_name in ModelConfig.STANDARD_VLM_MODELS: + model_type = model_config_dict[model_name].get("model_type") + custom_config = model_config_dict[model_name].get("additional_params", {}) + hf_config = AutoConfig.for_model(model_type, trust_remote_code=True, **custom_config) + hf_config.name_or_path = model_name + + if hf_config is not None: + check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + kv_offload=kv_offload, + config=hf_config, + qaic_config={}, + test_kv_replicate=True, + manual_cleanup=manual_cleanup, + ) + else: + check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + num_hidden_layers=model_config_dict[model_name]["num_layers"], + kv_offload=kv_offload, + qaic_config={}, + test_kv_replicate=True, + manual_cleanup=manual_cleanup, + ) + else: + pytest.skip(f"Skipping replicate KV test for {model_name} as it's not in REPEAT_KV_TEST_MODELS") + + ################################ QNN Tests ################################