From 9fc7083298b33ff3836ee99fb85f2be8aa027ae6 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Tue, 2 Jun 2026 20:37:21 -0500 Subject: [PATCH 1/2] Fix bug where attributes were being unintentionally dropped --- .../model_bridge/test_cohere_adapter.py | 31 +++++++++++++++++-- .../model_bridge/sources/_bridge_builder.py | 4 +++ .../model_bridge/sources/transformers.py | 4 +++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/tests/integration/model_bridge/test_cohere_adapter.py b/tests/integration/model_bridge/test_cohere_adapter.py index 5e92c1616..00cf762da 100644 --- a/tests/integration/model_bridge/test_cohere_adapter.py +++ b/tests/integration/model_bridge/test_cohere_adapter.py @@ -3,7 +3,8 @@ Model: trl-internal-testing/tiny-CohereForCausalLM - 2 layers, ~8M params, CPU-safe, no gating required - tie_word_embeddings=True by default - - logit_scale=0.0625 (1/16) + - logit_scale=0.125 (canonical Command-R uses 0.0625; the tiny model + deliberately diverges so regression tests can catch silent fallback bugs) NOTE: The tiny model has use_qk_norm=False, so QK-norm is not exercised here. Cohere's QK-norm is a per-head LayerNorm inside CohereAttention.forward; it is @@ -96,8 +97,32 @@ def test_cfg_uses_rms_norm_false(self, cohere_bridge: TransformerBridge) -> None def test_cfg_logit_scale_is_float(self, cohere_bridge: TransformerBridge) -> None: assert isinstance(getattr(cohere_bridge.cfg, "logit_scale"), float) - def test_cfg_logit_scale_value(self, cohere_bridge: TransformerBridge) -> None: - assert getattr(cohere_bridge.cfg, "logit_scale") == pytest.approx(0.0625) + def test_cfg_logit_scale_matches_hf( + self, cohere_bridge: TransformerBridge, cohere_hf: Any + ) -> None: + """Regression: bridge.cfg.logit_scale must equal hf_model.config.logit_scale. + + Catches the silent-fallback bug where logit_scale was missing from + _HF_PASSTHROUGH_ATTRS — the adapter's getattr(cfg, "logit_scale", None) + returned None and fell back to a hardcoded 0.0625 regardless of model. + The tiny model's logit_scale is 0.125 (canonical Command-R is 0.0625), + so this assertion catches the fallback by construction. + """ + assert cohere_bridge.cfg.logit_scale == cohere_hf.config.logit_scale # type: ignore[attr-defined] + # Anchor the expected value so a future passthrough regression also trips here. + assert cohere_bridge.cfg.logit_scale == pytest.approx(0.125) # type: ignore[attr-defined] + + def test_cfg_rope_parameters_matches_hf( + self, cohere_bridge: TransformerBridge, cohere_hf: Any + ) -> None: + """Regression: bridge.cfg.rope_parameters must equal hf_model.config.rope_parameters. + + Same passthrough trap as logit_scale — the Cohere adapter reads + rope_parameters via getattr() and falls back to a 10000.0 default theta. + For the tiny model (rope_theta=10000.0) the fallback happens to match, + but a model with non-default theta would silently use 10000.0. + """ + assert cohere_bridge.cfg.rope_parameters == cohere_hf.config.rope_parameters # type: ignore[attr-defined] # --------------------------------------------------------------------------- diff --git a/transformer_lens/model_bridge/sources/_bridge_builder.py b/transformer_lens/model_bridge/sources/_bridge_builder.py index 5ae2dc466..23cb9547b 100644 --- a/transformer_lens/model_bridge/sources/_bridge_builder.py +++ b/transformer_lens/model_bridge/sources/_bridge_builder.py @@ -39,6 +39,10 @@ "chunk_size", # Multimodal "vision_config", + # Cohere (and any architecture that reads logit_scale / rope_parameters + # directly from cfg in its adapter) + "logit_scale", + "rope_parameters", ] diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py index e30a022f4..2770e385b 100644 --- a/transformer_lens/model_bridge/sources/transformers.py +++ b/transformer_lens/model_bridge/sources/transformers.py @@ -502,6 +502,10 @@ def boot( "chunk_size", # Multimodal "vision_config", + # Cohere (and any architecture that reads logit_scale / rope_parameters + # directly from cfg in its adapter) + "logit_scale", + "rope_parameters", ] for attr in _HF_PASSTHROUGH_ATTRS: val = getattr(hf_config, attr, None) From 5bc9d3dec47e921e88070f9d5150b14b26cfe860 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Tue, 2 Jun 2026 20:42:18 -0500 Subject: [PATCH 2/2] format cleanup --- .../model_bridge/test_cohere_adapter.py | 30 ++++++------------- .../model_bridge/sources/_bridge_builder.py | 3 +- .../model_bridge/sources/transformers.py | 3 +- 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/tests/integration/model_bridge/test_cohere_adapter.py b/tests/integration/model_bridge/test_cohere_adapter.py index 00cf762da..6111ed834 100644 --- a/tests/integration/model_bridge/test_cohere_adapter.py +++ b/tests/integration/model_bridge/test_cohere_adapter.py @@ -3,8 +3,8 @@ Model: trl-internal-testing/tiny-CohereForCausalLM - 2 layers, ~8M params, CPU-safe, no gating required - tie_word_embeddings=True by default - - logit_scale=0.125 (canonical Command-R uses 0.0625; the tiny model - deliberately diverges so regression tests can catch silent fallback bugs) + - logit_scale=0.125 (canonical Command-R is 0.0625; tiny diverges so + regression tests catch silent-fallback bugs in the passthrough) NOTE: The tiny model has use_qk_norm=False, so QK-norm is not exercised here. Cohere's QK-norm is a per-head LayerNorm inside CohereAttention.forward; it is @@ -100,29 +100,17 @@ def test_cfg_logit_scale_is_float(self, cohere_bridge: TransformerBridge) -> Non def test_cfg_logit_scale_matches_hf( self, cohere_bridge: TransformerBridge, cohere_hf: Any ) -> None: - """Regression: bridge.cfg.logit_scale must equal hf_model.config.logit_scale. - - Catches the silent-fallback bug where logit_scale was missing from - _HF_PASSTHROUGH_ATTRS — the adapter's getattr(cfg, "logit_scale", None) - returned None and fell back to a hardcoded 0.0625 regardless of model. - The tiny model's logit_scale is 0.125 (canonical Command-R is 0.0625), - so this assertion catches the fallback by construction. - """ - assert cohere_bridge.cfg.logit_scale == cohere_hf.config.logit_scale # type: ignore[attr-defined] - # Anchor the expected value so a future passthrough regression also trips here. - assert cohere_bridge.cfg.logit_scale == pytest.approx(0.125) # type: ignore[attr-defined] + """Regression: logit_scale must propagate from HF (not silently fall back to 0.0625).""" + bridge_scale = getattr(cohere_bridge.cfg, "logit_scale") + assert bridge_scale == cohere_hf.config.logit_scale + # Anchor 0.125 so a passthrough regression that defaults to 0.0625 also trips here. + assert bridge_scale == pytest.approx(0.125) def test_cfg_rope_parameters_matches_hf( self, cohere_bridge: TransformerBridge, cohere_hf: Any ) -> None: - """Regression: bridge.cfg.rope_parameters must equal hf_model.config.rope_parameters. - - Same passthrough trap as logit_scale — the Cohere adapter reads - rope_parameters via getattr() and falls back to a 10000.0 default theta. - For the tiny model (rope_theta=10000.0) the fallback happens to match, - but a model with non-default theta would silently use 10000.0. - """ - assert cohere_bridge.cfg.rope_parameters == cohere_hf.config.rope_parameters # type: ignore[attr-defined] + """Regression: rope_parameters must propagate from HF (same passthrough trap as logit_scale).""" + assert getattr(cohere_bridge.cfg, "rope_parameters") == cohere_hf.config.rope_parameters # --------------------------------------------------------------------------- diff --git a/transformer_lens/model_bridge/sources/_bridge_builder.py b/transformer_lens/model_bridge/sources/_bridge_builder.py index 23cb9547b..0f5adb927 100644 --- a/transformer_lens/model_bridge/sources/_bridge_builder.py +++ b/transformer_lens/model_bridge/sources/_bridge_builder.py @@ -39,8 +39,7 @@ "chunk_size", # Multimodal "vision_config", - # Cohere (and any architecture that reads logit_scale / rope_parameters - # directly from cfg in its adapter) + # Cohere "logit_scale", "rope_parameters", ] diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py index 2770e385b..049cda274 100644 --- a/transformer_lens/model_bridge/sources/transformers.py +++ b/transformer_lens/model_bridge/sources/transformers.py @@ -502,8 +502,7 @@ def boot( "chunk_size", # Multimodal "vision_config", - # Cohere (and any architecture that reads logit_scale / rope_parameters - # directly from cfg in its adapter) + # Cohere "logit_scale", "rope_parameters", ]